3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr = do { no overloading; pack 'J', $self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted the change to main::objaddr, and subsequent commits that
17 # changed 0+$self to pack 'J', $self.)
20 BEGIN { # Get the time the script started running; do it at compilation to
21 # get it as close as possible
36 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; # True when %Config's ccflags entry contains -DDEBUGGING
39 sub NON_ASCII_PLATFORM { ord("A") != 65 } # True when "A" is not code point 65, i.e., the native character set is not ASCII-based
41 ##########################################################################
43 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
44 # from the Unicode database files (lib/unicore/.../*.txt). It also generates
45 # a pod file and .t files, depending on option parameters.
47 # The structure of this file is:
48 # First these introductory comments; then
49 # code needed for everywhere, such as debugging stuff; then
50 # code to handle input parameters; then
51 # data structures likely to be of external interest (some of which depend on
52 # the input parameters, so follows them); then
53 # more data structures and subroutine and package (class) definitions; then
54 # the small actual loop to process the input files and finish up; then
55 # a __DATA__ section, for the .t tests
57 # This program works on all releases of Unicode so far. The outputs have been
58 # scrutinized most intently for release 5.1. The others have been checked for
59 # somewhat more than just sanity. It can handle all non-provisional Unicode
60 # character properties in those releases.
62 # This program is mostly about Unicode character (or code point) properties.
63 # A property describes some attribute or quality of a code point, like if it
64 # is lowercase or not, its name, what version of Unicode it was first defined
65 # in, or what its uppercase equivalent is. Unicode deals with these disparate
66 # possibilities by making all properties into mappings from each code point
67 # into some corresponding value. In the case of it being lowercase or not,
68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
69 # property maps each Unicode code point to a single value, called a "property
70 # value". (Some more recently defined properties map a code point to a set
73 # When using a property in a regular expression, what is desired isn't the
74 # mapping of the code point to its property's value, but the reverse (or the
75 # mathematical "inverse relation"): starting with the property value, "Does a
76 # code point map to it?" These are written in a "compound" form:
77 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
78 # files containing the lists of code points that map to each such regular
79 # expression property value, one file per list
81 # There is also a single form shortcut that Perl adds for many of the commonly
82 # used properties. This happens for all binary properties, plus script,
83 # general_category, and block properties.
85 # Thus the outputs of this program are files. There are map files, mostly in
86 # the 'To' directory; and there are list files for use in regular expression
87 # matching, all in subdirectories of the 'lib' directory, with each
88 # subdirectory being named for the property that the lists in it are for.
89 # Bookkeeping, test, and documentation files are also generated.
91 my $matches_directory = 'lib'; # Where match (\p{}) files go.
92 my $map_directory = 'To'; # Where map files go.
96 # The major data structures of this program are Property, of course, but also
97 # Table. There are two kinds of tables, very similar to each other.
98 # "Match_Table" is the data structure giving the list of code points that have
99 # a particular property value, mentioned above. There is also a "Map_Table"
100 # data structure which gives the property's mapping from code point to value.
101 # There are two structures because the match tables need to be combined in
102 # various ways, such as constructing unions, intersections, complements, etc.,
103 # and the map ones don't. And there would be problems, perhaps subtle, if
104 # a map table were inadvertently operated on in some of those ways.
105 # The use of separate classes with operations defined on one but not the other
106 # prevents accidentally confusing the two.
108 # At the heart of each table's data structure is a "Range_List", which is just
109 # an ordered list of "Ranges", plus ancillary information, and methods to
110 # operate on them. A Range is a compact way to store property information.
111 # Each range has a starting code point, an ending code point, and a value that
112 # is meant to apply to all the code points between the two end points,
113 # inclusive. For a map table, this value is the property value for those
114 # code points. Two such ranges could be written like this:
115 # 0x41 .. 0x5A, 'Upper',
116 # 0x61 .. 0x7A, 'Lower'
118 # Each range also has a type used as a convenience to classify the values.
119 # Most ranges in this program will be Type 0, or normal, but there are some
120 # ranges that have a non-zero type. These are used only in map tables, and
121 # are for mappings that don't fit into the normal scheme of things. Mappings
122 # that require a hash entry to communicate with utf8.c are one example;
123 # another example is mappings for charnames.pm to use which indicate a name
124 # that is algorithmically determinable from its code point (and the reverse).
125 # These are used to significantly compact these tables, instead of listing
126 # each one of the tens of thousands individually.
128 # In a match table, the value of a range is irrelevant (and hence the type as
129 # well, which will always be 0), and arbitrarily set to the null string.
130 # Using the example above, there would be two match tables for those two
131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
132 # named Lower would contain 0x61..0x7A.
134 # Actually, there are two types of range lists, "Range_Map" is the one
135 # associated with map tables, and "Range_List" with match tables.
136 # Again, this is so that methods can be defined on one and not the other so
137 # as to prevent operating on them in incorrect ways.
139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
140 # in the perl core. All tables could in theory be written, but some are
141 # suppressed because there is no current practical use for them. It is easy
142 # to change which get written by changing various lists that are near the top
143 # of the actual code in this file. The table data structures contain enough
144 # ancillary information to allow them to be treated as separate entities for
145 # writing, such as the path to each one's file. There is a heading in each
146 # map table that gives the format of its entries, and what the map is for all
147 # the code points missing from it. (This allows tables to be more compact.)
149 # The Property data structure contains one or more tables. All properties
150 # contain a map table (except the $perl property which is a
151 # pseudo-property containing only match tables), and any properties that
152 # are usable in regular expression matches also contain various matching
153 # tables, one for each value the property can have. A binary property can
154 # have two values, True and False (or Y and N, which are preferred by Unicode
155 # terminology). Thus each of these properties will have a map table that
156 # takes every code point and maps it to Y or N (but having ranges cuts the
157 # number of entries in that table way down), and two match tables, one
158 # which has a list of all the code points that map to Y, and one for all the
159 # code points that map to N. (For each binary property, a third table is also
160 # generated for the pseudo Perl property. It contains the identical code
161 # points as the Y table, but can be written in regular expressions, not in the
162 # compound form, but in a "single" form like \p{IsUppercase}.) Many
163 # properties are binary, but some properties have several possible values,
164 # some have many, and properties like Name have a different value for every
165 # named code point. Those will not, unless the controlling lists are changed,
166 # have their match tables written out. But all the ones which can be used in
167 # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
168 # a property would have either its map table or its match tables written but
169 # not both. Again, what gets written is controlled by lists which can easily
170 # be changed. Starting in 5.14, advantage was taken of this, and all the map
171 # tables needed to reconstruct the Unicode db are now written out, while
172 # suppressing the Unicode .txt files that contain the data. Our tables are
173 # much more compact than the .txt files, so a significant space savings was
174 # achieved. Also, tables are not written out that are trivially derivable
175 # from tables that do get written. So, there typically is no file containing
176 # the code points not matched by a binary property (the table for \P{} versus
177 # lowercase \p{}), since you just need to invert the True table to get the
180 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
181 # how many match tables there are and the content of the maps. This 'Type' is
182 # different than a range 'Type', so don't get confused by the two concepts
183 # having the same name.
185 # For information about the Unicode properties, see Unicode's UAX44 document:
187 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
189 # As stated earlier, this program will work on any release of Unicode so far.
190 # Most obvious problems in earlier data have NOT been corrected except when
191 # necessary to make Perl or this program work reasonably, and to keep out
192 # potential security issues. For example, no folding information was given in
193 # early releases, so this program substitutes lower case instead, just so that
194 # a regular expression with the /i option will do something that actually
195 # gives the right results in many cases. There are also a couple other
196 # corrections for version 1.1.5, commented at the point they are made. An
197 # example of corrections that weren't made (but could be) is this statement
198 # from DerivedAge.txt: "The supplementary private use code points and the
199 # non-character code points were assigned in version 2.0, but not specifically
200 # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
201 # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
202 # further down in these introductory comments.
204 # This program works on all non-provisional properties as of the current
205 # Unicode release, though the files for some are suppressed for various
206 # reasons. You can change which are output by changing lists in this program.
208 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
209 # loose matching rules (from Unicode TR18):
211 # The recommended names for UCD properties and property values are in
212 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
213 # [PropValue]. There are both abbreviated names and longer, more
214 # descriptive names. It is strongly recommended that both names be
215 # recognized, and that loose matching of property names be used,
216 # whereby the case distinctions, whitespace, hyphens, and underbar
219 # The program still allows Fuzzy to override its determination of if loose
220 # matching should be used, but it isn't currently used, as it is no longer
221 # needed; the calculations it makes are good enough.
223 # SUMMARY OF HOW IT WORKS:
227 # A list is constructed containing each input file that is to be processed
229 # Each file on the list is processed in a loop, using the associated handler
231 # The PropertyAliases.txt and PropValueAliases.txt files are processed
232 # first. These files name the properties and property values.
233 # Objects are created of all the property and property value names
234 # that the rest of the input should expect, including all synonyms.
235 # The other input files give mappings from properties to property
236 # values. That is, they list code points and say what the mapping
237 # is under the given property. Some files give the mappings for
238 # just one property; and some for many. This program goes through
239 # each file and populates the properties and their map tables from
240 # them. Some properties are listed in more than one file, and
241 # Unicode has set up a precedence as to which has priority if there
242 # is a conflict. Thus the order of processing matters, and this
243 # program handles the conflict possibility by processing the
244 # overriding input files last, so that if necessary they replace
246 # After this is all done, the program creates the property mappings not
247 # furnished by Unicode, but derivable from what it does give.
248 # The tables of code points that match each property value in each
249 # property that is accessible by regular expressions are created.
250 # The Perl-defined properties are created and populated. Many of these
251 # require data determined from the earlier steps
252 # Any Perl-defined synonyms are created, and name clashes between Perl
253 # and Unicode are reconciled and warned about.
254 # All the properties are written to files
255 # Any other files are written, and final warnings issued.
257 # For clarity, a number of operators have been overloaded to work on tables:
258 # ~ means invert (take all characters not in the set). The more
259 # conventional '!' is not used because of the possibility of confusing
260 # it with the actual boolean operation.
262 # - means subtraction
263 # & means intersection
264 # The precedence of these is the order listed. Parentheses should be
265 # copiously used. These are not a general scheme. The operations aren't
266 # defined for a number of things, deliberately, to avoid getting into trouble.
267 # Operations are done on references and affect the underlying structures, so
268 # that the copy constructors for them have been overloaded to not return a new
269 # clone, but the input object itself.
271 # The bool operator is deliberately not overloaded to avoid confusion with
272 # "should it mean if the object merely exists, or also is non-empty?".
274 # WHY CERTAIN DESIGN DECISIONS WERE MADE
276 # This program needs to be able to run under miniperl. Therefore, it uses a
277 # minimum of other modules, and hence implements some things itself that could
278 # be gotten from CPAN
280 # This program uses inputs published by the Unicode Consortium. These can
281 # change incompatibly between releases without the Perl maintainers realizing
282 # it. Therefore this program is now designed to try to flag these. It looks
283 # at the directories where the inputs are, and flags any unrecognized files.
284 # It keeps track of all the properties in the files it handles, and flags any
285 # that it doesn't know how to handle. It also flags any input lines that
286 # don't match the expected syntax, among other checks.
288 # It is also designed so if a new input file matches one of the known
289 # templates, one hopefully just needs to add it to a list to have it
292 # As mentioned earlier, some properties are given in more than one file. In
293 # particular, the files in the extracted directory are supposedly just
294 # reformattings of the others. But they contain information not easily
295 # derivable from the other files, including results for Unihan, which this
296 # program doesn't ordinarily look at, and for unassigned code points. They
297 # also have historically had errors or been incomplete. In an attempt to
298 # create the best possible data, this program thus processes them first to
299 # glean information missing from the other files; then processes those other
300 # files to override any errors in the extracted ones. Much of the design was
301 # driven by this need to store things and then possibly override them.
303 # It tries to keep fatal errors to a minimum, to generate something usable for
304 # testing purposes. It always looks for files that could be inputs, and will
305 # warn about any that it doesn't know how to handle (the -q option suppresses
308 # Why is there more than one type of range?
309 # This simplified things. There are some very specialized code points that
310 # have to be handled specially for output, such as Hangul syllable names.
311 # By creating a range type (done late in the development process), it
312 # allowed this to be stored with the range, and overridden by other input.
313 # Originally these were stored in another data structure, and it became a
314 # mess trying to decide if a second file that was for the same property was
315 # overriding the earlier one or not.
317 # Why are there two kinds of tables, match and map?
318 # (And there is a base class shared by the two as well.) As stated above,
319 # they actually are for different things. Development proceeded much more
320 # smoothly when I (khw) realized the distinction. Map tables are used to
321 # give the property value for every code point (actually every code point
322 # that doesn't map to a default value). Match tables are used for regular
323 # expression matches, and are essentially the inverse mapping. Separating
324 # the two allows more specialized methods, and error checks so that one
325 # can't just take the intersection of two map tables, for example, as that
328 # What about 'fate' and 'status'? The concept of a table's fate was created
329 # late when it became clear that something more was needed. The difference
330 # between this and 'status' is unclear, and could be improved if someone
331 # wanted to spend the effort.
335 # This program is written so it will run under miniperl. Occasionally changes
336 # will cause an error where the backtrace doesn't work well under miniperl.
337 # To diagnose the problem, you can instead run it under regular perl, if you
340 # There is a good trace facility. To enable it, first sub DEBUG must be set
341 # to return true. Then a line like
343 # local $to_trace = 1 if main::DEBUG;
345 # can be added to enable tracing in its lexical scope (plus dynamic) or until
346 # you insert another line:
348 # local $to_trace = 0 if main::DEBUG;
350 # To actually trace, use a line like "trace $a, @b, %c, ...;"
352 # Some of the more complex subroutines already have trace statements in them.
353 # Permanent trace statements should be like:
355 # trace ... if main::DEBUG && $to_trace;
357 # If there is just one or a few files that you're debugging, you can easily
358 # cause most everything else to be skipped. Change the line
360 # my $debug_skip = 0;
362 # to 1, and every file whose object is in @input_file_objects and doesn't have
363 # a 'non_skip => 1,' in its constructor will be skipped. However, skipping
364 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
366 # To compare the output tables, it may be useful to specify the -annotate
367 # flag. This causes the tables to expand so there is one entry for each
368 # non-algorithmically named code point giving, currently, its name and its
369 # graphic representation if printable (and you have a font that knows about
370 # it). This makes it easier to see what the particular code points are in
371 # each output table. The tables are usable, but because they don't have
372 # ranges (for the most part), a Perl using them will run slower. Non-named
373 # code points are annotated with a description of their status, and contiguous
374 # ones with the same description will be output as a range rather than
375 # individually. Algorithmically named characters are also output as ranges,
376 # except when there are just a few contiguous ones.
380 # The program would break if Unicode were to change its names so that
381 # interior white space, underscore, or dash differences were significant
382 # within property and property value names.
384 # It might be easier to use the xml versions of the UCD if this program ever
385 # would need heavy revision, and the ability to handle old versions was not
388 # There is the potential for name collisions, in that Perl has chosen names
389 # that Unicode could decide it also likes. There have been such collisions in
390 # the past, with mostly Perl deciding to adopt the Unicode definition of the
391 # name. However in the 5.2 Unicode beta testing, there were a number of such
392 # collisions, which were withdrawn before the final release, because of Perl's
393 # and others' protests. These all involved new properties which began with
394 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
395 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
396 # Unicode document, so they are unlikely to be used by Unicode for another
397 # purpose. However, they might try something beginning with 'In', or use any
398 # of the other Perl-defined properties. This program will warn you of name
399 # collisions, and refuse to generate tables with them, but manual intervention
400 # will be required in this event. One scheme that could be implemented, if
401 # necessary, would be to have this program generate another file, or add a
402 # field to mktables.lst that gives the date of first definition of a property.
403 # Each new release of Unicode would use that file as a basis for the next
404 # iteration. And the Perl synonym addition code could sort based on the age
405 # of the property, so older properties get priority, and newer ones that clash
406 # would be refused; hence existing code would not be impacted, and some other
407 # synonym would have to be used for the new property. This is ugly, and
408 # manual intervention would certainly be easier to do in the short run; let's
409 # hope it never comes to this.
413 # This program can generate tables from the Unihan database. But it doesn't
414 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
415 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
416 # database was split into 8 different files, all beginning with the letters
417 # 'Unihan'. This program will read those file(s) if present, but it needs to
418 # know which of the many properties in the file(s) should have tables created
419 # for them. It will create tables for any properties listed in
420 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
421 # @cjk_properties array and the @cjk_property_values array. Thus, if a
422 # property you want is not in those files of the release you are building
423 # against, you must add it to those two arrays. Starting in 4.0, the
424 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
425 # is present in the directory, a table will be generated for that property.
426 # In 5.2, several more properties were added. For your convenience, the two
427 # arrays are initialized with all the 6.0 listed properties that are also in
428 # earlier releases. But these are commented out. You can just uncomment the
429 # ones you want, or use them as a template for adding entries for other
432 # You may need to adjust the entries to suit your purposes. setup_unihan(),
433 # and filter_unihan_line() are the functions where this is done. This program
434 # already does some adjusting to make the lines look more like the rest of the
435 # Unicode DB; you can see what that is in filter_unihan_line().
437 # There is a bug in the 3.2 data file in which some values for the
438 # kPrimaryNumeric property have commas and an unexpected comment. A filter
439 # could be added for these; or for a particular installation, the Unihan.txt
440 # file could be edited to fix them.
442 # HOW TO ADD A FILE TO BE PROCESSED
444 # A new file from Unicode needs to have an object constructed for it in
445 # @input_file_objects, probably at the end or at the end of the extracted
446 # ones. The program should warn you if its name will clash with others on
447 # restrictive file systems, like DOS. If so, figure out a better name, and
448 # add lines to the README.perl file giving that. If the file is a character
449 # property, it should be in the format that Unicode has implicitly
450 # standardized for such files for the more recently introduced ones.
451 # If so, the Input_file constructor for @input_file_objects can just be the
452 # file name and release it first appeared in. If not, then it should be
453 # possible to construct an each_line_handler() to massage the line into the
456 # For non-character properties, more code will be needed. You can look at
457 # the existing entries for clues.
459 # UNICODE VERSIONS NOTES
461 # The Unicode UCD has had a number of errors in it over the versions. And
462 # these remain, by policy, in the standard for that version. Therefore it is
463 # risky to correct them, because code may be expecting the error. So this
464 # program doesn't generally make changes, unless the error breaks the Perl
465 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
466 # for U+1105, which causes real problems for the algorithms for Jamo
467 # calculations, so it is changed here.
469 # But it isn't so clear cut as to what to do about concepts that are
470 # introduced in a later release; should they extend back to earlier releases
471 # where the concept just didn't exist? It was easier to do this than to not,
472 # so that's what was done. For example, the default value for code points not
473 # in the files for various properties was probably undefined until changed by
474 # some version. No_Block for blocks is such an example. This program will
475 # assign No_Block even in Unicode versions that didn't have it. This has the
476 # benefit that code being written doesn't have to special case earlier
477 # versions; and the detriment that it doesn't match the Standard precisely for
478 # the affected versions.
480 # Here are some observations about some of the issues in early versions:
482 # Prior to version 3.0, there were 3-character decompositions. These are not
483 # handled by Unicode::Normalize, nor will it compile when presented a version
484 # that has them. However, you can trivially get it to compile by simply
485 # ignoring those decompositions, by changing the croak to a carp. At the time
486 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
488 # croak("Weird Canonical Decomposition of U+$h");
490 # Simply change to a carp. It will compile, but will not know about any
491 # three-character decomposition.
493 # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
494 # that the reason is that the CJK block starting at 4E00 was removed from
495 # PropList, and was not put back in until 3.1.0. The Perl extension (the
496 # single property name \p{alpha}) has the correct values. But the compound
497 # form is simply not generated until 3.1, as it can be argued that prior to
498 # this release, this was not an official property. The comments for
499 # filter_old_style_proplist() give more details.
501 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
502 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
503 # reason is that 3.2 introduced U+205F=medium math space, which was not
504 # classed as white space, but Perl figured out that it should have been. 4.0
505 # reclassified it correctly.
507 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
508 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
509 # became 202, and ATBL was left with no code points, as all the ones that
510 # mapped to 202 stayed mapped to 202. Thus if your program used the numeric
511 # name for the class, it would not have been affected, but if it used the
512 # mnemonic, it would have been.
514 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
515 # points which eventually came to have this script property value, instead
516 # mapped to "Unknown". But in the next release all these code points were
517 # moved to \p{sc=common} instead.
519 # The default for missing code points for BidiClass is complicated. Starting
520 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
521 # tries to do the best it can for earlier releases. It is done in
522 # process_PropertyAliases()
524 # In version 2.1.2, the entry in UnicodeData.txt:
525 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
527 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
528 # Without this change, there are casing problems for this character.
530 ##############################################################################
532 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
534 my $MAX_LINE_WIDTH = 78;
536 # Debugging aid to skip most files so as to not be distracted by them when
537 # concentrating on the ones being debugged. Add
539 # to the constructor for those files you want processed when you set this.
540 # Files with a first version number of 0 are special: they are always
541 # processed regardless of the state of this flag. Generally, Jamo.txt and
542 # UnicodeData.txt must not be skipped if you want this program to not die
543 # before normal completion.
547 # Normally these are suppressed.
548 my $write_Unicode_deprecated_tables = 0;
550 # Set to 1 to enable tracing.
553 { # Closure for trace: debugging aid
554 my $print_caller = 1; # ? Include calling subroutine name
555 my $main_with_colon = 'main::';
556 my $main_colon_length = length($main_with_colon);
559 return unless $to_trace; # Do nothing if global flag not set
563 local $DB::trace = 0;
564 $DB::trace = 0; # Quiet 'used only once' message
568 # Loop looking up the stack to get the first non-trace caller
573 $line_number = $caller_line;
574 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
575 $caller = $main_with_colon unless defined $caller;
577 $caller_name = $caller;
580 $caller_name =~ s/.*:://;
581 if (substr($caller_name, 0, $main_colon_length)
584 $caller_name = substr($caller_name, $main_colon_length);
587 } until ($caller_name ne 'trace');
589 # If the stack was empty, we were called from the top level
590 $caller_name = 'main' if ($caller_name eq ""
591 || $caller_name eq 'trace');
594 foreach my $string (@input) {
595 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
596 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
597 $output .= simple_dumper($string);
600 $string = "$string" if ref $string;
601 $string = $UNDEF unless defined $string;
603 $string = '""' if $string eq "";
604 $output .= " " if $output ne ""
606 && substr($output, -1, 1) ne " "
607 && substr($string, 0, 1) ne " ";
612 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
613 print STDERR "$caller_name: " if $print_caller;
614 print STDERR $output, "\n";
619 # This is for a rarely used development feature that allows you to compare two
620 # versions of the Unicode standard without having to deal with changes caused
621 # by the code points introduced in the later version. Change the 0 to a
622 # string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
623 # code points introduced in that release and earlier will be used; later ones
624 # are thrown away. You use the version number of the earliest one you want to
625 # compare; then run this program on directory structures containing each
626 # release, and compare the outputs. These outputs will therefore include only
627 # the code points common to both releases, and you can see the changes caused
628 # just by the underlying release semantic changes. For versions earlier than
629 # 3.2, you must copy a version of DAge.txt into the directory.
630 my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
631 my $compare_versions = DEBUG
632 && $string_compare_versions
633 && pack "C*", split /\./, $string_compare_versions;
636 # Returns non-duplicated input values. From "Perl Best Practices:
637 # Encapsulated Cleverness". p. 455 in first edition.
640 # Arguably this breaks encapsulation, if the goal is to permit multiple
641 # distinct objects to stringify to the same value, and be interchangeable.
642 # However, for this program, no two objects stringify identically, and all
643 # lists passed to this function are either objects or strings. So this
644 # doesn't affect correctness, but it does give a couple of percent speedup.
646 return grep { ! $seen{$_}++ } @_;
649 $0 = File::Spec->canonpath($0);
651 my $make_test_script = 0; # ? Should we output a test script
652 my $make_norm_test_script = 0; # ? Should we output a normalization test script
653 my $write_unchanged_files = 0; # ? Should we update the output files even if
654 # we don't think they have changed
655 my $use_directory = ""; # ? Should we chdir somewhere.
656 my $pod_directory; # input directory to store the pod file.
657 my $pod_file = 'perluniprops';
658 my $t_path; # Path to the .t test file
659 my $file_list = 'mktables.lst'; # File to store input and output file names.
660 # This is used to speed up the build, by not
661 # executing the main body of the program if
662 # nothing on the list has changed since the
664 my $make_list = 1; # ? Should we write $file_list. Set to always
665 # make a list so that when the pumpking is
666 # preparing a release, s/he won't have to do
668 my $glob_list = 0; # ? Should we try to include unknown .txt files
670 my $output_range_counts = $debugging_build; # ? Should we include the number
671 # of code points in ranges in
673 my $annotate = 0; # ? Should character names be in the output
675 # Verbosity levels; 0 is quiet
676 my $NORMAL_VERBOSITY = 1;
680 my $verbosity = $NORMAL_VERBOSITY;
682 # Stored in mktables.lst so that if this program is called with different
683 # options, will regenerate even if the files otherwise look like they're
685 my $command_line_arguments = join " ", @ARGV;
689 my $arg = shift @ARGV;
691 $verbosity = $VERBOSE;
693 elsif ($arg eq '-p') {
694 $verbosity = $PROGRESS;
695 $| = 1; # Flush buffers as we go.
697 elsif ($arg eq '-q') {
700 elsif ($arg eq '-w') {
701 $write_unchanged_files = 1; # update the files even if havent changed
703 elsif ($arg eq '-check') {
704 my $this = shift @ARGV;
705 my $ok = shift @ARGV;
707 print "Skipping as check params are not the same.\n";
711 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
712 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
714 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
716 $make_test_script = 1;
718 elsif ($arg eq '-makenormtest')
720 $make_norm_test_script = 1;
722 elsif ($arg eq '-makelist') {
725 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
726 -d $use_directory or croak "Unknown directory '$use_directory'";
728 elsif ($arg eq '-L') {
730 # Existence not tested until have chdir'd
733 elsif ($arg eq '-globlist') {
736 elsif ($arg eq '-c') {
737 $output_range_counts = ! $output_range_counts
739 elsif ($arg eq '-annotate') {
741 $debugging_build = 1;
742 $output_range_counts = 1;
746 $with_c .= 'out' if $output_range_counts; # Complements the state
748 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
749 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
751 -c : Output comments $with_c number of code points in ranges
752 -q : Quiet Mode: Only output serious warnings.
753 -p : Set verbosity level to normal plus show progress.
754 -v : Set Verbosity level high: Show progress and non-serious
756 -w : Write files regardless
757 -C dir : Change to this directory before proceeding. All relative paths
758 except those specified by the -P and -T options will be done
759 with respect to this directory.
760 -P dir : Output $pod_file file to directory 'dir'.
761 -T path : Create a test script as 'path'; overrides -maketest
762 -L filelist : Use alternate 'filelist' instead of standard one
763 -globlist : Take as input all non-Test *.txt files in current and sub
765 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
767 -makelist : Rewrite the file list $file_list based on current setup
768 -annotate : Output an annotation for each character in the table files;
769 useful for debugging mktables, looking at diffs; but is slow,
770 memory intensive; resulting tables are usable but are slow and
771 very large (and currently fail the Unicode::UCD.t tests).
772 -check A B : Executes $0 only if A and B are the same
777 # Stores the most-recently changed file. If none have changed, can skip the
779 my $most_recent = (stat $0)[9]; # Do this before the chdir!
781 # Change directories now, because need to read 'version' early.
782 if ($use_directory) {
783 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
784 $pod_directory = File::Spec->rel2abs($pod_directory);
786 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
787 $t_path = File::Spec->rel2abs($t_path);
789 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
790 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
791 $pod_directory = File::Spec->abs2rel($pod_directory);
793 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
794 $t_path = File::Spec->abs2rel($t_path);
798 # Get Unicode version into regular and v-string. This is done now because
799 # various tables below get populated based on it. These tables are populated
800 # here to be near the top of the file, and so easily seeable by those needing
802 open my $VERSION, "<", "version"
803 or croak "$0: can't open required file 'version': $!\n";
804 my $string_version = <$VERSION>;
806 chomp $string_version;
807 my $v_version = pack "C*", split /\./, $string_version; # v string
809 # The following are the complete names of properties with property values that
810 # are known to not match any code points in some versions of Unicode, but that
811 # may change in the future so they should be matchable, hence an empty file is
812 # generated for them.
813 my @tables_that_may_be_empty;
814 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
815 if $v_version lt v6.3.0;
816 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
817 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
818 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
819 if $v_version ge v4.1.0;
820 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
821 if $v_version ge v6.0.0;
822 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
823 if $v_version ge v6.1.0;
824 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
825 if $v_version ge v6.2.0;
827 # The lists below are hashes, so the key is the item in the list, and the
828 # value is the reason why it is in the list. This makes generation of
829 # documentation easier.
831 my %why_suppressed; # No file generated for these.
833 # Files aren't generated for empty extraneous properties. This is arguable.
834 # Extraneous properties generally come about because a property is no longer
835 # used in a newer version of Unicode. If we generated a file without code
836 # points, programs that used to work on that property will still execute
837 # without errors. It just won't ever match (or will always match, with \P{}).
838 # This means that the logic is now likely wrong. I (khw) think it's better to
839 # find this out by getting an error message. Just move them to the table
840 # above to change this behavior.
841 my %why_suppress_if_empty_warn_if_not = (
843 # It is the only property that has ever officially been removed from the
844 # Standard. The database never contained any code points for it.
845 'Special_Case_Condition' => 'Obsolete',
847 # Apparently never official, but there were code points in some versions of
848 # old-style PropList.txt
849 'Non_Break' => 'Obsolete',
852 # These would normally go in the warn table just above, but they were changed
853 # a long time before this program was written, so warnings about them are
855 if ($v_version gt v3.2.0) {
856 push @tables_that_may_be_empty,
857 'Canonical_Combining_Class=Attached_Below_Left'
860 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
861 # unless explicitly added.
862 if ($v_version ge v5.2.0) {
863 my $unihan = 'Unihan; remove from list if using Unihan';
864 foreach my $table (qw (
868 kCompatibilityVariant
882 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
886 # Enum values for to_output_map() method in the Map_Table package.
887 my $EXTERNAL_MAP = 1;
888 my $INTERNAL_MAP = 2;
889 my $OUTPUT_ADJUSTED = 3;
891 # To override computed values for writing the map tables for these properties.
892 # The default for enum map tables is to write them out, so that the Unicode
893 # .txt files can be removed, but all the data to compute any property value
894 # for any code point is available in a more compact form.
895 my %global_to_output_map = (
896 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
897 # get stuck supporting it if things change. Since it is a STRING
898 # property, it normally would be listed in the pod, but INTERNAL_MAP
900 Unicode_1_Name => $INTERNAL_MAP,
902 Present_In => 0, # Suppress, as easily computed from Age
903 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is
904 # retained, but needed for
907 # Suppress, as mapping can be found instead from the
908 # Perl_Decomposition_Mapping file
909 Decomposition_Type => 0,
912 # Properties that this program ignores.
913 my @unimplemented_properties;
915 # With this release, it is automatically handled if the Unihan db is
917 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
919 # There are several types of obsolete properties defined by Unicode. These
920 # must be hand-edited for every new Unicode release.
921 my %why_deprecated; # Generates a deprecated warning message if used.
922 my %why_stabilized; # Documentation only
923 my %why_obsolete; # Documentation only
926 my $simple = 'Perl uses the more complete version';
927 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
929 my $other_properties = 'other properties';
930 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
931 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
934 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
935 'Jamo_Short_Name' => $contributory,
936 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
937 'Other_Alphabetic' => $contributory,
938 'Other_Default_Ignorable_Code_Point' => $contributory,
939 'Other_Grapheme_Extend' => $contributory,
940 'Other_ID_Continue' => $contributory,
941 'Other_ID_Start' => $contributory,
942 'Other_Lowercase' => $contributory,
943 'Other_Math' => $contributory,
944 'Other_Uppercase' => $contributory,
945 'Expands_On_NFC' => $why_no_expand,
946 'Expands_On_NFD' => $why_no_expand,
947 'Expands_On_NFKC' => $why_no_expand,
948 'Expands_On_NFKD' => $why_no_expand,
952 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
953 # contains the same information, but without the algorithmically
954 # determinable Hangul syllables. This file is not published, so its
955 # existence is not noted in the comment.
956 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
958 'Indic_Matra_Category' => "Provisional",
959 'Indic_Syllabic_Category' => "Provisional",
961 # Don't suppress ISO_Comment, as otherwise special handling is needed
962 # to differentiate between it and gc=c, which can be written as 'isc',
963 # which is the same characters as ISO_Comment's short name.
965 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
967 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
968 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
969 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
970 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
972 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
975 foreach my $property (
977 # The following are suppressed because they were made contributory
978 # or deprecated by Unicode before Perl ever thought about
987 # The following are suppressed because they have been marked
988 # as deprecated for a sufficient amount of time
990 'Other_Default_Ignorable_Code_Point',
991 'Other_Grapheme_Extend',
998 $why_suppressed{$property} = $why_deprecated{$property};
1001 # Customize the message for all the 'Other_' properties
1002 foreach my $property (keys %why_deprecated) {
1003 next if (my $main_property = $property) !~ s/^Other_//;
1004 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1008 if ($write_Unicode_deprecated_tables) {
1009 foreach my $property (keys %why_suppressed) {
1010 delete $why_suppressed{$property} if $property =~
1011 / ^ Other | Grapheme /x;
1015 if ($v_version ge 4.0.0) {
1016 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1017 if ($v_version ge 6.0.0) {
1018 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1021 if ($v_version ge 5.2.0) { # From 5.2 on, ISO_Comment carries a status note; which one depends on the release
1022 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed' if $v_version lt 6.0.0; # Only obsolete for 5.2 up to, but not including, 6.0
1023 if ($v_version ge 6.0.0) { # Was unreachable when nested under an 'lt 6.0.0' guard; now tested independently
1024 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1028 # Probably obsolete forever
1029 if ($v_version ge v4.1.0) {
1030 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1032 if ($v_version ge v6.0.0) {
1033 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1034 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1037 # This program can create files for enumerated-like properties, such as
1038 # 'Numeric_Type'. This file would be the same format as for a string
1039 # property, with a mapping from code point to its value, so you could look up,
1040 # for example, the script a code point is in. But no one so far wants this
1041 # mapping, or they have found another way to get it since this is a new
1042 # feature. So no file is generated except if it is in this list.
1043 my @output_mapped_properties = split "\n", <<END;
1046 # If you are using the Unihan database in a Unicode version before 5.2, you
1047 # need to add the properties that you want to extract from it to this table.
1048 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1049 # listed, commented out
1050 my @cjk_properties = split "\n", <<'END';
1051 #cjkAccountingNumeric; kAccountingNumeric
1052 #cjkOtherNumeric; kOtherNumeric
1053 #cjkPrimaryNumeric; kPrimaryNumeric
1054 #cjkCompatibilityVariant; kCompatibilityVariant
1055 #cjkIICore ; kIICore
1056 #cjkIRG_GSource; kIRG_GSource
1057 #cjkIRG_HSource; kIRG_HSource
1058 #cjkIRG_JSource; kIRG_JSource
1059 #cjkIRG_KPSource; kIRG_KPSource
1060 #cjkIRG_KSource; kIRG_KSource
1061 #cjkIRG_TSource; kIRG_TSource
1062 #cjkIRG_USource; kIRG_USource
1063 #cjkIRG_VSource; kIRG_VSource
1064 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1067 # Similarly for the property values. For your convenience, the lines in the
1068 # 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
1069 # '#' marks (for Unicode versions before 5.2)
1070 my @cjk_property_values = split "\n", <<'END';
1071 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1072 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1073 ## @missing: 0000..10FFFF; cjkIICore; <none>
1074 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1075 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1076 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1077 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1078 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1079 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1080 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1081 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1082 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1083 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1084 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1087 # The input files don't list every code point. Those not listed are to be
1088 # defaulted to some value. Hard-coded below are what those values are for
1089 # non-binary properties as of 5.1. Starting in 5.0, there are
1090 # machine-parsable comment lines in the files that give the defaults; so this
1091 # list shouldn't have to be extended. The claim is that all missing entries
1092 # for binary properties will default to 'N'. Unicode tried to change that in
1093 # 5.2, but the beta period produced enough protest that they backed off.
1095 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1096 # be in the form that it expects. The others may be synonyms.
1097 my $CODE_POINT = '<code point>';
1098 my %default_mapping = (
1099 Age => "Unassigned",
1100 # Bidi_Class => Complicated; set in code
1101 Bidi_Mirroring_Glyph => "",
1102 Block => 'No_Block',
1103 Canonical_Combining_Class => 0,
1104 Case_Folding => $CODE_POINT,
1105 Decomposition_Mapping => $CODE_POINT,
1106 Decomposition_Type => 'None',
1107 East_Asian_Width => "Neutral",
1108 FC_NFKC_Closure => $CODE_POINT,
1109 General_Category => 'Cn',
1110 Grapheme_Cluster_Break => 'Other',
1111 Hangul_Syllable_Type => 'NA',
1113 Jamo_Short_Name => "",
1114 Joining_Group => "No_Joining_Group",
1115 # Joining_Type => Complicated; set in code
1116 kIICore => 'N', # Is converted to binary
1117 #Line_Break => Complicated; set in code
1118 Lowercase_Mapping => $CODE_POINT,
1125 Numeric_Type => 'None',
1126 Numeric_Value => 'NaN',
1127 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1128 Sentence_Break => 'Other',
1129 Simple_Case_Folding => $CODE_POINT,
1130 Simple_Lowercase_Mapping => $CODE_POINT,
1131 Simple_Titlecase_Mapping => $CODE_POINT,
1132 Simple_Uppercase_Mapping => $CODE_POINT,
1133 Titlecase_Mapping => $CODE_POINT,
1134 Unicode_1_Name => "",
1135 Unicode_Radical_Stroke => "",
1136 Uppercase_Mapping => $CODE_POINT,
1137 Word_Break => 'Other',
1140 # Below are files that Unicode furnishes, but this program ignores, and why.
1141 # NormalizationCorrections.txt requires some more explanation. It documents
1142 # the cumulative fixes to erroneous normalizations in earlier Unicode
1143 # versions. Its main purpose is so that someone running on an earlier version
1144 # can use this file to override what got published in that earlier release.
1145 # It would be easy for mktables to read and handle this file. But all the
1146 # corrections in it should already be in the other files for the release it
1147 # is. To get it to actually mean something useful, someone would have to be
1148 # using an earlier Unicode release, and copy it to the files for that release
1149 # and recompile. So far there has been no demand to do that, so this hasn't
1151 my %ignored_files = (
1152 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1153 'Index.txt' => 'Alphabetical index of Unicode characters',
1154 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1155 'NamesList.txt' => 'Annotated list of characters',
1156 'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>',
1157 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1158 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1159 'ReadMe.txt' => 'Documentation',
1160 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1161 'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.',
1162 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1163 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1164 'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>',
1165 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1166 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1167 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1168 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1171 my %skipped_files; # List of files that we skip
1173 ### End of externally interesting definitions, except for @input_file_objects
1176 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1177 # This file is machine-generated by $0 from the Unicode
1178 # database, Version $string_version. Any changes made here will be lost!
1181 my $INTERNAL_ONLY_HEADER = <<"EOF";
1183 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1184 # This file is for internal use by core Perl only. The format and even the
1185 # name or existence of this file are subject to change without notice. Don't
1186 # use it directly. Use Unicode::UCD to access the Unicode character data
1190 my $DEVELOPMENT_ONLY=<<"EOF";
1191 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1192 # This file contains information artificially constrained to code points
1193 # present in Unicode release $string_compare_versions.
1194 # IT CANNOT BE RELIED ON. It is for use during development only and should
1195 # not be used for production.
1199 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1200 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1201 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1203 # Matches a legal code point: 4-6 hex digits. If there are 6, the first
1204 # two must be 10; if there are 5, the first must not be a 0. Written this way
1205 # to decrease backtracking. The first regex allows the code point to be at
1206 # the end of a word, but to work properly, the word shouldn't end with a valid
1207 # hex character. The second one won't match a code point at the end of a
1208 # word, and doesn't have the run-on issue
1209 my $run_on_code_point_re =
1210 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1211 my $code_point_re = qr/\b$run_on_code_point_re/;
1213 # This matches the beginning of the line in the Unicode db files that give the
1214 # defaults for code points not listed (i.e., missing) in the file. The code
1215 # depends on this ending with a semi-colon, so it can assume it is a valid
1216 # field when the line is split() by semi-colons
1217 my $missing_defaults_prefix =
1218 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1220 # Property types. Unicode has more types, but these are sufficient for our
1222 my $UNKNOWN = -1; # initialized to illegal value
1223 my $NON_STRING = 1; # Either binary or enum
1225 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1226 # tables, additional true and false tables are
1227 # generated so that false is anything matching the
1228 # default value, and true is everything else.
1229 my $ENUM = 4; # Include catalog
1230 my $STRING = 5; # Anything else: string or misc
1232 # Some input files have lines that give default values for code points not
1233 # contained in the file. Sometimes these should be ignored.
1234 my $NO_DEFAULTS = 0; # Must evaluate to false
1235 my $NOT_IGNORED = 1;
1238 # Range types. Each range has a type. Most ranges are type 0, for normal,
1239 # and will appear in the main body of the tables in the output files, but
1240 # there are other types of ranges as well, listed below, that are specially
1241 # handled. There are pseudo-types as well that will never be stored as a
1242 # type, but will affect the calculation of the type.
1244 # 0 is for normal, non-specials
1245 my $MULTI_CP = 1; # Sequence of more than one code point
1246 my $HANGUL_SYLLABLE = 2;
1247 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1248 my $NULL = 4; # The map is to the null string; utf8.c can't
1249 # handle these, nor is there an accepted syntax
1250 # for them in \p{} constructs
1251 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1252 # otherwise be $MULTI_CP type are instead type 0
1254 # process_generic_property_file() can accept certain overrides in its input.
1255 # Each of these must begin AND end with $CMD_DELIM.
1256 my $CMD_DELIM = "\a";
1257 my $REPLACE_CMD = 'replace'; # Override the Replace
1258 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1263 # Values for the Replace argument to add_range.
1264 # $NO # Don't replace; add only the code points not
1266 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1267 # the comments at the subroutine definition.
1268 my $UNCONDITIONALLY = 2; # Replace without conditions.
1269 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1271 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1273 my $CROAK = 6; # Die with an error if is already there
1275 # Flags to give property statuses. The phrases are to remind maintainers that
1276 # if the flag is changed, the indefinite article referring to it in the
1277 # documentation may need to be as well.
1279 my $DEPRECATED = 'D';
1280 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1281 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1282 my $DISCOURAGED = 'X';
1283 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1284 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1286 my $a_bold_stricter = "a 'B<$STRICTER>'";
1287 my $A_bold_stricter = "A 'B<$STRICTER>'";
1288 my $STABILIZED = 'S';
1289 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1290 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1292 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1293 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1295 my %status_past_participles = (
1296 $DISCOURAGED => 'discouraged',
1297 $STABILIZED => 'stabilized',
1298 $OBSOLETE => 'obsolete',
1299 $DEPRECATED => 'deprecated',
1302 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1303 # externally documented.
1304 my $ORDINARY = 0; # The normal fate.
1305 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1306 # but there is a file written that can be used to
1307 # reconstruct this table
1308 my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
1309 # for Perl's internal use only
1310 my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl.
1311 # Is for backwards compatibility for applications that
1312 # read the file directly, so its format is
1314 my $SUPPRESSED = 4; # The file for this table is not written out, and as a
1315 # result, we don't bother to do many computations on
1317 my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the
1318 # computations anyway, as the values are needed for
1319 # things to work. This happens when we have Perl
1320 # extensions that depend on Unicode tables that
1321 # wouldn't normally be in a given Unicode version.
1323 # The format of the values of the tables:
1324 my $EMPTY_FORMAT = "";
1325 my $BINARY_FORMAT = 'b';
1326 my $DECIMAL_FORMAT = 'd';
1327 my $FLOAT_FORMAT = 'f';
1328 my $INTEGER_FORMAT = 'i';
1329 my $HEX_FORMAT = 'x';
1330 my $RATIONAL_FORMAT = 'r';
1331 my $STRING_FORMAT = 's';
1332 my $ADJUST_FORMAT = 'a';
1333 my $HEX_ADJUST_FORMAT = 'ax';
1334 my $DECOMP_STRING_FORMAT = 'c';
1335 my $STRING_WHITE_SPACE_LIST = 'sw';
1337 my %map_table_formats = (
1338 $BINARY_FORMAT => 'binary',
1339 $DECIMAL_FORMAT => 'single decimal digit',
1340 $FLOAT_FORMAT => 'floating point number',
1341 $INTEGER_FORMAT => 'integer',
1342 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1343 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1344 $STRING_FORMAT => 'string',
1345 $ADJUST_FORMAT => 'some entries need adjustment',
1346 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1347 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1348 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1351 # Unicode didn't put such derived files in a separate directory at first.
1352 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1353 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1354 my $AUXILIARY = 'auxiliary';
1356 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1357 # and into UCD.pl for the use of UCD.pm
1358 my %loose_to_file_of; # loosely maps table names to their respective
1360 my %stricter_to_file_of; # same; but for stricter mapping.
1361 my %loose_property_to_file_of; # Maps a loose property name to its map file
1362 my %file_to_swash_name; # Maps the file name to its corresponding key name
1363 # in the hash %utf8::SwashInfo
1364 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1365 # their rational equivalent
1366 my %loose_property_name_of; # Loosely maps (non_string) property names to
1368 my %string_property_loose_to_name; # Same, for string properties.
1369 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1370 # the property name in standard loose form, and
1371 # 'value' is the default value for that property,
1372 # also in standard loose form.
1373 my %loose_to_standard_value; # loosely maps table names to the canonical
1375 my %ambiguous_names; # keys are alias names (in standard form) that
1376 # have more than one possible meaning.
1377 my %prop_aliases; # Keys are standard property name; values are each
1379 my %prop_value_aliases; # Keys of top level are standard property name;
1380 # values are keys to another hash. Each one is
1381 # one of the property's values, in standard form.
1382 # The values are that prop-val's aliases.
1383 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1385 # Most properties are immune to caseless matching, otherwise you would get
1386 # nonsensical results, as properties are a function of a code point, not
1387 # everything that is caselessly equivalent to that code point. For example,
1388 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1389 # be true because 's' and 'S' are equivalent caselessly. However,
1390 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1391 # extend that concept to those very few properties that are like this. Each
1392 # such property will match the full range caselessly. They are hard-coded in
1393 # the program; it's not worth trying to make it general as it's extremely
1394 # unlikely that they will ever change.
1395 my %caseless_equivalent_to;
1397 # These constant names and values were taken from the Unicode standard,
1398 # version 5.1, section 3.12. They are used in conjunction with Hangul
1399 # syllables. The '_string' versions are so generated tables can retain the
1400 # hex format, which is the more familiar value
1401 my $SBase_string = "0xAC00";
1402 my $SBase = CORE::hex $SBase_string;
1403 my $LBase_string = "0x1100";
1404 my $LBase = CORE::hex $LBase_string;
1405 my $VBase_string = "0x1161";
1406 my $VBase = CORE::hex $VBase_string;
1407 my $TBase_string = "0x11A7";
1408 my $TBase = CORE::hex $TBase_string;
1413 my $NCount = $VCount * $TCount;
1415 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1416 # with the above published constants.
1418 my %Jamo_L; # Leading consonants
1419 my %Jamo_V; # Vowels
1420 my %Jamo_T; # Trailing consonants
1422 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1423 # The key is the base name of the code point, and the value is an
1424 # array giving all the ranges that use this base name. Each range
1425 # is actually a hash giving the 'low' and 'high' values of it.
1426 my %names_ending_in_code_point;
1427 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1428 # removed from the names
1429 # Inverse mapping. The list of ranges that have these kinds of
1430 # names. Each element contains the low, high, and base names in an
1432 my @code_points_ending_in_code_point;
1434 # To hold Unicode's normalization test suite
1435 my @normalization_tests;
1437 # Boolean: does this Unicode version have the hangul syllables, and are we
1438 # writing out a table for them?
1439 my $has_hangul_syllables = 0;
1441 # Does this Unicode version have code points whose names end in their
1442 # respective code points, and are we writing out a table for them? 0 for no;
1443 # otherwise points to first property that a table is needed for them, so that
1444 # if multiple tables are needed, we don't create duplicates
1445 my $needing_code_points_ending_in_code_point = 0;
1447 my @backslash_X_tests; # List of tests read in for testing \X
1448 my @unhandled_properties; # Will contain a list of properties found in
1449 # the input that we didn't process.
1450 my @match_properties; # Properties that have match tables, to be
1452 my @map_properties; # Properties that get map files written
1453 my @named_sequences; # NamedSequences.txt contents.
1454 my %potential_files; # Generated list of all .txt files in the directory
1455 # structure so we can warn if something is being
1457 my @files_actually_output; # List of files we generated.
1458 my @more_Names; # Some code point names are compound; this is used
1459 # to store the extra components of them.
1460 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1461 # the minimum before we consider it equivalent to a
1462 # candidate rational
1463 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1465 # These store references to certain commonly used property objects
1474 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1475 my $has_In_conflicts = 0;
1476 my $has_Is_conflicts = 0;
sub internal_file_to_platform ($) {
    # Translate one of this program's internal file paths, which always use
    # '/' as the separator, into the spelling the current platform expects.
    # An undefined input yields undef.

    my ($internal_path) = @_;

    if (defined $internal_path) {
        my @components = split '/', $internal_path;
        return File::Spec->join(@components);
    }

    return undef;
}
sub file_exists ($) {
    # Platform-independent '-e'.  The argument is a path in this program's
    # internal form, which uses slash as the separator; it is converted to
    # the platform's form before being tested.  An undefined path is reported
    # as not existing (0).

    my ($internal_path) = @_;
    return 0 unless defined $internal_path;

    my $platform_path = internal_file_to_platform($internal_path);
    return -e $platform_path;
}
1496 # Returns the address of the blessed input object.
1497 # It doesn't check for blessedness because that would do a string eval
1498 # every call, and the program is structured so that this is never called
1499 # for a non-blessed object.
1501 no overloading; # If overloaded, numifying below won't work.
1503 # Numifying a ref gives its address.
1504 return pack 'J', $_[0];
1507 # These are used only if $annotate is true.
1508 # The entire range of Unicode characters is examined to populate these
1509 # after all the input has been processed. But most can be skipped, as they
1510 # have the same descriptive phrases, such as being unassigned
1511 my @viacode; # Contains the 1 million character names
1512 my @printable; # boolean: And are those characters printable?
1513 my @annotate_char_type; # Contains a type of those characters, specifically
1514 # for the purposes of annotation.
1515 my $annotate_ranges; # A map of ranges of code points that have the same
1516 # name for the purposes of annotation. They map to the
1517 # upper edge of the range, so that the end point can
1518 # be immediately found. This is used to skip ahead to
1519 # the end of a range, and avoid processing each
1520 # individual code point in it.
1521 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1522 # characters, but excluding those which are
1523 # also noncharacter code points
1525 # The annotation types are an extension of the regular range types, though
1526 # some of the latter are folded into one. Make the new types negative to
1527 # avoid conflicting with the regular types
1528 my $SURROGATE_TYPE = -1;
1529 my $UNASSIGNED_TYPE = -2;
1530 my $PRIVATE_USE_TYPE = -3;
1531 my $NONCHARACTER_TYPE = -4;
1532 my $CONTROL_TYPE = -5;
1533 my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1535 sub populate_char_info ($) {
1536 # Used only with the $annotate option. Populates the arrays with the
1537 # input code point's info that are needed for outputting more detailed
1538 # comments. If calling context wants a return, it is the end point of
1539 # any contiguous range of characters that share essentially the same info
1542 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1544 $viacode[$i] = $perl_charname->value_of($i) || "";
1546 # A character is generally printable if Unicode says it is,
1547 # but below we make sure that most Unicode general category 'C' types
1549 $printable[$i] = $print->contains($i);
1551 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1553 # Only these two regular types are treated specially for annotations
1555 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1556 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1558 # Give a generic name to all code points that don't have a real name.
1559 # We output ranges, if applicable, for these. Also calculate the end
1560 # point of the range.
1562 if (! $viacode[$i]) {
1564 if ($gc-> table('Private_use')->contains($i)) {
1565 $viacode[$i] = 'Private Use';
1566 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1568 $end = $gc->table('Private_Use')->containing_range($i)->end;
1570 elsif ((defined ($nonchar =
1571 Property::property_ref('Noncharacter_Code_Point'))
1572 && $nonchar->table('Y')->contains($i)))
1574 $viacode[$i] = 'Noncharacter';
1575 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1577 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1578 containing_range($i)->end;
1580 elsif ($gc-> table('Control')->contains($i)) {
1581 $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1582 $annotate_char_type[$i] = $CONTROL_TYPE;
1585 elsif ($gc-> table('Unassigned')->contains($i)) {
1586 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1588 if ($v_version lt v2.0.0) { # No blocks in earliest releases
1589 $viacode[$i] = 'Unassigned';
1590 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1593 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1595 # Because we name the unassigned by the blocks they are in, it
1596 # can't go past the end of that block, and it also can't go
1597 # past the unassigned range it is in. The special table makes
1598 # sure that the non-characters, which are unassigned, are
1600 $end = min($block->containing_range($i)->end,
1601 $unassigned_sans_noncharacters->
1602 containing_range($i)->end);
1605 elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
1606 $viacode[$i] = $gc->value_of($i);
1607 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1610 elsif ($gc-> table('Surrogate')->contains($i)) {
1611 $viacode[$i] = 'Surrogate';
1612 $annotate_char_type[$i] = $SURROGATE_TYPE;
1614 $end = $gc->table('Surrogate')->containing_range($i)->end;
1617 Carp::my_carp_bug("Can't figure out how to annotate "
1618 . sprintf("U+%04X", $i)
1619 . ". Proceeding anyway.");
1620 $viacode[$i] = 'UNKNOWN';
1621 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1626 # Here, has a name, but if it's one in which the code point number is
1627 # appended to the name, do that.
1628 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1629 $viacode[$i] .= sprintf("-%04X", $i);
1630 $end = $perl_charname->containing_range($i)->end;
1633 # And here, has a name, but if it's a hangul syllable one, replace it with
1634 # the correct name from the Unicode algorithm
1635 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1637 my $SIndex = $i - $SBase;
1638 my $L = $LBase + $SIndex / $NCount;
1639 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1640 my $T = $TBase + $SIndex % $TCount;
1641 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1642 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1643 $end = $perl_charname->containing_range($i)->end;
1646 return if ! defined wantarray;
1647 return $i if ! defined $end; # If not a range, return the input
1649 # Save this whole range so can find the end point quickly
1650 $annotate_ranges->add_map($i, $end, $end);
1655 # Commented code below should work on Perl 5.8.
1656 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1657 ## the native perl version of it (which is what would operate under miniperl)
1658 ## is extremely slow, as it does a string eval every call.
1659 #my $has_fast_scalar_util = $^X !~ /miniperl/
1660 # && defined eval "require Scalar::Util";
1663 # # Returns the address of the blessed input object. Uses the XS version if
1664 # # available. It doesn't check for blessedness because that would do a
1665 # # string eval every call, and the program is structured so that this is
1666 # # never called for a non-blessed object.
1668 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1670 # # Check at least that is a ref.
1671 # my $pkg = ref($_[0]) or return undef;
1673 # # Change to a fake package to defeat any overloaded stringify
1674 # bless $_[0], 'main::Fake';
1676 # # Numifying a ref gives its address.
1677 # my $addr = pack 'J', $_[0];
1679 # # Return to original class
1680 # bless $_[0], $pkg;
1687 return $a if $a >= $b;
1694 return $a if $a <= $b;
1698 sub clarify_number ($) {
1699 # This returns the input number with underscores inserted every 3 digits
1700 # in large (5 digits or more) numbers. Input must be entirely digits, not
1704 my $pos = length($number) - 3;
1705 return $number if $pos <= 1;
1707 substr($number, $pos, 0) = '_';
1716 # These routines give a uniform treatment of messages in this program. They
1717 # are placed in the Carp package to cause the stack trace to not include them,
1718 # although an alternative would be to use another package and set @CARP_NOT
1721 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1723 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1724 # and overload trying to load Scalar::Util under miniperl. See
1725 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1726 undef $overload::VERSION;
1729 my $message = shift || "";
1730 my $nofold = shift || 0;
1733 $message = main::join_lines($message);
1734 $message =~ s/^$0: *//; # Remove initial program name
1735 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1736 $message = "\n$0: $message;";
1738 # Fold the message with program name, semi-colon end punctuation
1739 # (which looks good with the message that carp appends to it), and a
1740 # hanging indent for continuation lines.
1741 $message = main::simple_fold($message, "", 4) unless $nofold;
1742 $message =~ s/\n$//; # Remove the trailing nl so what carp
1743 # appends is to the same line
1746 return $message if defined wantarray; # If a caller just wants the msg
1753 # This is called when it is clear that the problem is caused by a bug in
1756 my $message = shift;
1757 $message =~ s/^$0: *//;
1758 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
sub carp_too_few_args {
    # Reports, as a bug in this program, that a routine was called with fewer
    # arguments than it needs.  Takes exactly two arguments:
    #   1) a reference to the array of arguments the offending routine got
    #   2) the minimum number of arguments that routine requires
    # No action other than issuing the message is taken.

    if (@_ != 2) {
        my_carp_bug("Wrong number of arguments: to 'carp_too_few_args'. No action taken.");
        return;
    }

    my $args_ref = shift;
    my $count = shift;

    # (caller 1)[3] is the name of the routine that called this one, i.e.,
    # the one that was given too few arguments.
    #
    # join() needs its own parentheses here: as a list operator it would
    # otherwise swallow the trailing concatenation, which would evaluate the
    # array in scalar context and print its element count instead of its
    # contents.
    my_carp_bug("Need at least $count arguments to "
                . (caller 1)[3]
                . ". Instead got: '"
                . join(', ', @$args_ref)
                . "'. No action taken.");
    return;
}
sub carp_extra_args {
    # Reports, as a bug in this program, that a routine was passed arguments
    # it does not recognize.  The single parameter is a reference to the
    # array or hash holding the left-over arguments.  All of them are listed
    # in the message, with undefined ones shown as $UNDEF.  The referenced
    # container is only read, never modified, so the caller's data (often its
    # own @_) is left intact.

    my $args_ref = shift;
    my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;

    unless (ref $args_ref) {
        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
        return;
    }

    # The routine being complained about is the one that called this one.
    my $subroutine = (caller 1)[3];

    my $list;
    if (ref $args_ref eq 'HASH') {

        # Flatten the whole hash to key, value, key, value ...  ('each'
        # would yield just a single pair.)  Keys are sorted so the message
        # is the same from run to run.
        my @pairs;
        foreach my $key (sort keys %$args_ref) {
            my $value = $args_ref->{$key};
            push @pairs, $key, (defined $value) ? $value : $UNDEF;
        }
        $list = join ', ', @pairs;
    }
    elsif (ref $args_ref eq 'ARRAY') {
        $list = join ', ', map { defined $_ ? $_ : $UNDEF } @$args_ref;
    }
    else {
        my_carp_bug("Can't cope with ref "
                    . ref($args_ref)
                    . " argument to 'carp_extra_args'. Not checking arguments.");
        return;
    }

    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
    return;
}
1819 # This program uses the inside-out method for objects, as recommended in
1820 # "Perl Best Practices". This closure aids in generating those. There
1821 # are two routines. setup_package() is called once per package to set
1822 # things up, and then set_access() is called for each hash representing a
1823 # field in the object. These routines arrange for the object to be
1824 # properly destroyed when no longer used, and for standard accessor
1825 # functions to be generated. If you need more complex accessors, just
1826 # write your own and leave those accesses out of the call to set_access().
1827 # More details below.
1829 my %constructor_fields; # fields that are to be used in constructors; see
1832 # The values of this hash will be the package names as keys to other
1833 # hashes containing the name of each field in the package as keys, and
1834 # references to their respective hashes as values.
1838 # Sets up the package, creating standard DESTROY and dump methods
1839 # (unless already defined). The dump method is used in debugging by
1841 # The optional parameters are:
1842 # a) a reference to a hash, that gets populated by later
1843 # set_access() calls with one of the accesses being
1844 # 'constructor'. The caller can then refer to this, but it is
1845 # not otherwise used by these two routines.
1846 # b) a reference to a callback routine to call during destruction
1847 # of the object, before any fields are actually destroyed
1850 my $constructor_ref = delete $args{'Constructor_Fields'};
1851 my $destroy_callback = delete $args{'Destroy_Callback'};
1852 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1855 my $package = (caller)[0];
1857 $package_fields{$package} = \%fields;
1858 $constructor_fields{$package} = $constructor_ref;
1860 unless ($package->can('DESTROY')) {
1861 my $destroy_name = "${package}::DESTROY";
1864 # Use typeglob to give the anonymous subroutine the name we want
1865 *$destroy_name = sub {
1867 my $addr = do { no overloading; pack 'J', $self; };
1869 $self->$destroy_callback if $destroy_callback;
1870 foreach my $field (keys %{$package_fields{$package}}) {
1871 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1872 delete $package_fields{$package}{$field}{$addr};
1878 unless ($package->can('dump')) {
1879 my $dump_name = "${package}::dump";
1883 return dump_inside_out($self, $package_fields{$package}, @_);
1890 # Arrange for the input field to be garbage collected when no longer
1891 # needed. Also, creates standard accessor functions for the field
1892 # based on the optional parameters-- none if none of these parameters:
1893 # 'addable' creates an 'add_NAME()' accessor function.
1894 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1896 # 'settable' creates a 'set_NAME()' accessor function.
1897 # 'constructor' doesn't create an accessor function, but adds the
1898 # field to the hash that was previously passed to
1900 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1901 # 'add' etc. all mean 'addable'.
1902 # The read accessor function will work on both array and scalar
1903 # values. If another accessor in the parameter list is 'a', the read
1904 # access assumes an array. You can also force it to be array access
1905 # by specifying 'readable_array' instead of 'readable'
1907 # A sort-of 'protected' access can be set-up by preceding the addable,
1908 # readable or settable with some initial portion of 'protected_' (but,
1909 # the underscore is required), like 'p_a', 'pro_set', etc. The
1910 # "protection" is only by convention. All that happens is that the
1911 # accessor functions' names begin with an underscore. So instead of
1912 # calling set_foo, the call is _set_foo. (Real protection could be
1913 # accomplished by having a new subroutine, end_package, called at the
1914 # end of each package, and then storing the __LINE__ ranges and
1915 # checking them on every accessor. But that is way overkill.)
1917 # We create anonymous subroutines as the accessors and then use
1918 # typeglobs to assign them to the proper package and name
1920 my $name = shift; # Name of the field
1921 my $field = shift; # Reference to the inside-out hash containing the
1924 my $package = (caller)[0];
1926 if (! exists $package_fields{$package}) {
1927 croak "$0: Must call 'setup_package' before 'set_access'";
1930 # Stash the field so DESTROY can get it.
1931 $package_fields{$package}{$name} = $field;
1933 # Remaining arguments are the accessors. For each...
1934 foreach my $access (@_) {
1935 my $access = lc $access;
1939 # Match the input as far as it goes.
1940 if ($access =~ /^(p[^_]*)_/) {
1942 if (substr('protected_', 0, length $protected)
1946 # Add 1 for the underscore not included in $protected
1947 $access = substr($access, length($protected) + 1);
1955 if (substr('addable', 0, length $access) eq $access) {
1956 my $subname = "${package}::${protected}add_$name";
1959 # add_ accessor. Don't add if already there, which we
1960 # determine using 'eq' for scalars and '==' otherwise.
1963 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1966 my $addr = do { no overloading; pack 'J', $self; };
1967 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1969 return if grep { $value == $_ } @{$field->{$addr}};
1972 return if grep { $value eq $_ } @{$field->{$addr}};
1974 push @{$field->{$addr}}, $value;
1978 elsif (substr('constructor', 0, length $access) eq $access) {
1980 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1983 $constructor_fields{$package}{$name} = $field;
1986 elsif (substr('readable_array', 0, length $access) eq $access) {
1988 # Here has read access. If one of the other parameters for
1989 # access is array, or this one specifies array (by being more
1990 # than just 'readable_'), then create a subroutine that
1991 # assumes the data is an array. Otherwise just a scalar
1992 my $subname = "${package}::${protected}$name";
1993 if (grep { /^a/i } @_
1994 or length($access) > length('readable_'))
1999 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2000 my $addr = do { no overloading; pack 'J', $_[0]; };
2001 if (ref $field->{$addr} ne 'ARRAY') {
2002 my $type = ref $field->{$addr};
2003 $type = 'scalar' unless $type;
2004 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
2007 return scalar @{$field->{$addr}} unless wantarray;
2009 # Make a copy; had problems with caller modifying the
2010 # original otherwise
2011 my @return = @{$field->{$addr}};
2017 # Here not an array value, a simpler function.
2021 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2023 return $field->{pack 'J', $_[0]};
2027 elsif (substr('settable', 0, length $access) eq $access) {
2028 my $subname = "${package}::${protected}set_$name";
2033 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2034 Carp::carp_extra_args(\@_) if @_ > 2;
2036 # $self is $_[0]; $value is $_[1]
2038 $field->{pack 'J', $_[0]} = $_[1];
2043 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2052 # All input files use this object, which stores various attributes about them,
2053 # and provides for convenient, uniform handling. The run method wraps the
2054 # processing. It handles all the bookkeeping of opening, reading, and closing
2055 # the file, returning only significant input lines.
2057 # Each object gets a handler which processes the body of the file, and is
2058 # called by run(). All character property files must use the generic,
2059 # default handler, which has code scrubbed to handle things you might not
2060 # expect, including automatic EBCDIC handling. For files that don't deal with
2061 # mapping code points to a property value, such as test files,
2062 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2063 # handler to be a custom one. Such a handler should basically be a
2064 # while(next_line()) {...} loop.
2066 # You can also set up handlers to
2067 # 1) call before the first line is read, for pre processing
2068 # 2) call to adjust each line of the input before the main handler gets
2069 # them. This can be automatically generated, if appropriately simple
2070 # enough, by specifying a Properties parameter in the constructor.
2071 # 3) call upon EOF before the main handler exits its loop
2072 # 4) call at the end, for post processing
2074 # $_ is used to store the input line, and is to be filtered by the
2075 # each_line_handler()s. So, if the format of the line is not in the desired
2076 # format for the main handler, these are used to do that adjusting. They can
2077 # be stacked (by enclosing them in an [ anonymous array ] in the constructor),
2078 # so the $_ output of one is used as the input to the next. None of the other
2079 # handlers are stackable, but could easily be changed to be so.
2081 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2082 # which insert the parameters as lines to be processed before the next input
2083 # file line is read. This allows the EOF handler to flush buffers, for
2084 # example. The difference between the two routines is that the lines inserted
2085 # by insert_lines() are subjected to the each_line_handler()s. (So if you
2086 # called it from such a handler, you would get infinite recursion.) Lines
2087 # inserted by insert_adjusted_lines() go directly to the main handler without
2088 # any adjustments. If the post-processing handler calls any of these, there
2089 # will be no effect. Some error checking for these conditions could be added,
2090 # but it hasn't been done.
2092 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2093 # to prevent further processing of the line. This routine will output the
2094 # message as a warning once, and then keep a count of the lines that have the
2095 # same message, and output that count at the end of the file's processing.
2096 # This keeps the number of messages down to a manageable amount.
2098 # get_missings() should be called to retrieve any @missing input lines.
2099 # Messages will be raised if this isn't done if the options aren't to ignore
sub trace {
    # Forwards all of its arguments to main::trace() and returns whatever
    # that returns.
    return main::trace(@_);
}
2105 # Keep track of fields that are to be put into the constructor.
2106 my %constructor_fields;
2108 main::setup_package(Constructor_Fields => \%constructor_fields);
2110 my %file; # Input file name, required
2111 main::set_access('file', \%file, qw{ c r });
2113 my %first_released; # Unicode version file was first released in, required
2114 main::set_access('first_released', \%first_released, qw{ c r });
2116 my %handler; # Subroutine to process the input file, defaults to
2117 # 'process_generic_property_file'
2118 main::set_access('handler', \%handler, qw{ c });
2121 # name of property this file is for. defaults to none, meaning not
2122 # applicable, or is otherwise determinable, for example, from each line.
2123 main::set_access('property', \%property, qw{ c r });
2126 # If this is true, the file is optional. If not present, no warning is
2127 # output. If it is present, the string given by this parameter is
2128 # evaluated, and if false the file is not processed.
2129 main::set_access('optional', \%optional, 'c', 'r');
2132 # This is used for debugging, to skip processing of all but a few input
2133 # files. Add 'non_skip => 1' to the constructor for those files you want
2134 # processed when you set the $debug_skip global.
2135 main::set_access('non_skip', \%non_skip, 'c');
2138 # This is used to skip processing of this input file semi-permanently,
2139 # when it evaluates to true. The value should be the reason the file is
2140 # being skipped. It is used for files that we aren't planning to process
2141 # anytime soon, but want to allow to be in the directory and not raise a
2142 # message that we are not handling. Mostly for test files. This is in
2143 # contrast to the non_skip element, which is supposed to be used very
2144 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2145 # pretty much will never look at can be placed in the global
2146 # %ignored_files instead. Ones used here will be added to %skipped_files
2147 main::set_access('skip', \%skip, 'c');
2149 my %each_line_handler;
2150 # list of subroutines to look at and filter each non-comment line in the
2151 # file. defaults to none. The subroutines are called in order, each is
2152 # to adjust $_ for the next one, and the final one adjusts it for
2154 main::set_access('each_line_handler', \%each_line_handler, 'c');
2156 my %properties; # Optional ordered list of the properties that occur in each
2157 # meaningful line of the input file. If present, an appropriate
2158 # each_line_handler() is automatically generated and pushed onto the stack
2159 # of such handlers. This is useful when a file contains multiple
2160 # properties per line, but no other special considerations are necessary.
2161 # The special value "<ignored>" means to discard the corresponding input
2163 # Any @missing lines in the file should also match this syntax; no such
2164 # files exist as of 6.3. But if it happens in a future release, the code
2165 # could be expanded to properly parse them.
2166 main::set_access('properties', \%properties, qw{ c r });
2168 my %has_missings_defaults;
2169 # ? Are there lines in the file giving default values for code points
2170 # missing from it? Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2171 # the norm, but IGNORED means it has such lines, but the handler doesn't
2172 # use them. Having these three states allows us to catch changes to the
2173 # UCD that this program should track. XXX This could be expanded to
2174 # specify the syntax for such lines, like %properties above.
2175 main::set_access('has_missings_defaults',
2176 \%has_missings_defaults, qw{ c r });
2179 # Subroutine to call before doing anything else in the file. If undef, no
2180 # such handler is called.
2181 main::set_access('pre_handler', \%pre_handler, qw{ c });
2184 # Subroutine to call upon getting an EOF on the input file, but before
2185 # that is returned to the main handler. This is to allow buffers to be
2186 # flushed. The handler is expected to call insert_lines() or
2187 # insert_adjusted_lines() with the buffered material
2188 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2191 # Subroutine to call after all the lines of the file are read in and
2192 # processed. If undef, no such handler is called.
2193 main::set_access('post_handler', \%post_handler, qw{ c });
2195 my %progress_message;
2196 # Message to print to display progress in lieu of the standard one
2197 main::set_access('progress_message', \%progress_message, qw{ c });
2200 # cache open file handle, internal. Is undef if file hasn't been
2201 # processed at all, empty if has;
2202 main::set_access('handle', \%handle);
2205 # cache of lines added virtually to the file, internal
2206 main::set_access('added_lines', \%added_lines);
2209 # cache of lines added virtually to the file, internal
2210 main::set_access('remapped_lines', \%remapped_lines);
2213 # cache of errors found, internal
2214 main::set_access('errors', \%errors);
2217 # storage of '@missing' defaults lines
2218 main::set_access('missings', \%missings);
2221 sub _next_line_with_remapped_range;
2226 my $self = bless \do{ my $anonymous_scalar }, $class;
2227 my $addr = do { no overloading; pack 'J', $self; };
2230 $handler{$addr} = \&main::process_generic_property_file;
2231 $non_skip{$addr} = 0;
2233 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2234 $handle{$addr} = undef;
2235 $added_lines{$addr} = [ ];
2236 $remapped_lines{$addr} = [ ];
2237 $each_line_handler{$addr} = [ ];
2238 $errors{$addr} = { };
2239 $missings{$addr} = [ ];
2241 # Two positional parameters.
2242 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2243 $file{$addr} = main::internal_file_to_platform(shift);
2244 $first_released{$addr} = shift;
2246 # The rest of the arguments are key => value pairs
2247 # %constructor_fields has been set up earlier to list all possible
2248 # ones. Either set or push, depending on how the default has been set
2251 foreach my $key (keys %args) {
2252 my $argument = $args{$key};
2254 # Note that the fields are the lower case of the constructor keys
2255 my $hash = $constructor_fields{lc $key};
2256 if (! defined $hash) {
2257 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2260 if (ref $hash->{$addr} eq 'ARRAY') {
2261 if (ref $argument eq 'ARRAY') {
2262 foreach my $argument (@{$argument}) {
2263 next if ! defined $argument;
2264 push @{$hash->{$addr}}, $argument;
2268 push @{$hash->{$addr}}, $argument if defined $argument;
2272 $hash->{$addr} = $argument;
2277 # If the file has a property for it, it means that the property is not
2278 # listed in the file's entries. So add a handler to the list of line
2279 # handlers to insert the property name into the lines, to provide a
2280 # uniform interface to the final processing subroutine.
2281 # That way the final code doesn't have to worry about that.
2282 if ($property{$addr}) {
2283 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2286 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2287 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2290 # If skipping, set to optional, and add to list of ignored files,
2291 # including its reason
2293 $optional{$addr} = 1;
2294 $skipped_files{$file{$addr}} = $skip{$addr}
2296 elsif ($properties{$addr}) {
2298 # Add a handler for each line in the input so that it creates a
2299 # separate input line for each property in those input lines, thus
2300 # making them suitable for process_generic_property_file().
2302 push @{$each_line_handler{$addr}},
2305 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2307 my @fields = split /\s*;\s*/, $_, -1;
2309 if (@fields - 1 > @{$properties{$addr}}) {
2310 $file->carp_bad_line('Extra fields');
2314 my $range = shift @fields; # 0th element is always the
2317 # The next fields in the input line correspond
2318 # respectively to the stored properties.
2319 for my $i (0 .. @{$properties{$addr}} - 1) {
2320 my $property_name = $properties{$addr}[$i];
2321 next if $property_name eq '<ignored>';
2322 $file->insert_adjusted_lines(
2323 "$range; $property_name; $fields[$i]");
2331 { # On non-ascii platforms, we use a special handler
2334 *next_line = (main::NON_ASCII_PLATFORM)
2335 ? *_next_line_with_remapped_range
2345 qw("") => "_operator_stringify",
2346 "." => \&main::_operator_dot,
2347 ".=" => \&main::_operator_dot_equal,
2350 sub _operator_stringify {
2353 return __PACKAGE__ . " object for " . $self->file;
2356 # flag to make sure extracted files are processed early
2357 my $seen_non_extracted_non_age = 0;
2360 # Process the input object $self. This opens and closes the file and
2361 # calls all the handlers for it. Currently, this can only be called
2362 # once per file, as it destroys the EOF handler
2365 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2367 my $addr = do { no overloading; pack 'J', $self; };
2369 my $file = $file{$addr};
2371 # Don't process if not expecting this file (because released later
2372 # than this Unicode version), and isn't there. This means if someone
2373 # copies it into an earlier version's directory, we will go ahead and
2375 return if $first_released{$addr} gt $v_version && ! -e $file;
2377 # If in debugging mode and this file doesn't have the non-skip
2378 # flag set, and isn't one of the critical files, skip it.
2380 && $first_released{$addr} ne v0
2381 && ! $non_skip{$addr})
2383 print "Skipping $file in debugging\n" if $verbosity;
2387 # File could be optional
2388 if ($optional{$addr}) {
2389 return unless -e $file;
2390 my $result = eval $optional{$addr};
2391 if (! defined $result) {
2392 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2397 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2403 if (! defined $file || ! -e $file) {
2405 # If the file doesn't exist, see if have internal data for it
2406 # (based on first_released being 0).
2407 if ($first_released{$addr} eq v0) {
2408 $handle{$addr} = 'pretend_is_open';
2411 if (! $optional{$addr} # File could be optional
2412 && $v_version ge $first_released{$addr})
2414 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2421 # Here, the file exists. Some platforms may change the case of
2423 if ($seen_non_extracted_non_age) {
2424 if ($file =~ /$EXTRACTED/i) {
2425 Carp::my_carp_bug(main::join_lines(<<END
2426 $file should be processed just after the 'Prop...Alias' files, and before
2427 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2428 have subtle problems
2433 elsif ($EXTRACTED_DIR
2434 && $first_released{$addr} ne v0
2435 && $file !~ /$EXTRACTED/i
2436 && lc($file) ne 'dage.txt')
2438 # We don't set this (by the 'if' above) if we have no
2439 # extracted directory, so if running on an early version,
2440 # this test won't work. Not worth worrying about.
2441 $seen_non_extracted_non_age = 1;
2444 # And mark the file as having been processed, and warn if it
2445 # isn't a file we are expecting. As we process the files,
2446 # they are deleted from the hash, so any that remain at the
2447 # end of the program are files that we didn't process.
2448 my $fkey = File::Spec->rel2abs($file);
2449 my $expecting = delete $potential_files{lc($fkey)};
2451 Carp::my_carp("Was not expecting '$file'.") if
2453 && ! defined $handle{$addr};
2455 # Having deleted it from the expected files, we can quit if there is
2456 # nothing to do. Don't print progress unless we really want verbosity
2458 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2462 # Open the file, converting the slashes used in this program
2463 # into the proper form for the OS
2465 if (not open $file_handle, "<", $file) {
2466 Carp::my_carp("Can't open $file. Skipping: $!");
2469 $handle{$addr} = $file_handle; # Cache the open file handle
2471 if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
2472 $_ = <$file_handle>;
2473 if ($_ !~ / - $string_version \. /x) {
2476 die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
2481 if ($verbosity >= $PROGRESS) {
2482 if ($progress_message{$addr}) {
2483 print "$progress_message{$addr}\n";
2486 # If using a virtual file, say so.
2487 print "Processing ", (-e $file)
2489 : "substitute $file",
2495 # Call any special handler for before the file.
2496 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2498 # Then the main handler
2499 &{$handler{$addr}}($self);
2501 # Then any special post-file handler.
2502 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2504 # If any errors have been accumulated, output the counts (as the first
2505 # error message in each class was output when it was encountered).
2506 if ($errors{$addr}) {
2509 foreach my $error (keys %{$errors{$addr}}) {
2510 $total += $errors{$addr}->{$error};
2511 delete $errors{$addr}->{$error};
2516 = "A total of $total lines had errors in $file. ";
2518 $message .= ($types == 1)
2519 ? '(Only the first one was displayed.)'
2520 : '(Only the first of each type was displayed.)';
2521 Carp::my_carp($message);
2525 if (@{$missings{$addr}}) {
2526 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2529 # If a real file handle, close it.
2530 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2532 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2533 # the file, as opposed to undef
2538 # Sets $_ to be the next logical input line, if any. Returns non-zero
2539 # if such a line exists. 'logical' means that any lines that have
2540 # been added via insert_lines() will be returned in $_ before the file
2544 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2546 my $addr = do { no overloading; pack 'J', $self; };
2548 # Here the file is open (or if the handle is not a ref, is an open
2549 # 'virtual' file). Get the next line; any inserted lines get priority
2550 # over the file itself.
2554 while (1) { # Loop until find non-comment, non-empty line
2555 #local $to_trace = 1 if main::DEBUG;
2556 my $inserted_ref = shift @{$added_lines{$addr}};
2557 if (defined $inserted_ref) {
2558 ($adjusted, $_) = @{$inserted_ref};
2559 trace $adjusted, $_ if main::DEBUG && $to_trace;
2560 return 1 if $adjusted;
2563 last if ! ref $handle{$addr}; # Don't read unless is real file
2564 last if ! defined ($_ = readline $handle{$addr});
2567 trace $_ if main::DEBUG && $to_trace;
2569 # See if this line is the comment line that defines what property
2570 # value that code points that are not listed in the file should
2571 # have. The format or existence of these lines is not guaranteed
2572 # by Unicode since they are comments, but the documentation says
2573 # that this was added for machine-readability, so probably won't
2574 # change. This works starting in Unicode Version 5.0. They look
2577 # @missing: 0000..10FFFF; Not_Reordered
2578 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2579 # @missing: 0000..10FFFF; ; NaN
2581 # Save the line for a later get_missings() call.
2582 if (/$missing_defaults_prefix/) {
2583 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2584 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2586 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2587 my @defaults = split /\s* ; \s*/x, $_;
2589 # The first field is the @missing, which ends in a
2590 # semi-colon, so can safely shift.
2593 # Some of these lines may have empty field placeholders
2594 # which get in the way. An example is:
2595 # @missing: 0000..10FFFF; ; NaN
2596 # Remove them. Process starting from the top so the
2597 # splice doesn't affect things still to be looked at.
2598 for (my $i = @defaults - 1; $i >= 0; $i--) {
2599 next if $defaults[$i] ne "";
2600 splice @defaults, $i, 1;
2603 # What's left should be just the property (maybe) and the
2604 # default. Having only one element means it doesn't have
2608 if (@defaults >= 1) {
2609 if (@defaults == 1) {
2610 $default = $defaults[0];
2613 $property = $defaults[0];
2614 $default = $defaults[1];
2620 || ($default =~ /^</
2621 && $default !~ /^<code *point>$/i
2622 && $default !~ /^<none>$/i
2623 && $default !~ /^<script>$/i))
2625 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2629 # If the property is missing from the line, it should
2630 # be the one for the whole file
2631 $property = $property{$addr} if ! defined $property;
2633 # Change <none> to the null string, which is what it
2634 # really means. If the default is the code point
2635 # itself, set it to <code point>, which is what
2636 # Unicode uses (but sometimes they've forgotten the
2638 if ($default =~ /^<none>$/i) {
2641 elsif ($default =~ /^<code *point>$/i) {
2642 $default = $CODE_POINT;
2644 elsif ($default =~ /^<script>$/i) {
2646 # Special case this one. Currently is from
2647 # ScriptExtensions.txt, and means for all unlisted
2648 # code points, use their Script property values.
2649 # For the code points not listed in that file, the
2650 # default value is 'Unknown'.
2651 $default = "Unknown";
2654 # Store them as sub-arrays with both components.
2655 push @{$missings{$addr}}, [ $default, $property ];
2659 # There is nothing for the caller to process on this comment
2664 # Remove comments and trailing space, and skip this line if the
2670 # Call any handlers for this line, and skip further processing of
2671 # the line if the handler sets the line to null.
2672 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2677 # Here the line is ok. return success.
2679 } # End of looping through lines.
2681 # If there is an EOF handler, call it (only once) and if it generates
2682 # more lines to process go back in the loop to handle them.
2683 if ($eof_handler{$addr}) {
2684 &{$eof_handler{$addr}}($self);
2685 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2686 goto LINE if $added_lines{$addr};
2689 # Return failure -- no more lines.
2694 sub _next_line_with_remapped_range {
2696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2698 # like _next_line(), but for use on non-ASCII platforms. It sets $_
2699 # to be the next logical input line, if any. Returns non-zero if such
2700 # a line exists. 'logical' means that any lines that have been added
2701 # via insert_lines() will be returned in $_ before the file is read
2704 # The difference from _next_line() is that this remaps the Unicode
2705 # code points in the input to those of the native platform. Each
2706 # input line contains a single code point, or a single contiguous
2707 # range of them. This routine splits each range into its individual
2708 # code points and caches them. It returns the cached values,
2709 # translated into their native equivalents, one at a time, for each
2710 # call, before reading the next line. Since native values can only be
2711 # a single byte wide, no translation is needed for code points above
2712 # 0xFF, and ranges that are entirely above that number are not split.
2713 # If an input line contains the range 254-1000, it would be split into
2714 # three elements: 254, 255, and 256-1000. (The downstream table
2715 # insertion code will sort and coalesce the individual code points
2716 # into appropriate ranges.)
2718 my $addr = do { no overloading; pack 'J', $self; };
2722 # Look in cache before reading the next line. Return any cached
2724 my $inserted = shift @{$remapped_lines{$addr}};
2725 if (defined $inserted) {
2726 trace $inserted if main::DEBUG && $to_trace;
2727 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
2728 trace $_ if main::DEBUG && $to_trace;
2732 # Get the next line.
2733 return 0 unless _next_line($self);
2735 # If there is a special handler for it, return the line,
2736 # untranslated. This should happen only for files that are
2737 # special, not being code-point related, such as property names.
2738 return 1 if $handler{$addr}
2739 != \&main::process_generic_property_file;
2741 my ($range, $property_name, $map, @remainder)
2742 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
2745 || ! defined $property_name
2746 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
2748 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored");
2752 my $high = (defined $2) ? hex $2 : $low;
2754 # If the input maps the range to another code point, remap the
2755 # target if it is between 0 and 255.
2758 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
2759 $tail = "$property_name; $map";
2760 $_ = "$range; $tail";
2763 $tail = $property_name;
2766 # If entire range is above 255, just return it, unchanged (except
2767 # any mapped-to code point, already changed above)
2768 return 1 if $low > 255;
2770 # Cache an entry for every code point <= 255. For those in the
2771 # range above 255, return a dummy entry for just that portion of
2772 # the range. Note that this will be out-of-order, but that is not
2774 foreach my $code_point ($low .. $high) {
2775 if ($code_point > 255) {
2776 $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
2779 push @{$remapped_lines{$addr}}, "$code_point; $tail";
2781 } # End of looping through lines.
2786 # Not currently used, not fully tested.
2788 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2789 # # record. Not callable from an each_line_handler(), nor does it call
2790 # # an each_line_handler() on the line.
2793 # my $addr = do { no overloading; pack 'J', $self; };
2795 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2796 # my ($adjusted, $line) = @{$inserted_ref};
2797 # next if $adjusted;
2799 # # Remove comments and trailing space, and return a non-empty
2802 # $line =~ s/\s+$//;
2803 # return $line if $line ne "";
2806 # return if ! ref $handle{$addr}; # Don't read unless is real file
2807 # while (1) { # Loop until find non-comment, non-empty line
2808 # local $to_trace = 1 if main::DEBUG;
2809 # trace $_ if main::DEBUG && $to_trace;
2810 # return if ! defined (my $line = readline $handle{$addr});
2812 # push @{$added_lines{$addr}}, [ 0, $line ];
2815 # $line =~ s/\s+$//;
2816 # return $line if $line ne "";
2824 # Lines can be inserted so that it looks like they were in the input
2825 # file at the place it was when this routine is called. See also
2826 # insert_adjusted_lines(). Lines inserted via this routine go through
2827 # any each_line_handler()
2831 # Each inserted line is an array, with the first element being 0 to
2832 # indicate that this line hasn't been adjusted, and needs to be
2835 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2839 sub insert_adjusted_lines {
2840 # Lines can be inserted so that it looks like they were in the input
2841 # file at the place it was when this routine is called. See also
2842 # insert_lines(). Lines inserted via this routine are already fully
2843 # adjusted, ready to be processed; each_line_handler()s handlers will
2844 # not be called. This means this is not a completely general
2845 # facility, as only the last each_line_handler on the stack should
2846 # call this. It could be made more general, by passing to each of the
2847 # line_handlers their position on the stack, which they would pass on
2848 # to this routine, and that would replace the boolean first element in
2849 # the anonymous array pushed here, so that the next_line routine could
2850 # use that to call only those handlers whose index is after it on the
2851 # stack. But this is overkill for what is needed now.
2854 trace $_[0] if main::DEBUG && $to_trace;
2856 # Each inserted line is an array, with the first element being 1 to
2857 # indicate that this line has been adjusted
2859 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2864 # Returns the stored up @missings lines' values, and clears the list.
2865 # The values are in an array, consisting of the default in the first
2866 # element, and the property in the 2nd. However, since these lines
2867 # can be stacked up, the return is an array of all these arrays.
2870 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2872 my $addr = do { no overloading; pack 'J', $self; };
2874 # If not accepting a list return, just return the first one.
2875 return shift @{$missings{$addr}} unless wantarray;
2877 my @return = @{$missings{$addr}};
2878 undef @{$missings{$addr}};
2882 sub _insert_property_into_line {
2883 # Add a property field to $_, if this file requires it.
2886 my $addr = do { no overloading; pack 'J', $self; };
2887 my $property = $property{$addr};
2888 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2890 $_ =~ s/(;|$)/; $property$1/;
2895 # Output consistent error messages, using either a generic one, or the
2896 # one given by the optional parameter. To avoid gazillions of the
2897 # same message in case the syntax of a file is way off, this routine
2898 # only outputs the first instance of each message, incrementing a
2899 # count so the totals can be output at the end of the file.
2902 my $message = shift;
2903 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2905 my $addr = do { no overloading; pack 'J', $self; };
2907 $message = 'Unexpected line' unless $message;
2909 # No trailing punctuation so as to fit with our addenda.
2910 $message =~ s/[.:;,]$//;
2912 # If haven't seen this exact message before, output it now. Otherwise
2913 # increment the count of how many times it has occurred
2914 unless ($errors{$addr}->{$message}) {
2915 Carp::my_carp("$message in '$_' in "
2917 . " at line $.. Skipping this line;");
2918 $errors{$addr}->{$message} = 1;
2921 $errors{$addr}->{$message}++;
2924 # Clear the line to prevent any further (meaningful) processing of it.
2931 package Multi_Default;
2933 # Certain properties in early versions of Unicode had more than one possible
2934 # default for code points missing from the files. In these cases, one
2935 # default applies to everything left over after all the others are applied,
2936 # and for each of the others, there is a description of which class of code
2937 # points applies to it. This object helps implement this by storing the
2938 # defaults, and for all but that final default, an eval string that generates
2939 # the class that it applies to.
2944 main::setup_package();
2947 # The defaults structure for the classes
2948 main::set_access('class_defaults', \%class_defaults);
2951 # The default that applies to everything left over.
2952 main::set_access('other_default', \%other_default, 'r');
2956 # The constructor is called with default => eval pairs, terminated by
2957 # the left-over default. e.g.
2958 # Multi_Default->new(
2959 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2961 # 'R' => 'some other expression that evaluates to code points',
2969 my $self = bless \do{my $anonymous_scalar}, $class;
2970 my $addr = do { no overloading; pack 'J', $self; };
2973 my $default = shift;
2975 $class_defaults{$addr}->{$default} = $eval;
2978 $other_default{$addr} = shift;
2983 sub get_next_defaults {
2984 # Iterates and returns the next class of defaults.
2986 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2988 my $addr = do { no overloading; pack 'J', $self; };
2990 return each %{$class_defaults{$addr}};
2996 # An alias is one of the names that a table goes by. This class defines them
2997 # including some attributes. Everything is currently set up in the
3003 main::setup_package();
3006 main::set_access('name', \%name, 'r');
3009 # Should this name match loosely or not.
3010 main::set_access('loose_match', \%loose_match, 'r');
3012 my %make_re_pod_entry;
3013 # Some aliases should not get their own entries in the re section of the
3014 # pod, because they are covered by a wild-card, and some we want to
3015 # discourage use of. Binary
3016 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3019 # Is this documented to be accessible via Unicode::UCD
3020 main::set_access('ucd', \%ucd, 'r', 's');
3023 # Aliases have a status, like deprecated, or even suppressed (which means
3024 # they don't appear in documentation). Enum
3025 main::set_access('status', \%status, 'r');
3028 # Similarly, some aliases should not be considered as usable ones for
3029 # external use, such as file names, or we don't want documentation to
3030 # recommend them. Boolean
3031 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3036 my $self = bless \do { my $anonymous_scalar }, $class;
3037 my $addr = do { no overloading; pack 'J', $self; };
3039 $name{$addr} = shift;
3040 $loose_match{$addr} = shift;
3041 $make_re_pod_entry{$addr} = shift;
3042 $ok_as_filename{$addr} = shift;
3043 $status{$addr} = shift;
3044 $ucd{$addr} = shift;
3046 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3048 # Null names are never ok externally
3049 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3057 # A range is the basic unit for storing code points, and is described in the
3058 # comments at the beginning of the program. Each range has a starting code
3059 # point; an ending code point (not less than the starting one); a value
3060 # that applies to every code point in between the two end-points, inclusive;
3061 # and an enum type that applies to the value. The type is for the user's
3062 # convenience, and has no meaning here, except that a non-zero type is
3063 # considered to not obey the normal Unicode rules for having standard forms.
3065 # The same structure is used for both map and match tables, even though in the
3066 # latter, the value (and hence type) is irrelevant and could be used as a
3067 # comment. In map tables, the value is what all the code points in the range
3068 # map to. Type 0 values have the standardized version of the value stored as
3069 # well, so as to not have to recalculate it a lot.
3071 sub trace { return main::trace(@_); } # Thin wrapper: forwards all arguments to main::trace() and returns its result
3075 main::setup_package();
3078 main::set_access('start', \%start, 'r', 's');
3081 main::set_access('end', \%end, 'r', 's');
3084 main::set_access('value', \%value, 'r');
3087 main::set_access('type', \%type, 'r');
3090 # The value in internal standard form. Defined only if the type is 0.
3091 main::set_access('standard_form', \%standard_form);
3093 # Note that if these fields change, the dump() method should as well
3096 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3099 my $self = bless \do { my $anonymous_scalar }, $class;
3100 my $addr = do { no overloading; pack 'J', $self; };
3102 $start{$addr} = shift;
3103 $end{$addr} = shift;
3107 my $value = delete $args{'Value'}; # Can be 0
3108 $value = "" unless defined $value;
3109 $value{$addr} = $value;
3111 $type{$addr} = delete $args{'Type'} || 0;
3113 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3120 qw("") => "_operator_stringify",
3121 "." => \&main::_operator_dot,
3122 ".=" => \&main::_operator_dot_equal,
3125 sub _operator_stringify {
3127 my $addr = do { no overloading; pack 'J', $self; };
3129 # Output it like '0041..0065 (value)'
3130 my $return = sprintf("%04X", $start{$addr})
3132 . sprintf("%04X", $end{$addr});
3133 my $value = $value{$addr};
3134 my $type = $type{$addr};
3136 $return .= "$value";
3137 $return .= ", Type=$type" if $type != 0;
3144 # Calculate the standard form only if needed, and cache the result.
3145 # The standard form is the value itself if the type is special.
3146 # This represents a considerable CPU and memory saving - at the time
3147 # of writing there are 368676 non-special objects, but the standard
3148 # form is only requested for 22047 of them - ie about 6%.
3151 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3153 my $addr = do { no overloading; pack 'J', $self; };
3155 return $standard_form{$addr} if defined $standard_form{$addr};
3157 my $value = $value{$addr};
3158 return $value if $type{$addr};
3159 return $standard_form{$addr} = main::standardize($value);
3163 # Human, not machine readable. For machine readable, comment out this
3164 # entire routine and let the standard one take effect.
3167 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3169 my $addr = do { no overloading; pack 'J', $self; };
3171 my $return = $indent
3172 . sprintf("%04X", $start{$addr})
3174 . sprintf("%04X", $end{$addr})
3175 . " '$value{$addr}';";
3176 if (! defined $standard_form{$addr}) {
3177 $return .= "(type=$type{$addr})";
3179 elsif ($standard_form{$addr} ne $value{$addr}) {
3180 $return .= "(standard '$standard_form{$addr}')";
3186 package _Range_List_Base;
3188 # Base class for range lists. A range list is simply an ordered list of
3189 # ranges, so that the ranges with the lowest starting numbers are first in it.
3191 # When a new range is added that is adjacent to an existing range that has the
3192 # same value and type, it merges with it to form a larger range.
3194 # Ranges generally do not overlap, except that there can be multiple entries
3195 # of single code point ranges. This is because of NameAliases.txt.
3197 # In this program, there is a standard value such that if two different
3198 # values have the same standard value, they are considered equivalent. This
3199 # value was chosen so that it gives correct results on Unicode data
3201 # There are a number of methods to manipulate range lists, and some operators
3202 # are overloaded to handle them.
3204 sub trace { return main::trace(@_); } # Thin wrapper: forwards all arguments to main::trace() and returns its result
3210 # Max is initialized to a negative value that isn't adjacent to 0, for
3214 main::setup_package();
3217 # The list of ranges
3218 main::set_access('ranges', \%ranges, 'readable_array');
3221 # The highest code point in the list. This was originally a method, but
3222 # actual measurements said it was used a lot.
3223 main::set_access('max', \%max, 'r');
3225 my %each_range_iterator;
3226 # Iterator position for each_range()
3227 main::set_access('each_range_iterator', \%each_range_iterator);
3230 # Name of parent this is attached to, if any. Solely for better error
3232 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3234 my %_search_ranges_cache;
3235 # A cache of the previous result from _search_ranges(), for better
3237 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3243 # Optional initialization data for the range list.
3244 my $initialize = delete $args{'Initialize'};
3248 # Use _union() to initialize. _union() returns an object of this
3249 # class, which means that it will call this constructor recursively.
3250 # But it won't have this $initialize parameter so that it won't
3251 # infinitely loop on this.
3252 return _union($class, $initialize, %args) if defined $initialize;
3254 $self = bless \do { my $anonymous_scalar }, $class;
3255 my $addr = do { no overloading; pack 'J', $self; };
3257 # Optional parent object, only for debug info.
3258 $owner_name_of{$addr} = delete $args{'Owner'};
3259 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3261 # Stringify, in case it is an object.
3262 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3264 # This is used only for error messages, and so a colon is added
3265 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3267 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3269 $max{$addr} = $max_init;
3271 $_search_ranges_cache{$addr} = 0;
3272 $ranges{$addr} = [];
3279 qw("") => "_operator_stringify",
3280 "." => \&main::_operator_dot,
3281 ".=" => \&main::_operator_dot_equal,
3284 sub _operator_stringify {
3286 my $addr = do { no overloading; pack 'J', $self; };
3288 return "Range_List attached to '$owner_name_of{$addr}'"
3289 if $owner_name_of{$addr};
3290 return "anonymous Range_List " . \$self;
3294 # Returns the union of the input code points. It can be called as
3295 # either a constructor or a method. If called as a method, the result
3296 # will be a new() instance of the calling object, containing the union
3297 # of that object with the other parameter's code points; if called as
3298 # a constructor, the first parameter gives the class that the new object
3299 # should be, and the second parameter gives the code points to go into
3301 # In either case, there are two parameters looked at by this routine;
3302 # any additional parameters are passed to the new() constructor.
3304 # The code points can come in the form of some object that contains
3305 # ranges, and has a conventionally named method to access them; or
3306 # they can be an array of individual code points (as integers); or
3307 # just a single code point.
3309 # If they are ranges, this routine doesn't make any effort to preserve
3310 # the range values and types of one input over the other. Therefore
3311 # this base class should not allow _union to be called from other than
3312 # initialization code, so as to prevent two tables from being added
3313 # together where the range values matter. The general form of this
3314 # routine therefore belongs in a derived class, but it was moved here
3315 # to avoid duplication of code. The failure to overload this in this
3316 # class keeps it safe.
3318 # It does make the effort during initialization to accept tables with
3319 # multiple values for the same code point, and to preserve the order
3320 # of these. If there is only one input range or range set, it doesn't
3321 # sort (as it should already be sorted to the desired order), and will
3322 # accept multiple values per code point. Otherwise it will merge
3323 # multiple values into a single one.
3326 my @args; # Arguments to pass to the constructor
3330 # If a method call, will start the union with the object itself, and
3331 # the class of the new object will be the same as self.
3338 # Add the other required parameter.
3340 # Rest of parameters are passed on to the constructor
3342 # Accumulate all records from both lists.
3344 my $input_count = 0;
3345 for my $arg (@args) {
3346 #local $to_trace = 0 if main::DEBUG;
3347 trace "argument = $arg" if main::DEBUG && $to_trace;
3348 if (! defined $arg) {
3350 if (defined $self) {
3352 $message .= $owner_name_of{pack 'J', $self};
3354 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
3358 $arg = [ $arg ] if ! ref $arg;
3359 my $type = ref $arg;
3360 if ($type eq 'ARRAY') {
3361 foreach my $element (@$arg) {
3362 push @records, Range->new($element, $element);
3366 elsif ($arg->isa('Range')) {
3367 push @records, $arg;
3370 elsif ($arg->can('ranges')) {
3371 push @records, $arg->ranges;
3376 if (defined $self) {
3378 $message .= $owner_name_of{pack 'J', $self};
3380 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3385 # Sort with the range containing the lowest ordinal first, but if
3386 # two ranges start at the same code point, sort with the bigger range
3387 # of the two first, because it takes fewer cycles.
3388 if ($input_count > 1) {
3389 @records = sort { ($a->start <=> $b->start)
3391 # if b is shorter than a, b->end will be
3392 # less than a->end, and we want to select
3393 # a, so want to return -1
3394 ($b->end <=> $a->end)
3398 my $new = $class->new(@_);
3400 # Fold in records so long as they add new information.
3401 for my $set (@records) {
3402 my $start = $set->start;
3403 my $end = $set->end;
3404 my $value = $set->value;
3405 my $type = $set->type;
3406 if ($start > $new->max) {
3407 $new->_add_delete('+', $start, $end, $value, Type => $type);
3409 elsif ($end > $new->max) {
3410 $new->_add_delete('+', $new->max +1, $end, $value,
3413 elsif ($input_count == 1) {
3414 # Here, overlaps existing range, but is from a single input,
3415 # so preserve the multiple values from that input.
3416 $new->_add_delete('+', $start, $end, $value, Type => $type,
3417 Replace => $MULTIPLE_AFTER);
3424 sub range_count { # Return the number of ranges in the range list
# Under main::DEBUG, anything still left in @_ at this point is reported
# as extra arguments.
3426 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# The ranges of this object are held in the array referenced by
# $ranges{pack 'J', $self}; the number of elements in it is the result.
3429 return scalar @{$ranges{pack 'J', $self}};
3433 # Returns the minimum code point currently in the range list, or if
3434 # the range list is empty, 2 beyond the max possible. This is a
3435 # method because it is used so rarely that it is not worth saving the
3436 # value between calls and having to update it as ranges are added and
# deleted.
3440 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Key under which this object's data is stored in the per-object hashes
3442 my $addr = do { no overloading; pack 'J', $self; };
3444 # If the range list is empty, return a large value that isn't adjacent
3445 # to any that could be in the range list, for simpler tests
3446 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
# Otherwise the answer is the start of the first range in the list
3447 return $ranges{$addr}->[0]->start;
3451 # Boolean: Is argument in the range list? If so returns $i such that:
3452 # range[$i-1]->start <= $codepoint <= range[$i-1]->end
3453 # which is one beyond what you want; this is so that the 0th range
3454 # doesn't return false
# (containing_range() compensates by indexing the list with $i-1.)
3456 my $codepoint = shift;
3457 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3459 my $i = $self->_search_ranges($codepoint);
# An undefined result means the search found no possible position (or
# failed), so the code point cannot be in the list.
3460 return 0 unless defined $i;
3462 # The search returns $i, such that
3463 # range[$i-1]->end < $codepoint <= range[$i]->end
3464 # So is in the table if and only if it is at least the start position
# of range[$i]; otherwise it lies in the gap just before that range.
3467 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3471 sub containing_range {
3472 # Returns the range object that contains the code point, undef if none
# The argument is the code point to look up.
3475 my $codepoint = shift;
3476 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Let contains() do the search; its result is an index offset by one.
3478 my $i = $self->contains($codepoint);
3481 # contains() returns 1 beyond where we should look
3483 return $ranges{pack 'J', $self}->[$i-1];
3487 # Returns the value associated with the code point, undef if none
# The argument is the code point to look up.
3490 my $codepoint = shift;
3491 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Find the range holding the code point; nothing is returned if there
# isn't one.
3493 my $range = $self->containing_range($codepoint);
3494 return unless defined $range;
# The value is a property of the whole range, not of the single code point
3496 return $range->value;
3500 # Returns the type of the range containing the code point, undef if
3501 # the code point is not in the table
# The argument is the code point to look up.
3504 my $codepoint = shift;
3505 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Find the range holding the code point; nothing is returned if there
# isn't one.
3507 my $range = $self->containing_range($codepoint);
3508 return unless defined $range;
3510 return $range->type;
3513 sub _search_ranges {
3514 # Find the range in the list which contains a code point, or where it
3515 # should go if we were to add it. That is, it returns $i, such that:
3516 # range[$i-1]->end < $codepoint <= range[$i]->end
3517 # Returns undef if no such $i is possible (e.g. at end of table), or
3518 # if there is an error.
3521 my $code_point = shift;
3522 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Key under which this object's data is stored in the per-object hashes
3524 my $addr = do { no overloading; pack 'J', $self; };
# Nothing to find if the code point is above the stored maximum for this
# list.
3526 return if $code_point > $max{$addr};
3527 my $r = $ranges{$addr}; # The current list of ranges
3528 my $range_list_size = scalar @$r;
3531 use integer; # want integer division
3533 # Use the cached result as the starting guess for this one, because,
3534 # an experiment on 5.1 showed that 90% of the time the cache was the
3535 # same as the result on the next call (and 7% it was one less).
3536 $i = $_search_ranges_cache{$addr};
3537 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3538 # from an intervening deletion)
3539 #local $to_trace = 1 if main::DEBUG;
3540 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
# The guess is right if it satisfies both halves of the constraint given
# in the header comment.
3541 return $i if $code_point <= $r->[$i]->end
3542 && ($i == 0 || $r->[$i-1]->end < $code_point);
3544 # Here the cache doesn't yield the correct $i. Try adding 1.
3545 if ($i < $range_list_size - 1
3546 && $r->[$i]->end < $code_point &&
3547 $code_point <= $r->[$i+1]->end)
3550 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3551 $_search_ranges_cache{$addr} = $i;
3555 # Here, adding 1 also didn't work. We do a binary search to
3556 # find the correct position, starting with current $i
3558 my $upper = $range_list_size - 1;
3560 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3562 if ($code_point <= $r->[$i]->end) {
3564 # Here we have met the upper constraint. We can quit if we
3565 # also meet the lower one.
3566 last if $i == 0 || $r->[$i-1]->end < $code_point;
3568 $upper = $i; # Still too high.
3573 # Here, $r->[$i]->end < $code_point, so look higher up.
3577 # Split search domain in half to try again.
3578 my $temp = ($upper + $lower) / 2;
3580 # No point in continuing unless $i changes for next time
3584 # We can't reach the highest element because of the averaging.
3585 # So if one below the upper edge, force it there and try one more time.
3587 if ($i == $range_list_size - 2) {
3589 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3590 $i = $range_list_size - 1;
3592 # Change $lower as well so if fails next time through,
3593 # taking the average will yield the same $i, and we will
3594 # quit with the error message just below.
3598 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3602 } # End of while loop
3604 if (main::DEBUG && $to_trace) {
3605 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3606 trace "i= [ $i ]", $r->[$i];
3607 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3610 # Here we have found the offset. Cache it as a starting point for the
# next call.
3612 $_search_ranges_cache{$addr} = $i;
3617 # Add, replace or delete ranges to or from a list. The $type
3618 # parameter gives which:
3619 # '+' => insert or replace a range, returning a list of any changed
3621 # '-' => delete a range, returning a list of any deleted ranges.
3623 # The next three parameters give respectively the start, end, and
3624 # value associated with the range. 'value' should be null unless the
3627 # The range list is kept sorted so that the range with the lowest
3628 # starting position is first in the list, and generally, adjacent
3629 # ranges with the same values are merged into a single larger one (see
3630 # exceptions below).
3632 # There are more parameters; all are key => value pairs:
3633 # Type gives the type of the value. It is only valid for '+'.
3634 # All ranges have types; if this parameter is omitted, 0 is
3635 # assumed. Ranges with type 0 are assumed to obey the
3636 # Unicode rules for casing, etc; ranges with other types are
3637 # not. Otherwise, the type is arbitrary, for the caller's
3638 # convenience, and looked at only by this routine to keep
3639 # adjacent ranges of different types from being merged into
3640 # a single larger range, and when Replace =>
3641 # $IF_NOT_EQUIVALENT is specified (see just below).
3642 # Replace determines what to do if the range list already contains
3643 # ranges which coincide with all or portions of the input
3644 # range. It is only valid for '+':
3645 # => $NO means that the new value is not to replace
3646 # any existing ones, but any empty gaps of the
3647 # range list coinciding with the input range
3648 # will be filled in with the new value.
3649 # => $UNCONDITIONALLY means to replace the existing values with
3650 # this one unconditionally. However, if the
3651 # new and old values are identical, the
3652 # replacement is skipped to save cycles
3653 # => $IF_NOT_EQUIVALENT means to replace the existing values
3654 # (the default) with this one if they are not equivalent.
3655 # Ranges are equivalent if their types are the
3656 # same, and they are the same string; or if
3657 # both are type 0 ranges, if their Unicode
3658 # standard forms are identical. In this last
3659 # case, the routine chooses the more "modern"
3660 # one to use. This is because some of the
3661 # older files are formatted with values that
3662 # are, for example, ALL CAPs, whereas the
3663 # derived files have a more modern style,
3664 # which looks better. By looking for this
3665 # style when the pre-existing and replacement
3666 # standard forms are the same, we can move to
3668 # => $MULTIPLE_BEFORE means that if this range duplicates an
3669 # existing one, but has a different value,
3670 # don't replace the existing one, but insert
3671 # this one, so that the same range can occur
3672 # multiple times. They are stored LIFO, so
3673 # that the final one inserted is the first one
3674 # returned in an ordered search of the table.
3675 # If this is an exact duplicate, including the
3676 # value, the original will be moved to be
3677 # first, before any other duplicate ranges
3678 # with different values.
3679 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3680 # FIFO, so that this one is inserted after all
3681 # others that currently exist. If this is an
3682 # exact duplicate, including value, of an
3683 # existing range, this one is discarded
3684 # (leaving the existing one in its original,
3685 # higher priority position)
3686 # => anything else is the same as => $IF_NOT_EQUIVALENT
3688 # "same value" means identical for non-type-0 ranges, and it means
3689 # having the same standard forms for type-0 ranges.
3691 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3694 my $operation = shift; # '+' for add/replace; '-' for delete;
3701 $value = "" if not defined $value; # warning: $value can be "0"
3703 my $replace = delete $args{'Replace'};
3704 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3706 my $type = delete $args{'Type'};
3707 $type = 0 unless defined $type;
3709 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3711 my $addr = do { no overloading; pack 'J', $self; };
3713 if ($operation ne '+' && $operation ne '-') {
3714 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3717 unless (defined $start && defined $end) {
3718 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3721 unless ($end >= $start) {
3722 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3725 if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3726 Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
3728 #local $to_trace = 1 if main::DEBUG;
3730 if ($operation eq '-') {
3731 if ($replace != $IF_NOT_EQUIVALENT) {
3732 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3733 $replace = $IF_NOT_EQUIVALENT;
3736 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3740 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3745 my $r = $ranges{$addr}; # The current list of ranges
3746 my $range_list_size = scalar @$r; # And its size
3747 my $max = $max{$addr}; # The current high code point in
3748 # the list of ranges
3750 # Do a special case requiring fewer machine cycles when the new range
3751 # starts after the current highest point. The Unicode input data is
3752 # structured so this is common.
3753 if ($start > $max) {
3755 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3756 return if $operation eq '-'; # Deleting a non-existing range is a
3759 # If the new range doesn't logically extend the current final one
3760 # in the range list, create a new range at the end of the range
3761 # list. (max cleverly is initialized to a negative number not
3762 # adjacent to 0 if the range list is empty, so even adding a range
3763 # to an empty range list starting at 0 will have this 'if'
3765 if ($start > $max + 1 # non-adjacent means can't extend.
3766 || @{$r}[-1]->value ne $value # values differ, can't extend.
3767 || @{$r}[-1]->type != $type # types differ, can't extend.
3769 push @$r, Range->new($start, $end,
3775 # Here, the new range starts just after the current highest in
3776 # the range list, and they have the same type and value.
3777 # Extend the current range to incorporate the new one.
3778 @{$r}[-1]->set_end($end);
3781 # This becomes the new maximum.
3786 #local $to_trace = 0 if main::DEBUG;
3788 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3790 # Here, the input range isn't after the whole rest of the range list.
3791 # Most likely 'splice' will be needed. The rest of the routine finds
3792 # the needed splice parameters, and if necessary, does the splice.
3793 # First, find the offset parameter needed by the splice function for
3794 # the input range. Note that the input range may span multiple
3795 # existing ones, but we'll worry about that later. For now, just find
3796 # the beginning. If the input range is to be inserted starting in a
3797 # position not currently in the range list, it must (obviously) come
3798 # just after the range below it, and just before the range above it.
3799 # Slightly less obviously, it will occupy the position currently
3800 # occupied by the range that is to come after it. More formally, we
3801 # are looking for the position, $i, in the array of ranges, such that:
3803 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3805 # (The ordered relationships within existing ranges are also shown in
3806 # the equation above). However, if the start of the input range is
3807 # within an existing range, the splice offset should point to that
3808 # existing range's position in the list; that is $i satisfies a
3809 # somewhat different equation, namely:
3811 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3813 # More briefly, $start can come before or after r[$i]->start, and at
3814 # this point, we don't know which it will be. However, these
3815 # two equations share these constraints:
3817 # r[$i-1]->end < $start <= r[$i]->end
3819 # And that is good enough to find $i.
3821 my $i = $self->_search_ranges($start);
3823 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3827 # The search function returns $i such that:
3829 # r[$i-1]->end < $start <= r[$i]->end
3831 # That means that $i points to the first range in the range list
3832 # that could possibly be affected by this operation. We still don't
3833 # know if the start of the input range is within r[$i], or if it
3834 # points to empty space between r[$i-1] and r[$i].
3835 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3837 # Special case the insertion of data that is not to replace any
3839 if ($replace == $NO) { # If $NO, has to be operation '+'
3840 #local $to_trace = 1 if main::DEBUG;
3841 trace "Doesn't replace" if main::DEBUG && $to_trace;
3843 # Here, the new range is to take effect only on those code points
3844 # that aren't already in an existing range. This can be done by
3845 # looking through the existing range list and finding the gaps in
3846 # the ranges that this new range affects, and then calling this
3847 # function recursively on each of those gaps, leaving untouched
3848 # anything already in the list. Gather up a list of the changed
3849 # gaps first so that changes to the internal state as new ranges
3850 # are added won't be a problem.
3853 # First, if the starting point of the input range is outside an
3854 # existing one, there is a gap from there to the beginning of the
3855 # existing range -- add a span to fill the part that this new
3857 if ($start < $r->[$i]->start) {
3858 push @gap_list, Range->new($start,
3860 $r->[$i]->start - 1),
3862 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3865 # Then look through the range list for other gaps until we reach
3866 # the highest range affected by the input one.
3868 for ($j = $i+1; $j < $range_list_size; $j++) {
3869 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3870 last if $end < $r->[$j]->start;
3872 # If there is a gap between when this range starts and the
3873 # previous one ends, add a span to fill it. Note that just
3874 # because there are two ranges doesn't mean there is a
3875 # non-zero gap between them. It could be that they have
3876 # different values or types
3877 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3879 Range->new($r->[$j-1]->end + 1,
3880 $r->[$j]->start - 1,
3882 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3886 # Here, we have either found an existing range in the range list,
3887 # beyond the area affected by the input one, or we fell off the
3888 # end of the loop because the input range affects the whole rest
3889 # of the range list. In either case, $j is 1 higher than the
3890 # highest affected range. If $j == $i, it means that there are no
3891 # affected ranges, that the entire insertion is in the gap between
3892 # r[$i-1], and r[$i], which we already have taken care of before
3894 # On the other hand, if there are affected ranges, it might be
3895 # that there is a gap that needs filling after the final such
3896 # range to the end of the input range
3897 if ($r->[$j-1]->end < $end) {
3898 push @gap_list, Range->new(main::max($start,
3899 $r->[$j-1]->end + 1),
3902 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3905 # Call recursively to fill in all the gaps.
3906 foreach my $gap (@gap_list) {
3907 $self->_add_delete($operation,
3917 # Here, we have taken care of the case where $replace is $NO.
3918 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3919 # If inserting a multiple record, this is where it goes, before the
3920 # first (if any) existing one if inserting LIFO. (If this is to go
3921 # afterwards, FIFO, we below move the pointer to there.) These imply
3922 # an insertion, and no change to any existing ranges. Note that $i
3923 # can be -1 if this new range doesn't actually duplicate any existing,
3924 # and comes at the beginning of the list.
3925 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3927 if ($start != $end) {
3928 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3932 # If the new code point is within a current range ...
3933 if ($end >= $r->[$i]->start) {
3935 # Don't add an exact duplicate, as it isn't really a multiple
3936 my $existing_value = $r->[$i]->value;
3937 my $existing_type = $r->[$i]->type;
3938 return if $value eq $existing_value && $type eq $existing_type;
3940 # If the multiple value is part of an existing range, we want
3941 # to split up that range, so that only the single code point
3942 # is affected. To do this, we first call ourselves
3943 # recursively to delete that code point from the table, having
3944 # preserved its current data above. Then we call ourselves
3945 # recursively again to add the new multiple, which we know by
3946 # the test just above is different than the current code
3947 # point's value, so it will become a range containing a single
3948 # code point: just itself. Finally, we add back in the
3949 # pre-existing code point, which will again be a single code
3950 # point range. Because 'i' likely will have changed as a
3951 # result of these operations, we can't just continue on, but
3952 # do this operation recursively as well. If we are inserting
3953 # LIFO, the pre-existing code point needs to go after the new
3954 # one, so use MULTIPLE_AFTER; and vice versa.
3955 if ($r->[$i]->start != $r->[$i]->end) {
3956 $self->_add_delete('-', $start, $end, "");
3957 $self->_add_delete('+', $start, $end, $value, Type => $type);
3958 return $self->_add_delete('+',
3961 Type => $existing_type,
3962 Replace => ($replace == $MULTIPLE_BEFORE)
3964 : $MULTIPLE_BEFORE);
3968 # If to place this new record after, move to beyond all existing
3969 # ones; but don't add this one if identical to any of them, as it
3970 # isn't really a multiple. This leaves the original order, so
3971 # that the current request is ignored. The reasoning is that the
3972 # previous request that wanted this record to have high priority
3973 # should have precedence.
3974 if ($replace == $MULTIPLE_AFTER) {
3975 while ($i < @$r && $r->[$i]->start == $start) {
3976 return if $value eq $r->[$i]->value
3977 && $type eq $r->[$i]->type;
3982 # If instead we are to place this new record before any
3983 # existing ones, remove any identical ones that come after it.
3984 # This changes the existing order so that the new one is
3985 # first, as is being requested.
3986 for (my $j = $i + 1;
3987 $j < @$r && $r->[$j]->start == $start;
3990 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3992 last; # There should only be one instance, so no
3993 # need to keep looking
3998 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3999 my @return = splice @$r,
4006 if (main::DEBUG && $to_trace) {
4007 trace "After splice:";
4008 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4009 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4010 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
4011 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4012 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4013 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4018 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
4019 # leaves delete, insert, and replace either unconditionally or if not
4020 # equivalent. $i still points to the first potential affected range.
4021 # Now find the highest range affected, which will determine the length
4022 # parameter to splice. (The input range can span multiple existing
4023 # ones.) If this isn't a deletion, while we are looking through the
4024 # range list, see also if this is a replacement rather than a clean
4025 # insertion; that is if it will change the values of at least one
4026 # existing range. Start off assuming it is an insert, until find it
4028 my $clean_insert = $operation eq '+';
4029 my $j; # This will point to the highest affected range
4031 # For non-zero types, the standard form is the value itself;
4032 my $standard_form = ($type) ? $value : main::standardize($value);
4034 for ($j = $i; $j < $range_list_size; $j++) {
4035 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4037 # If find a range that it doesn't overlap into, we can stop
4039 last if $end < $r->[$j]->start;
4041 # Here, overlaps the range at $j. If the values don't match,
4042 # and so far we think this is a clean insertion, it becomes a
4043 # non-clean insertion, i.e., a 'change' or 'replace' instead.
4044 if ($clean_insert) {
4045 if ($r->[$j]->standard_form ne $standard_form) {
4047 if ($replace == $CROAK) {
4048 main::croak("The range to add "
4049 . sprintf("%04X", $start)
4051 . sprintf("%04X", $end)
4052 . " with value '$value' overlaps an existing range $r->[$j]");
4057 # Here, the two values are essentially the same. If the
4058 # two are actually identical, replacing wouldn't change
4059 # anything so skip it.
4060 my $pre_existing = $r->[$j]->value;
4061 if ($pre_existing ne $value) {
4063 # Here the new and old standardized values are the
4064 # same, but the non-standardized values aren't. If
4065 # replacing unconditionally, then replace
4066 if( $replace == $UNCONDITIONALLY) {
4071 # Here, are replacing conditionally. Decide to
4072 # replace or not based on which appears to look
4073 # the "nicest". If one is mixed case and the
4074 # other isn't, choose the mixed case one.
4075 my $new_mixed = $value =~ /[A-Z]/
4076 && $value =~ /[a-z]/;
4077 my $old_mixed = $pre_existing =~ /[A-Z]/
4078 && $pre_existing =~ /[a-z]/;
4080 if ($old_mixed != $new_mixed) {
4081 $clean_insert = 0 if $new_mixed;
4082 if (main::DEBUG && $to_trace) {
4083 if ($clean_insert) {
4084 trace "Retaining $pre_existing over $value";
4087 trace "Replacing $pre_existing with $value";
4093 # Here casing wasn't different between the two.
4094 # If one has hyphens or underscores and the
4095 # other doesn't, choose the one with the
4097 my $new_punct = $value =~ /[-_]/;
4098 my $old_punct = $pre_existing =~ /[-_]/;
4100 if ($old_punct != $new_punct) {
4101 $clean_insert = 0 if $new_punct;
4102 if (main::DEBUG && $to_trace) {
4103 if ($clean_insert) {
4104 trace "Retaining $pre_existing over $value";
4107 trace "Replacing $pre_existing with $value";
4110 } # else existing one is just as "good";
4111 # retain it to save cycles.
4117 } # End of loop looking for highest affected range.
4119 # Here, $j points to one beyond the highest range that this insertion
4120 # affects (hence to beyond the range list if that range is the final
4121 # one in the range list).
4123 # The splice length is all the affected ranges. Get it before
4124 # subtracting, for efficiency, so we don't have to later add 1.
4125 my $length = $j - $i;
4127 $j--; # $j now points to the highest affected range.
4128 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4130 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4131 # $j points to the highest affected range. But it can be < $i or even
4132 # -1. These happen only if the insertion is entirely in the gap
4133 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
4134 # above exited first time through with $end < $r->[$i]->start. (And
4135 # then we subtracted one from j) This implies also that $start <
4136 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4137 # $start, so the entire input range is in the gap.
4140 # Here the entire input range is in the gap before $i.
4142 if (main::DEBUG && $to_trace) {
4144 trace "Entire range is between $r->[$i-1] and $r->[$i]";
4147 trace "Entire range is before $r->[$i]";
4150 return if $operation ne '+'; # Deletion of a non-existent range is
4155 # Here part of the input range is not in the gap before $i. Thus,
4156 # there is at least one affected one, and $j points to the highest
4159 # At this point, here is the situation:
4160 # This is not an insertion of a multiple, nor of tentative ($NO)
4162 # $i points to the first element in the current range list that
4163 # may be affected by this operation. In fact, we know
4164 # that the range at $i is affected because we are in
4165 # the else branch of this 'if'
4166 # $j points to the highest affected range.
4168 # r[$i-1]->end < $start <= r[$i]->end
4170 # r[$i-1]->end < $start <= $end <= r[$j]->end
4173 # $clean_insert is a boolean which is set true if and only if
4174 # this is a "clean insertion", i.e., not a change nor a
4175 # deletion (multiple was handled above).
4177 # We now have enough information to decide if this call is a no-op
4178 # or not. It is a no-op if this is an insertion of already
4181 if (main::DEBUG && $to_trace && $clean_insert
4183 && $start >= $r->[$i]->start)
4187 return if $clean_insert
4188 && $i == $j # more than one affected range => not no-op
4190 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4191 # Further, $start and/or $end is >= r[$i]->start
4192 # The test below hence guarantees that
4193 # r[$i]->start <= $start <= $end <= r[$i]->end
4194 # This means the input range is contained entirely in
4195 # the one at $i, so is a no-op
4196 && $start >= $r->[$i]->start;
4199 # Here, we know that some action will have to be taken. We have
4200 # calculated the offset and length (though adjustments may be needed)
4201 # for the splice. Now start constructing the replacement list.
4203 my $splice_start = $i;
4208 # See if should extend any adjacent ranges.
4209 if ($operation eq '-') { # Don't extend deletions
4210 $extends_below = $extends_above = 0;
4212 else { # Here, should extend any adjacent ranges. See if there are
4214 $extends_below = ($i > 0
4215 # can't extend unless adjacent
4216 && $r->[$i-1]->end == $start -1
4217 # can't extend unless are same standard value
4218 && $r->[$i-1]->standard_form eq $standard_form
4219 # can't extend unless share type
4220 && $r->[$i-1]->type == $type);
4221 $extends_above = ($j+1 < $range_list_size
4222 && $r->[$j+1]->start == $end +1
4223 && $r->[$j+1]->standard_form eq $standard_form
4224 && $r->[$j+1]->type == $type);
4226 if ($extends_below && $extends_above) { # Adds to both
4227 $splice_start--; # start replace at element below
4228 $length += 2; # will replace on both sides
4229 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4231 # The result will fill in any gap, replacing both sides, and
4232 # create one large range.
4233 @replacement = Range->new($r->[$i-1]->start,
4240 # Here we know that the result won't just be the conglomeration of
4241 # a new range with both its adjacent neighbors. But it could
4242 # extend one of them.
4244 if ($extends_below) {
4246 # Here the new element adds to the one below, but not to the
4247 # one above. If inserting, and only to that one range, can
4248 # just change its ending to include the new one.
4249 if ($length == 0 && $clean_insert) {
4250 $r->[$i-1]->set_end($end);
4251 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4255 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4256 $splice_start--; # start replace at element below
4257 $length++; # will replace the element below
4258 $start = $r->[$i-1]->start;
4261 elsif ($extends_above) {
4263 # Here the new element adds to the one above, but not below.
4264 # Mirror the code above
4265 if ($length == 0 && $clean_insert) {
4266 $r->[$j+1]->set_start($start);
4267 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4271 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4272 $length++; # will replace the element above
4273 $end = $r->[$j+1]->end;
4277 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4279 # Finally, here we know there will have to be a splice.
4280 # If the change or delete affects only the highest portion of the
4281 # first affected range, the range will have to be split. The
4282 # splice will remove the whole range, but will replace it by a new
4283 # range containing just the unaffected part. So, in this case,
4284 # add to the replacement list just this unaffected portion.
4285 if (! $extends_below
4286 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4289 Range->new($r->[$i]->start,
4291 Value => $r->[$i]->value,
4292 Type => $r->[$i]->type);
4295 # In the case of an insert or change, but not a delete, we have to
4296 # put in the new stuff; this comes next.
4297 if ($operation eq '+') {
4298 push @replacement, Range->new($start,
4304 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4305 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4307 # And finally, if we're changing or deleting only a portion of the
4308 # highest affected range, it must be split, as the lowest one was.
4309 if (! $extends_above
4310 && $j >= 0 # Remember that j can be -1 if before first
4312 && $end >= $r->[$j]->start
4313 && $end < $r->[$j]->end)
4316 Range->new($end + 1,
4318 Value => $r->[$j]->value,
4319 Type => $r->[$j]->type);
4323 # And do the splice, as calculated above
4324 if (main::DEBUG && $to_trace) {
4325 trace "replacing $length element(s) at $i with ";
4326 foreach my $replacement (@replacement) {
4327 trace " $replacement";
4329 trace "Before splice:";
4330 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4331 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4332 trace "i =[", $i, "]", $r->[$i];
4333 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4334 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4337 my @return = splice @$r, $splice_start, $length, @replacement;
4339 if (main::DEBUG && $to_trace) {
4340 trace "After splice:";
4341 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4342 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4343 trace "i =[", $i, "]", $r->[$i];
4344 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4345 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4346 trace "removed ", @return if @return;
4349 # An actual deletion could have changed the maximum in the list.
4350 # There was no deletion if the splice didn't return something, but
4351 # otherwise recalculate it. This is done too rarely to worry about
4353 if ($operation eq '-' && @return) {
4355 $max{$addr} = $r->[-1]->end;
4358 $max{$addr} = $max_init;
# Forget this list's position in an each_range() iteration.  each_range()
# treats an undefined iterator entry as "not started yet", so the next call
# to it begins again with the first range.
4364 sub reset_each_range { # reset the iterator for each_range();
4366 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4369 undef $each_range_iterator{pack 'J', $self};
4374 # Iterate over each range in a range list. Results are undefined if
4375 # the range list is changed during the iteration.
# Each call returns the next range of this object's list.  An empty list
# returns immediately without a value.
4378 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4380 my $addr = do { no overloading; pack 'J', $self; };
4382 return if $self->is_empty;
# An undefined iterator means a fresh iteration: start at -1 so that the
# increment below lands on index 0.
4384 $each_range_iterator{$addr} = -1
4385 if ! defined $each_range_iterator{$addr};
4386 $each_range_iterator{$addr}++;
4387 return $ranges{$addr}->[$each_range_iterator{$addr}]
4388 if $each_range_iterator{$addr} < @{$ranges{$addr}};
# Ran off the end of the list: clear the iterator so that a later call
# starts over from the first range.
4389 undef $each_range_iterator{$addr};
4393 sub count { # Returns count of code points in range list
4395 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4397 my $addr = do { no overloading; pack 'J', $self; };
# Sum the sizes of the individual ranges.  Both end points of a range are
# included in it, hence the "+ 1".
4400 foreach my $range (@{$ranges{$addr}}) {
4401 $count += $range->end - $range->start + 1;
4406 sub delete_range { # Delete a range
4411 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Delegates to _add_delete() with the '-' (delete) operation; the final
# argument is passed as an empty string.  The return value is whatever
# _add_delete() returns.
4413 return $self->_add_delete('-', $start, $end, "");
4416 sub is_empty { # Returns boolean as to if a range list is empty
4418 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# True when this object's array of ranges has no elements.
4421 return scalar @{$ranges{pack 'J', $self}} == 0;
4425 # Quickly returns a scalar suitable for separating tables into
4426 # buckets, i.e. it is a hash function of the contents of a table, so
4427 # there are relatively few conflicts.
4430 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4432 my $addr = do { no overloading; pack 'J', $self; };
4434 # These are quickly computable. Return looks like 'min..max;count',
# where 'count' is the number of ranges in the list (the size of the
# ranges array), not the number of code points.
4435 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4437 } # End closure for _Range_List_Base
4440 use parent '-norequire', '_Range_List_Base';
4442 # A Range_List is a range list for match tables; i.e. the range values are
4443 # not significant. Thus a number of operations can be safely added to it,
4444 # such as inversion, intersection. Note that union is also an unsafe
4445 # operation when range values are cared about, and that method is in the base
4446 # class, not here. But things are set up so that that method is callable only
4447 # during initialization. Only in this derived class, is there an operation
4448 # that combines two tables. A Range_Map can thus be used to initialize a
4449 # Range_List, and its mappings will be in the list, but are not significant to
# Package-local shorthand: passes all of its arguments through to
# main::trace() and returns whatever that returns.
4452 sub trace { return main::trace(@_); }
# Operator overloads for this package.
# '+' : union of the two operands, via _union().
4458 '+' => sub { my $self = shift;
4461 return $self->_union($other)
# '+=' : also a union via _union().  It has an error path that reports a
# bug through Carp::my_carp_bug(); the message says undef is returned.
4463 '+=' => sub { my $self = shift;
4465 my $reversed = shift;
4468 Carp::my_carp_bug("Bad news. Can't cope with '"
4472 . "'. undef returned.");
4476 return $self->_union($other)
# '&' : intersection, via _intersect() with its check-only flag off (0), so
# a new list is computed rather than just an overlap boolean.
4478 '&' => sub { my $self = shift;
4481 return $self->_intersect($other, 0);
# '&=' : same intersection call as '&', with the same kind of error path
# as '+='.
4483 '&=' => sub { my $self = shift;
4485 my $reversed = shift;
4488 Carp::my_carp_bug("Bad news. Can't cope with '"
4492 . "'. undef returned.");
4496 return $self->_intersect($other, 0);
4503 # Returns a new Range_List that gives all code points not in $self.
# The result is built up in a fresh, empty Range_List; $self is only read.
4507 my $new = Range_List->new;
4509 # Go through each range in the table, finding the gaps between them
4510 my $max = -1; # Set so no gap before range beginning at 0
# NOTE(review): $max is expected to be advanced to the end of each range as
# the loop proceeds, so that it always holds the highest code point seen so
# far -- confirm.
4511 for my $range ($self->ranges) {
4512 my $start = $range->start;
4513 my $end = $range->end;
4515 # If there is a gap before this range, the inverse will contain
4517 if ($start > $max + 1) {
4518 $new->add_range($max + 1, $start - 1);
4523 # And finally, add the gap from the end of the table to the max
4524 # possible code point
4525 if ($max < $MAX_UNICODE_CODEPOINT) {
4526 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4532 # Returns a new Range_List with the argument deleted from it. The
4533 # argument can be a single code point, a range, or something that has
4534 # a range, with the _range_list() method on it returning them
4538 my $reversed = shift;
4539 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4542 Carp::my_carp_bug("Bad news. Can't cope with '"
4546 . "'. undef returned.");
# Work on a copy initialized from $self, so that $self itself is left
# unmodified by the deletions below.
4550 my $new = Range_List->new(Initialize => $self);
# Dispatch on the kind of argument: a non-reference is taken to be one code
# point; a Range object contributes its start..end; anything that can
# _range_list() has every one of its ranges deleted in turn.
4552 if (! ref $other) { # Single code point
4553 $new->delete_range($other, $other);
4555 elsif ($other->isa('Range')) {
4556 $new->delete_range($other->start, $other->end);
4558 elsif ($other->can('_range_list')) {
4559 foreach my $range ($other->_range_list->ranges) {
4560 $new->delete_range($range->start, $range->end);
# Any other kind of argument is reported as a bug, and, per the message, the
# subtraction is not performed.
4564 Carp::my_carp_bug("Can't cope with a "
4566 . " argument to '-'. Subtraction ignored."
4575 # Returns either a boolean giving whether the two inputs' range lists
4576 # intersect (overlap), or a new Range_List containing the intersection
4577 # of the two lists. The optional final parameter being true indicates
4578 # to do the check instead of the intersection.
# In check mode the return is 1 as soon as a common code point is found, and
# 0 if none is.  Otherwise the return is the new list, which may be empty.
4580 my $a_object = shift;
4581 my $b_object = shift;
4582 my $check_if_overlapping = shift;
4583 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4584 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# An undefined second operand is reported as a bug; per the message, no
# intersection is done in that case.
4586 if (! defined $b_object) {
4588 $message .= $a_object->_owner_name_of if defined $a_object;
4589 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4593 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + ~b )
4594 # Thus the intersection could much more simply be written:
4595 # return ~(~$a_object + ~$b_object);
4596 # But, this is slower, and when taking the inverse of a large
4597 # range_size_1 table, back when such tables were always stored that
4598 # way, it became prohibitively slow, hence the code was changed to the
# Normalize the second operand: a bare Range is wrapped in a Range_List, and
# anything that can _range_list() is replaced by that range list.
4601 if ($b_object->isa('Range')) {
4602 $b_object = Range_List->new(Initialize => $b_object,
4603 Owner => $a_object->_owner_name_of);
4605 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4607 my @a_ranges = $a_object->ranges;
4608 my @b_ranges = $b_object->ranges;
4610 #local $to_trace = 1 if main::DEBUG;
4611 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4613 # Start with the first range in each list
4615 my $range_a = $a_ranges[$a_i];
4617 my $range_b = $b_ranges[$b_i];
# The result list is only created when an intersection (rather than just an
# overlap check) was asked for; it takes its owner name from $a_object.
4619 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4620 if ! $check_if_overlapping;
4622 # If either list is empty, there is no intersection and no overlap
4623 if (! defined $range_a || ! defined $range_b) {
4624 return $check_if_overlapping ? 0 : $new;
4626 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4628 # Otherwise, must calculate the intersection/overlap. Start with the
4629 # very first code point in each list
4630 my $a = $range_a->start;
4631 my $b = $range_b->start;
4633 # Loop through all the ranges of each list; in each iteration, $a and
4634 # $b are the current code points in their respective lists
4637 # If $a and $b are the same code point, ...
4640 # it means the lists overlap. If just checking for overlap
4641 # know the answer now,
4642 return 1 if $check_if_overlapping;
4644 # The intersection includes this code point plus anything else
4645 # common to both current ranges.
4647 my $end = main::min($range_a->end, $range_b->end);
# (The check-only case has already returned just above, so this condition
# always holds when reached.)
4648 if (! $check_if_overlapping) {
4649 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4650 $new->add_range($start, $end);
4653 # Skip ahead to the end of the current intersect
4656 # If the current intersect ends at the end of either range (as
4657 # it must for at least one of them), the next possible one
4658 # will be the beginning code point in its list's next range.
# Running out of ranges in either list ends the loop: nothing further can
# be common to both.
4659 if ($a == $range_a->end) {
4660 $range_a = $a_ranges[++$a_i];
4661 last unless defined $range_a;
4662 $a = $range_a->start;
4664 if ($b == $range_b->end) {
4665 $range_b = $b_ranges[++$b_i];
4666 last unless defined $range_b;
4667 $b = $range_b->start;
4670 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4674 # Not equal, but if the range containing $a encompasses $b,
4675 # change $a to be the middle of the range where it does equal
4676 # $b, so the next iteration will get the intersection
4677 if ($range_a->end >= $b) {
4682 # Here, the current range containing $a is entirely below
4683 # $b. Go try to find a range that could contain $b.
4684 $a_i = $a_object->_search_ranges($b);
4686 # If no range found, quit.
4687 last unless defined $a_i;
4689 # The search returns $a_i, such that
4690 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4691 # Set $a to the beginning of this new range, and repeat.
4692 $range_a = $a_ranges[$a_i];
4693 $a = $range_a->start;
4696 else { # Here, $b < $a.
4698 # Mirror image code to the leg just above
4699 if ($range_b->end >= $a) {
4703 $b_i = $b_object->_search_ranges($a);
4704 last unless defined $b_i;
4705 $range_b = $b_ranges[$b_i];
4706 $b = $range_b->start;
4709 } # End of looping through ranges.
4711 # Intersection fully computed, or now know that there is no overlap
4712 return $check_if_overlapping ? 0 : $new;
4716 # Returns boolean giving whether the two arguments overlap somewhere
4720 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Uses _intersect() in its check-only mode (final argument 1), which returns
# a boolean rather than building the intersection.
4722 return $self->_intersect($other, 1);
4726 # Add a range to the list.
4731 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Delegates to _add_delete() with the '+' (add) operation.  The value for
# the range is passed as an empty string.
4733 return $self->_add_delete('+', $start, $end, "");
4736 sub matches_identically_to {
4737 # Return a boolean as to whether or not two Range_Lists match identical
4738 # sets of code points.
# Only the start and end of each range are compared; nothing else about the
# ranges is consulted here.
4742 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4744 # These are ordered in increasing real time to figure out (at least
4745 # until a patch changes that and doesn't change this)
# Each is a cheap necessary condition; failing any one proves the two lists
# differ without having to walk the ranges.
4746 return 0 if $self->max != $other->max;
4747 return 0 if $self->min != $other->min;
4748 return 0 if $self->range_count != $other->range_count;
4749 return 0 if $self->count != $other->count;
4751 # Here they could be identical because all the tests above passed.
4752 # The loop below is somewhat simpler since we know they have the same
4753 # number of elements. Compare range by range, until reach the end or
4754 # find something that differs.
4755 my @a_ranges = $self->ranges;
4756 my @b_ranges = $other->ranges;
4757 for my $i (0 .. @a_ranges - 1) {
4758 my $a = $a_ranges[$i];
4759 my $b = $b_ranges[$i];
4760 trace "self $a; other $b" if main::DEBUG && $to_trace;
4761 return 0 if ! defined $b
4762 || $a->start != $b->start
4763 || $a->end != $b->end;
4768 sub is_code_point_usable {
4769 # This is used only for making the test script. See if the input
4770 # proposed trial code point is one that Perl will handle. If second
4771 # parameter is 0, it won't select some code points for various
4772 # reasons, noted below.
# The "less desirable" code points below all return $try_hard itself, so
# they are reported usable exactly when the caller passed a true $try_hard.
4775 my $try_hard = shift;
4776 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4778 return 0 if $code < 0; # Never use a negative
4780 # shun null. I'm (khw) not sure why this was done, but NULL would be
4781 # the character very frequently used.
4782 return $try_hard if $code == 0x0000;
4784 # shun non-character code points.
4785 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
# The mask tests the low 16 bits, so this catches xxFFFE and xxFFFF in
# every 0x10000-sized block of code points.
4786 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4788 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
4789 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4794 sub get_valid_code_point {
4795 # Return a code point that's part of the range list. Returns nothing
4796 # if the table is empty or we can't find a suitable code point. This
4797 # is used only for making the test script.
4800 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# NOTE(review): $addr does not appear to be used in the rest of this
# routine -- confirm, and remove if so.
4802 my $addr = do { no overloading; pack 'J', $self; };
4804 # On first pass, don't choose less desirable code points; if no good
4805 # one is found, repeat, allowing a less desirable one to be selected.
4806 for my $try_hard (0, 1) {
4808 # Look through all the ranges for a usable code point.
# The ranges are visited in reverse of the order that ranges() returns
# them.
4809 for my $set (reverse $self->ranges) {
4811 # Try the edge cases first, starting with the end point of the
4813 my $end = $set->end;
4814 return $end if is_code_point_usable($end, $try_hard);
4816 # End point didn't work. Start at the beginning and try
4817 # every one until find one that does work.
4818 for my $trial ($set->start .. $end - 1) {
4819 return $trial if is_code_point_usable($trial, $try_hard);
4823 return (); # If none found, give up.
4826 sub get_invalid_code_point {
4827 # Return a code point that's not part of the table. Returns nothing
4828 # if the table covers all code points or a suitable code point can't
4829 # be found. This is used only for making the test script.
4832 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4834 # Just find a valid code point of the inverse, if any.
# '~ $self' yields the inverse; a new Range_List is initialized from it
# and asked for one of its (usable) code points.
4835 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4837 } # end closure for Range_List
4840 use parent '-norequire', '_Range_List_Base';
4842 # A Range_Map is a range list in which the range values (called maps) are
4843 # significant, and hence shouldn't be manipulated by our other code, which
4844 # could be ambiguous or lose things. For example, in taking the union of two
4845 # lists, which share code points, but which have differing values, which one
4846 # has precedence in the union?
4847 # It turns out that these operations aren't really necessary for map tables,
4848 # and so this class was created to make sure they aren't accidentally
4854 # Add a range containing a mapping value to the list
4857 # Rest of parameters passed on
# Everything still in @_ is handed unchanged to _add_delete() after the '+'
# (add) operation code.
4859 return $self->_add_delete('+', @_);
4863 # Adds entry to a range list which can duplicate an existing entry
# The entry is for a single code point: add_map() is called with that code
# point as both the start and the end of the range.
4866 my $code_point = shift;
# 'Replace' is the only named option consumed here; it defaults to
# $MULTIPLE_BEFORE when not supplied (or supplied as undef).
4869 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4870 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4872 return $self->add_map($code_point, $code_point,
4873 $value, Replace => $replace);
4875 } # End of closure for package Range_Map
4877 package _Base_Table;
4879 # A table is the basic data structure that gets written out into a file for
4880 # use by the Perl core. This is the abstract base class implementing the
4881 # common elements from the derived ones. A list of the methods to be
4882 # furnished by an implementing class is just after the constructor.
# Package-local shorthand: calls main::standardize() on the first argument
# only (any further arguments are ignored) and returns its result.
4884 sub standardize { return main::standardize($_[0]); }
# Package-local shorthand: passes all of its arguments through to
# main::trace() and returns whatever that returns.
4885 sub trace { return main::trace(@_); }
# Per-object field storage and accessor setup for this package.  Each
# main::set_access() call below is given the field's name, a reference to
# the hash that holds that field for every object, and zero or more access
# flags.
# NOTE(review): the flags look like 'r' = reader, 's' = setter, a 'p_'
# prefix = protected variant, and 'readable_array' = reader for an
# array-valued field -- confirm against main::set_access().
4889 main::setup_package();
4892 # Object containing the ranges of the table.
4893 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4896 # The full table name.
4897 main::set_access('full_name', \%full_name, 'r');
4900 # The table name, almost always shorter
4901 main::set_access('name', \%name, 'r');
4904 # The shortest of all the aliases for this table, with underscores removed
4905 main::set_access('short_name', \%short_name);
4907 my %nominal_short_name_length;
4908 # The length of short_name before removing underscores
4909 main::set_access('nominal_short_name_length',
4910 \%nominal_short_name_length);
4913 # The complete name, including property.
4914 main::set_access('complete_name', \%complete_name, 'r');
4917 # Parent property this table is attached to.
4918 main::set_access('property', \%property, 'r');
4921 # Ordered list of alias objects of the table's name. The first ones in
4922 # the list are output first in comments
4923 main::set_access('aliases', \%aliases, 'readable_array');
4926 # A comment associated with the table for human readers of the files
4927 main::set_access('comment', \%comment, 's');
4930 # A comment giving a short description of the table's meaning for human
4931 # readers of the files.
4932 main::set_access('description', \%description, 'readable_array');
4935 # A comment giving a short note about the table for human readers of the
4937 main::set_access('note', \%note, 'readable_array');
4940 # Enum; there are a number of possibilities for what happens to this
4941 # table: it could be normal, or suppressed, or not for external use. See
4942 # values at definition for $SUPPRESSED.
4943 main::set_access('fate', \%fate, 'r');
4945 my %find_table_from_alias;
4946 # The parent property passes this pointer to a hash which this class adds
4947 # all its aliases to, so that the parent can quickly take an alias and
4949 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4952 # After this table is made equivalent to another one; we shouldn't go
4953 # changing the contents because that could mean it's no longer equivalent
4954 main::set_access('locked', \%locked, 'r');
4957 # This gives the final path to the file containing the table. Each
4958 # directory in the path is an element in the array
4959 main::set_access('file_path', \%file_path, 'readable_array');
4962 # What is the table's status, normal, $OBSOLETE, etc. Enum
4963 main::set_access('status', \%status, 'r');
4966 # A comment about its being obsolete, or whatever non normal status it has
4967 main::set_access('status_info', \%status_info, 'r');
4969 my %caseless_equivalent;
4970 # The table this is equivalent to under /i matching, if any.
4971 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4974 # Is the table to be output with each range only a single code point?
4975 # This is done to avoid breaking existing code that may have come to rely
4976 # on this behavior in previous versions of this program.
4977 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4980 # A boolean set iff this table is a Perl extension to the Unicode
4982 main::set_access('perl_extension', \%perl_extension, 'r');
4984 my %output_range_counts;
4985 # A boolean set iff this table is to have comments written in the
4986 # output file that contain the number of code points in the range.
4987 # The constructor can override the global flag of the same name.
4988 main::set_access('output_range_counts', \%output_range_counts, 'r');
4991 # The format of the entries of the table. This is calculated from the
4992 # data in the table (or passed in the constructor). This is an enum e.g.,
4993 # $STRING_FORMAT. It is marked protected as it should not be generally
4994 # used to override calculations.
4995 main::set_access('format', \%format, 'r', 'p_s');
4998 # All arguments are key => value pairs, which you can see below, most
4999 # of which match fields documented above. Otherwise: Re_Pod_Entry,
5000 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5001 # documented in the Alias package
# Constructor.  It consumes the named arguments, fills in defaults, decides
# the table's fate and status from the global %why_... hashes, and registers
# the full name (and the short name, when it differs) as aliases.
5003 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
# The object is a blessed reference to an anonymous scalar; all of its
# fields live in the per-field hashes, keyed by $addr.
5007 my $self = bless \do { my $anonymous_scalar }, $class;
5008 my $addr = do { no overloading; pack 'J', $self; };
# Each recognized key is deleted from %args as it is consumed, so that
# anything left over afterwards can be reported as unexpected.
5012 $name{$addr} = delete $args{'Name'};
5013 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5014 $full_name{$addr} = delete $args{'Full_Name'};
5015 my $complete_name = $complete_name{$addr}
5016 = delete $args{'Complete_Name'};
5017 $format{$addr} = delete $args{'Format'};
5018 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5019 $property{$addr} = delete $args{'_Property'};
5020 $range_list{$addr} = delete $args{'_Range_List'};
# The '||' defaults below apply whenever the supplied value is false, not
# only when it is missing.
5021 $status{$addr} = delete $args{'Status'} || $NORMAL;
5022 $status_info{$addr} = delete $args{'_Status_Info'} || "";
5023 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5024 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5025 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5026 my $ucd = delete $args{'UCD'};
5028 my $description = delete $args{'Description'};
5029 my $ok_as_filename = delete $args{'OK_as_Filename'};
5030 my $loose_match = delete $args{'Fuzzy'};
5031 my $note = delete $args{'Note'};
5032 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5033 my $perl_extension = delete $args{'Perl_Extension'};
5035 # Shouldn't have any left over
5036 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5038 # Can't use || above because conceivably the name could be 0, and
5039 # can't use // operator in case this program gets used in Perl 5.8
# NOTE(review): the '//' operator is used elsewhere in this file, so the
# Perl 5.8 concern above may be out of date -- confirm.
5040 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5041 $output_range_counts{$addr} = $output_range_counts if
5042 ! defined $output_range_counts{$addr};
5044 $aliases{$addr} = [ ];
5045 $comment{$addr} = [ ];
5046 $description{$addr} = [ ];
5048 $file_path{$addr} = [ ];
5049 $locked{$addr} = "";
# A description or note is only recorded if it is a true value.
5051 push @{$description{$addr}}, $description if $description;
5052 push @{$note{$addr}}, $note if $note;
# Settle the table's fate: placeholder, suppressed by the %why_suppressed
# list, or explicitly suppressed without a recorded reason.
5054 if ($fate{$addr} == $PLACEHOLDER) {
5056 # A placeholder table doesn't get documented, is a perl extension,
5057 # and quite likely will be empty
5058 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5059 $perl_extension = 1 if ! defined $perl_extension;
5060 $ucd = 0 if ! defined $ucd;
5061 push @tables_that_may_be_empty, $complete_name{$addr};
5062 $self->add_comment(<<END);
5063 This is a placeholder because it is not in Version $string_version of Unicode,
5064 but is needed by the Perl core to work gracefully. Because it is not in this
5065 version of Unicode, it will not be listed in $pod_file.pod
5068 elsif (exists $why_suppressed{$complete_name}
5069 # Don't suppress if overridden
5070 && ! grep { $_ eq $complete_name{$addr} }
5071 @output_mapped_properties)
5073 $fate{$addr} = $SUPPRESSED;
5075 elsif ($fate{$addr} == $SUPPRESSED
5076 && ! exists $why_suppressed{$property{$addr}->complete_name})
5078 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
5079 # perhaps Fate => [ $SUPPRESSED, "reason" ]
5082 # If hasn't set its status already, see if it is on one of the
5083 # lists of properties or tables that have particular statuses; if
5084 # not, is normal. The lists are prioritized so the most serious
5085 # ones are checked first
# NOTE(review): $status{$addr} was defaulted to $NORMAL above, so this test
# can only succeed if $NORMAL is itself a false value -- confirm.
5086 if (! $status{$addr}) {
5087 if (exists $why_deprecated{$complete_name}) {
5088 $status{$addr} = $DEPRECATED;
5090 elsif (exists $why_stabilized{$complete_name}) {
5091 $status{$addr} = $STABILIZED;
5093 elsif (exists $why_obsolete{$complete_name}) {
5094 $status{$addr} = $OBSOLETE;
5097 # Existence above doesn't necessarily mean there is a message
5098 # associated with it. Use the most serious message.
5099 if ($status{$addr}) {
5100 if ($why_deprecated{$complete_name}) {
5102 = $why_deprecated{$complete_name};
5104 elsif ($why_stabilized{$complete_name}) {
5106 = $why_stabilized{$complete_name};
5108 elsif ($why_obsolete{$complete_name}) {
5110 = $why_obsolete{$complete_name};
5115 $perl_extension{$addr} = $perl_extension || 0;
5117 # Don't list a property by default that is internal only
5118 if ($fate{$addr} > $MAP_PROXIED) {
5119 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5120 $ucd = 0 if ! defined $ucd;
5123 $ucd = 1 if ! defined $ucd;
5126 # By convention what typically gets printed only or first is what's
5127 # first in the list, so put the full name there for good output
5128 # clarity. Other routines rely on the full name being first on the
5130 $self->add_alias($full_name{$addr},
5131 OK_as_Filename => $ok_as_filename,
5132 Fuzzy => $loose_match,
5133 Re_Pod_Entry => $make_re_pod_entry,
5134 Status => $status{$addr},
5138 # Then comes the other name, if meaningfully different.
# "Meaningfully different" means the two names differ even after both have
# been run through standardize().
5139 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5140 $self->add_alias($name{$addr},
5141 OK_as_Filename => $ok_as_filename,
5142 Fuzzy => $loose_match,
5143 Re_Pod_Entry => $make_re_pod_entry,
5144 Status => $status{$addr},
5152 # Here are the methods that are required to be defined by any derived
5155 handle_special_range
5159 # write() knows how to write out normal ranges, but it calls
5160 # handle_special_range() when it encounters a non-normal one.
5161 # append_to_body() is called by it after it has handled all
5162 # ranges to add anything after the main portion of the table.
5163 # And finally, pre_body() is called after all this to build up
5164 # anything that should appear before the main portion of the
5165 # table. Doing it this way allows things in the middle to
5166 # affect what should appear before the main portion of the
5171 Carp::my_carp_bug( __LINE__
5172 . ": Must create method '$sub()' for "
# Operator overloads for this package; each operator is handled by the
# correspondingly named helper in package main.
5180 "." => \&main::_operator_dot,
5181 ".=" => \&main::_operator_dot_equal,
5182 '!=' => \&main::_operator_not_equal,
5183 '==' => \&main::_operator_equal,
5187 # Returns the array of ranges associated with this table.
# The invocant is shifted off @_ inline and used to look up this table's
# range-list object, whose own ranges() method supplies the result.
5190 return $range_list{pack 'J', shift}->ranges;
5194 # Add a synonym for this table.
# Besides the name and pointer, the optional named arguments recognized
# here are Fuzzy, Re_Pod_Entry, OK_as_Filename, Status and UCD.
5196 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5199 my $name = shift; # The name to add.
5200 my $pointer = shift; # What the alias hash should point to. For
5201 # map tables, this is the parent property;
5202 # for match tables, it is the table itself.
# Pull out the named options, applying a default to each one that was not
# supplied (or was supplied as undef).
5205 my $loose_match = delete $args{'Fuzzy'};
5207 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5208 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5210 my $ok_as_filename = delete $args{'OK_as_Filename'};
5211 $ok_as_filename = 1 unless defined $ok_as_filename;
5213 my $status = delete $args{'Status'};
5214 $status = $NORMAL unless defined $status;
5216 # An internal name does not get documented, unless overridden by the
5218 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5220 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5222 # Capitalize the first letter of the alias unless it is one of the CJK
5223 # ones which specifically begins with a lower 'k'. Do this because
5224 # Unicode has varied whether they capitalize first letters or not, and
5225 # have later changed their minds and capitalized them, but not the
5226 # other way around. So do it always and avoid changes from release to
5228 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5230 my $addr = do { no overloading; pack 'J', $self; };
5232 # Figure out if should be loosely matched if not already specified.
5233 if (! defined $loose_match) {
5235 # Is a loose_match if isn't null, and doesn't begin with an
5236 # underscore and isn't just a number
# NOTE(review): inside the bracketed class below, '+-/' is a character
# range (from '+' through '/'), so besides '+', '-', '.' and '/' it also
# matches ','.  Confirm whether that is intended.
5238 && substr($name, 0, 1) ne '_'
5239 && $name !~ qr{^[0-9_.+-/]+$})
5248 # If this alias has already been defined, do nothing.
5249 return if defined $find_table_from_alias{$addr}->{$name};
5251 # That includes if it is standardly equivalent to an existing alias,
5252 # in which case, add this name to the list, so won't have to search
5254 my $standard_name = main::standardize($name);
5255 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5256 $find_table_from_alias{$addr}->{$name}
5257 = $find_table_from_alias{$addr}->{$standard_name};
5261 # Set the index hash for this alias for future quick reference.
# Both the name as given and its standardized form are indexed.
5262 $find_table_from_alias{$addr}->{$name} = $pointer;
5263 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5264 local $to_trace = 0 if main::DEBUG;
5265 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5266 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5269 # Put the new alias at the end of the list of aliases unless the final
5270 # element begins with an underscore (meaning it is for internal perl
5271 # use) or is all numeric, in which case, put the new one before that
5272 # one. This floats any all-numeric or underscore-beginning aliases to
5273 # the end. This is done so that they are listed last in output lists,
5274 # to encourage the user to use a better name (either more descriptive
5275 # or not an internal-only one) instead. This ordering is relied on
5276 # implicitly elsewhere in this program, like in short_name()
5277 my $list = $aliases{$addr};
5278 my $insert_position = (@$list == 0
5279 || (substr($list->[-1]->name, 0, 1) ne '_'
5280 && $list->[-1]->name =~ /\D/))
5286 Alias->new($name, $loose_match, $make_re_pod_entry,
5287 $ok_as_filename, $status, $ucd);
5289 # This name may be shorter than any existing ones, so clear the cache
5290 # of the shortest, so will have to be recalculated.
5292 undef $short_name{pack 'J', $self};
5297 # Returns a name suitable for use as the base part of a file name.
5298 # That is, shorter wins. It can return undef if there is no suitable
5299 # name. The name has all non-essential underscores removed.
5301 # The optional second parameter is a reference to a scalar in which
5302 # this routine will store the length the returned name had before the
5303 # underscores were removed, or undef if the return is undef.
5305 # The shortest name can change if new aliases are added. So using
5306 # this should be deferred until after all these are added. The code
5307 # that does that should clear this one's cache.
5308 # Any name with alphabetics is preferred over an all numeric one, even
5312 my $nominal_length_ptr = shift;
5313 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5315 my $addr = do { no overloading; pack 'J', $self; };
5317 # For efficiency, don't recalculate, but this means that adding new
5318 # aliases could change what the shortest is, so the code that does
5319 # that needs to undef this.
5320 if (defined $short_name{$addr}) {
5321 if ($nominal_length_ptr) {
5322 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5324 return $short_name{$addr};
5327 # Look at each alias
5328 foreach my $alias ($self->aliases()) {
5330 # Don't use an alias that isn't ok to use for an external name.
5331 next if ! $alias->ok_as_filename;
5333 my $name = main::Standardize($alias->name);
5334 trace $self, $name if main::DEBUG && $to_trace;
5336 # Take the first one, or a shorter one that isn't numeric. This
5337 # relies on numeric aliases always being last in the array
5338 # returned by aliases(). Any alpha one will have precedence.
5339 if (! defined $short_name{$addr}
5341 && length($name) < length($short_name{$addr})))
5343 # Remove interior underscores.
# The lookbehind and lookahead each require a neighboring character, so
# an underscore at the very start or very end of the name is kept.
5344 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
# The "nominal" length is measured on $name, i.e. before the underscores
# were removed.
5346 $nominal_short_name_length{$addr} = length $name;
5350 # If the short name isn't a nice one, perhaps an equivalent table has
5352 if (! defined $short_name{$addr}
5353 || $short_name{$addr} eq ""
5354 || $short_name{$addr} eq "_")
5357 foreach my $follower ($self->children) { # All equivalents
5358 my $follower_name = $follower->short_name;
5359 next unless defined $follower_name;
5361 # Anything (except undefined) is better than underscore or
5363 if (! defined $return || $return eq "_") {
5364 $return = $follower_name;
5368 # If the new follower name isn't "_" and is shorter than the
5369 # current best one, prefer the new one.
# (A name of equal length also replaces the current best, since only
# strictly longer names are skipped.)
5370 next if $follower_name eq "_";
5371 next if length $follower_name > length $return;
5372 $return = $follower_name;
5374 $short_name{$addr} = $return if defined $return;
5377 # If no suitable external name return undef
5378 if (! defined $short_name{$addr}) {
5379 $$nominal_length_ptr = undef if $nominal_length_ptr;
5383 # Don't allow a null short name.
5384 if ($short_name{$addr} eq "") {
5385 $short_name{$addr} = '_';
5386 $nominal_short_name_length{$addr} = 1;
5389 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5391 if ($nominal_length_ptr) {
5392 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5394 return $short_name{$addr};
5398 # Returns the external name that this table should be known by. This
5399 # is usually the short_name, but not if the short_name is undefined,
5400 # in which case the external_name is arbitrarily set to the
5404 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5406 my $short = $self->short_name;
5407 return $short if defined $short;
5412 sub add_description { # Appends the parameter to this table's list of short descriptions.
5415 my $description = shift;    # The description text to append
5417 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5420 push @{$description{pack 'J', $self}}, $description;    # List is keyed by the object's packed address
5425 sub add_note { # Appends the parameter to this table's list of short notes.
5430 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5433 push @{$note{pack 'J', $self}}, $note;    # List is keyed by the object's packed address
5438 sub add_comment { # Appends the parameter to this table's list of comments.
5440 return unless $debugging_build;    # Comments are accumulated only when $debugging_build is true
5443 my $comment = shift;    # The comment text to append
5444 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5449 push @{$comment{pack 'J', $self}}, $comment;    # List is keyed by the object's packed address
5455 # Return the current comment for this table. If called in list
5456 # context, returns the array of comments. In scalar, returns a string
5457 # of each element joined together with a period ending each.
5460 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5462 my $addr = do { no overloading; pack 'J', $self; };
5463 my @list = @{$comment{$addr}};
5464 return @list if wantarray;
5466 foreach my $sentence (@list) {
5467 $return .= '. ' if $return;
5468 $return .= $sentence;
5471 $return .= '.' if $return;
5476 # Initialize the table with the argument which is any valid
5477 # initialization for range lists.
5480 my $addr = do { no overloading; pack 'J', $self; };
5481 my $initialization = shift;
5482 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5484 # Replace the current range list with a new one of the same exact
5486 my $class = ref $range_list{$addr};
5487 $range_list{$addr} = $class->new(Owner => $self,
5488 Initialize => $initialization);
5494 # The header that is output for the table in the file it is written
5498 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5501 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5507 # Write a representation of the table to its file. It calls several
5508 # functions furnished by sub-classes of this abstract base class to
5509 # handle non-normal ranges, to add stuff before the table, and at its
5510 # end. If the table is to be written so that adjustments are
5511 # required, this does that conversion.
5514 my $use_adjustments = shift; # ? output in adjusted format or not
5515 my $tab_stops = shift; # The number of tab stops over to put any
5517 my $suppress_value = shift; # Optional, if the value associated with
5518 # a range equals this one, don't write
5520 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5522 my $addr = do { no overloading; pack 'J', $self; };
5524 # Start with the header
5525 my @HEADER = $self->header;
5528 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5531 # Things discovered processing the main body of the document may
5532 # affect what gets output before it, therefore pre_body() isn't called
5533 # until after all other processing of the table is done.
5535 # The main body looks like a 'here' document. If there are comments,
5536 # get rid of them when processing it.
5538 if ($annotate || $output_range_counts) {
5539 # Use the line below in Perls that don't have /r
5540 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5541 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5543 push @OUT, "return <<'END';\n";
5546 if ($range_list{$addr}->is_empty) {
5548 # This is a kludge for empty tables to silence a warning in
5549 # utf8.c, which can't really deal with empty tables, but it can
5550 # deal with a table that matches nothing, as the inverse of 'Any'
5552 push @OUT, "!utf8::Any\n";
5554 elsif ($self->name eq 'N'
5556 # To save disk space and table cache space, avoid putting out
5557 # binary N tables, but instead create a file which just inverts
5558 # the Y table. Since the file will still exist and occupy a
5559 # certain number of blocks, might as well output the whole
5560 # thing if it all will fit in one block. The number of
5561 # ranges below is an approximate number for that.
5562 && ($self->property->type == $BINARY
5563 || $self->property->type == $FORCED_BINARY)
5564 # && $self->property->tables == 2 Can't do this because the
5565 # non-binary properties, like NFDQC aren't specifiable
5567 && $range_list{$addr}->ranges > 15
5568 && ! $annotate) # Under --annotate, want to see everything
5570 push @OUT, "!utf8::" . $self->property->name . "\n";
5573 my $range_size_1 = $range_size_1{$addr};
5574 my $format; # Used only in $annotate option
5575 my $include_name; # Used only in $annotate option
5579 # If annotating each code point, must print 1 per line.
5580 # The variable could point to a subroutine, and we don't want
5581 # to lose that fact, so only set if not set already
5582 $range_size_1 = 1 if ! $range_size_1;
5584 $format = $self->format;
5586 # The name of the character is output only for tables that
5587 # don't already include the name in the output.
5588 my $property = $self->property;
5590 ! ($property == $perl_charname
5591 || $property == main::property_ref('Unicode_1_Name')
5592 || $property == main::property_ref('Name')
5593 || $property == main::property_ref('Name_Alias')
5597 # Values for previous time through the loop. Initialize to
5598 # something that won't be adjacent to the first iteration;
5599 # only $previous_end matters for that.
5601 my $previous_end = -2;
5604 # Values for next time through the portion of the loop that splits
5605 # the range. 0 in $next_start means there is no remaining portion
5612 my $output_value_in_hex = $self->isa('Map_Table')
5613 && $self->format eq $HEX_ADJUST_FORMAT;
5614 # Use leading zeroes just for files whose format should not be
5615 # changed from what it has been. Otherwise, they just take up
5616 # space and time to process.
5617 my $hex_format = ($self->isa('Map_Table')
5618 && $self->to_output_map == $EXTERNAL_MAP)
5622 # Output each range as part of the here document.
5624 for my $set ($range_list{$addr}->ranges) {
5625 if ($set->type != 0) {
5626 $self->handle_special_range($set);
5629 my $start = $set->start;
5630 my $end = $set->end;
5631 my $value = $set->value;
5633 # Don't output ranges whose value is the one to suppress
5634 next RANGE if defined $suppress_value
5635 && $value eq $suppress_value;
5637 { # This bare block encloses the scope where we may need to
5638 # 'redo' to. Consider the table that contains the
5639 # lowercasing maps. mktables stores the ASCII range ones
5641 # ord('A') => ord('a'), .. ord('Z') => ord('z')
5642 # For compactness, the table that gets written has this as
5644 # ( ord('A') .. ord('Z') ) => ord('a')
5645 # and the software that reads the tables is smart enough
5646 # to "connect the dots". This change is accomplished in
5647 # this loop by looking to see if the current iteration
5648 # fits the paradigm of the previous iteration, and if so,
5649 # we merge them by replacing the final output item with
5650 # the merged data. Repeated 25 times, this gets A-Z. But
5651 # we also have to make sure we don't screw up cases where
5652 # we have internally stored
5653 # ( 0x1C4 .. 0x1C6 ) => 0x1C5
5654 # This single internal range has to be output as 3 ranges.
5655 # (There are very few of these, so the gain of doing the
5656 # combining of other ranges far outweighs the splitting of
5657 # these.) To accomplish this, we have to split the range,
5658 # and each time through we handle the next portion of the
5659 # original by ending this block with a 'redo'. The
5660 # values to use for that next time through are set up just
5661 # below in the scalars whose names begin with '$next_'.
5663 if ($use_adjustments && ! $range_size_1) {
5665 # When converting to use adjustments, we can handle
5666 # only single element ranges. Set up so that this
5667 # time through the loop, we look at the first element,
5668 # and the next time through, we start off with the
5669 # remainder. Thus each time through we look at the
5670 # first element of the range
5671 if ($end != $start) {
5672 $next_start = $start + 1;
5674 $next_value = $value;
5678 # The values for some of these tables are stored as
5679 # hex strings. Convert those to decimal
5680 $value = hex($value)
5681 if $self->default_map eq $CODE_POINT
5682 && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5684 # If this range is adjacent to the previous one, and
5685 # the values in each are integers that are also
5686 # adjacent (differ by 1), then this range really
5687 # extends the previous one that is already in element
5688 # $OUT[-1]. So we pop that element, and pretend that
5689 # the range starts with whatever it started with.
5690 # $offset is incremented by 1 each time so that it
5691 # gives the current offset from the first element in
5692 # the accumulating range, and we keep in $value the
5693 # value of that first element.
5694 if ($start == $previous_end + 1
5695 && $value =~ /^ -? \d+ $/xa
5696 && $previous_value =~ /^ -? \d+ $/xa
5697 && ($value == ($previous_value + ++$offset)))
5700 $start = $previous_start;
5701 $value = $previous_value;
5707 # Save the current values for the next time through
5709 $previous_start = $start;
5710 $previous_end = $end;
5711 $previous_value = $value;
5714 # If there is a range and doesn't need a single point range
5716 if ($start != $end && ! $range_size_1) {
5717 push @OUT, sprintf "$hex_format\t$hex_format",
5720 if ($output_value_in_hex) {
5721 $OUT[-1] .= sprintf "\t$hex_format", $value;
5724 $OUT[-1] .= "\t$value";
5728 # Add a comment with the size of the range, if
5729 # requested. Expand Tabs to make sure they all start
5730 # in the same column, and then unexpand to use mostly
5732 if (! $output_range_counts{$addr}) {
5736 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5737 my $count = main::clarify_number($end - $start + 1);
5740 my $width = $tab_stops * 8 - 1;
5741 $OUT[-1] = sprintf("%-*s # [%s]\n",
5745 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5749 # Here to output a single code point per line.
5750 # If not to annotate, use the simple formats
5751 elsif (! $annotate) {
5753 # Use any passed in subroutine to output.
5754 if (ref $range_size_1 eq 'CODE') {
5755 for my $i ($start .. $end) {
5756 push @OUT, &{$range_size_1}($i, $value);
5761 # Here, caller is ok with default output.
5762 for (my $i = $start; $i <= $end; $i++) {
5763 if ($output_value_in_hex) {
5765 sprintf "$hex_format\t\t$hex_format\n",
5769 push @OUT, sprintf $hex_format, $i;
5770 $OUT[-1] .= "\t\t$value" if $value ne "";
5778 # Here, wants annotation.
5779 for (my $i = $start; $i <= $end; $i++) {
5781 # Get character information if don't have it already
5782 main::populate_char_info($i)
5783 if ! defined $viacode[$i];
5784 my $type = $annotate_char_type[$i];
5786 # Figure out if should output the next code points
5787 # as part of a range or not. If this is not in an
5788 # annotation range, then won't output as a range,
5789 # so returns $i. Otherwise use the end of the
5790 # annotation range, but no further than the
5791 # maximum possible end point of the loop.
5792 my $range_end = main::min(
5793 $annotate_ranges->value_of($i) || $i,
5796 # Use a range if it is a range, and either is one
5797 # of the special annotation ranges, or the range
5798 # is at most 3 long. This last case causes the
5799 # algorithmically named code points to be output
5800 # individually in spans of at most 3, as they are
5801 # the ones whose $type is > 0.
5802 if ($range_end != $i
5803 && ( $type < 0 || $range_end - $i > 2))
5805 # Here is to output a range. We don't allow a
5806 # caller-specified output format--just use the
5809 "$hex_format\t$hex_format\t%s\t#",
5810 $i, $range_end, $value;
5811 my $range_name = $viacode[$i];
5813 # For the code points which end in their hex
5814 # value, we eliminate that from the output
5815 # annotation, and capitalize only the first
5816 # letter of each word.
5817 if ($type == $CP_IN_NAME) {
5818 my $hex = sprintf $hex_format, $i;
5819 $range_name =~ s/-$hex$//;
5820 my @words = split " ", $range_name;
5821 for my $word (@words) {
5823 ucfirst(lc($word)) if $word ne 'CJK';
5825 $range_name = join " ", @words;
5827 elsif ($type == $HANGUL_SYLLABLE) {
5828 $range_name = "Hangul Syllable";
5831 $OUT[-1] .= " $range_name" if $range_name;
5833 # Include the number of code points in the
5836 main::clarify_number($range_end - $i + 1);
5837 $OUT[-1] .= " [$count]\n";
5839 # Skip to the end of the range
5842 else { # Not in a range.
5845 # When outputting the names of each character,
5846 # use the character itself if printable
5847 $comment .= "'" . chr($i) . "' "
5850 # To make it more readable, use a minimum
5854 my $output_value = $value;
5856 # Determine the annotation
5857 if ($format eq $DECOMP_STRING_FORMAT) {
5859 # This is very specialized, with the type
5860 # of decomposition beginning the line
5861 # enclosed in <...>, and the code points
5862 # that the code point decomposes to
5863 # separated by blanks. Create two
5864 # strings, one of the printable
5865 # characters, and one of their official
5867 (my $map = $output_value)
5868 =~ s/ \ * < .*? > \ +//x;
5872 foreach my $to (split " ", $map) {
5873 $to = CORE::hex $to;
5874 $to_name .= " + " if $to_name;
5875 $to_chr .= chr($to);
5876 main::populate_char_info($to)
5877 if ! defined $viacode[$to];
5878 $to_name .= $viacode[$to];
5882 "=> '$to_chr'; $viacode[$i] => $to_name";
5883 $comment_indent = 25; # Determined by
5887 $output_value = CORE::hex $value
5888 if $format eq $HEX_FORMAT
5889 || $format eq $HEX_ADJUST_FORMAT;
5890 $output_value += $offset
5892 # Don't try to adjust a
5894 && $output_value !~ /[-\D]/;
5896 # Assume that any table that has hex
5897 # format is a mapping of one code point to
5899 if ($format eq $HEX_FORMAT
5900 || $format eq $HEX_ADJUST_FORMAT)
5902 main::populate_char_info($output_value)
5903 if ! defined $viacode[$output_value];
5905 . chr($output_value)
5906 . "'; " if $printable[$output_value];
5908 $comment .= $viacode[$i] if $include_name
5910 if ($format eq $HEX_FORMAT
5911 || $format eq $HEX_ADJUST_FORMAT)
5914 " => $viacode[$output_value]"
5915 if $viacode[$output_value];
5918 $output_value = sprintf($hex_format,
5920 if $format eq $HEX_ADJUST_FORMAT
5921 || ($format eq $HEX_FORMAT
5922 && $self->replacement_property);
5924 # If including the name, no need to
5925 # indent, as the name will already be way
5927 $comment_indent = ($include_name) ? 0 : 60;
5930 # Use any passed in routine to output the base
5932 if (ref $range_size_1 eq 'CODE') {
5933 my $base_part=&{$range_size_1}
5934 ($i, $output_value);
5936 push @OUT, $base_part;
5939 push @OUT, sprintf "$hex_format\t\t%s",
5943 # And add the annotation.
5944 $OUT[-1] = sprintf "%-*s\t# %s",
5954 # If we split the range, set up so the next time through
5955 # we get the remainder, and redo.
5957 $start = $next_start;
5959 $value = $next_value;
5964 } # End of loop through all the table's ranges
5967 # Add anything that goes after the main body, but within the here
5969 my $append_to_body = $self->append_to_body;
5970 push @OUT, $append_to_body if $append_to_body;
5972 # And finish the here document.
5975 # Done with the main portion of the body. Can now figure out what
5976 # should appear before it in the file.
5977 my $pre_body = $self->pre_body;
5978 push @HEADER, $pre_body, "\n" if $pre_body;
5980 # All these files should have a .pl suffix added to them.
5981 my @file_with_pl = @{$file_path{$addr}};
5982 $file_with_pl[-1] .= '.pl';
5984 main::write(\@file_with_pl,
5985 $annotate, # utf8 iff annotating
5991 sub set_status { # Record the table's status and any message that goes with it
5993 my $status = shift; # The status enum value
5994 my $info = shift; # Any message associated with it.
5995 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5997 my $addr = do { no overloading; pack 'J', $self; };    # Key into the per-object hashes
5999 $status{$addr} = $status;
6000 $status_info{$addr} = $info;    # Stored as given, so may be undef if no message was passed
6004 sub set_fate { # Set the fate of a table, adjusting how its aliases get documented
6008 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6010 my $addr = do { no overloading; pack 'J', $self; };    # Key into the per-object hashes
6012 return if $fate{$addr} == $fate; # If no-op
6014 # A fate that is already non-ordinary is left alone, unless the new fate is $MAP_PROXIED
6015 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6017 $fate{$addr} = $fate;
6019 # Don't document anything to do with a non-normal fated table ($MAP_PROXIED is the one fate that passes a true flag)
6020 if ($fate != $ORDINARY) {
6021 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6022 foreach my $alias ($self->aliases) {
6023 $alias->set_ucd($put_in_pod);
6025 # MAP_PROXIED doesn't affect the match tables, so leave make_re_pod_entry as it is
6026 next if $fate == $MAP_PROXIED;
6027 $alias->set_make_re_pod_entry($put_in_pod);
6031 # Remember why the table was suppressed, keyed by its complete name, for later output
6032 if ($fate == $SUPPRESSED && defined $reason) {
6033 $why_suppressed{$complete_name{$addr}} = $reason;
6040 # Don't allow changes to the table from now on. This stores a stack
6041 # trace of where it was called, so that later attempts to modify it
6042 # can immediately show where it got locked.
6045 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6047 my $addr = do { no overloading; pack 'J', $self; };
6049 $locked{$addr} = "";
6051 my $line = (caller(0))[2];
6054 # Accumulate the stack trace
6056 my ($pkg, $file, $caller_line, $caller) = caller $i++;
6058 last unless defined $caller;
6060 $locked{$addr} .= " called from $caller() at line $line\n";
6061 $line = $caller_line;
6063 $locked{$addr} .= " called from main at line $line\n";
6068 sub carp_if_locked {
6069 # Return whether a table is locked or not, and, by the way, complain
6073 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6075 my $addr = do { no overloading; pack 'J', $self; };
6077 return 0 if ! $locked{$addr};
6078 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6082 sub set_file_path { # Set the final directory path for this table
6084 # Rest of parameters passed on
6087 @{$file_path{pack 'J', $self}} = @_;
6091 # Accessors for the range list stored in this table. First for
6100 matches_identically_to
6113 return $self->_range_list->$sub(@_);
6117 # Then for ones that should fail if locked
6127 return if $self->carp_if_locked;
6129 return $self->_range_list->$sub(@_);
6136 use parent '-norequire', '_Base_Table';
6138 # A Map Table is a table that contains the mappings from code points to
6139 # values. There are two weird cases:
6140 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6141 # are written in the table's file at the end of the table nonetheless. It
6142 # requires specially constructed code to handle these; utf8.c can not read
6143 # these in, so they should not go in $map_directory. As of this writing,
6144 # the only case that these happen is for named sequences used in
6145 # charnames.pm. But this code doesn't enforce any syntax on these, so
6146 # something else could come along that uses it.
6147 # 2) Specials are anything that doesn't fit syntactically into the body of the
6148 # table. The ranges for these have a map type of non-zero. The code below
6149 # knows about and handles each possible type. In most cases, these are
6150 # written as part of the header.
6152 # A map table deliberately can't be manipulated at will unlike match tables.
6153 # This is because of the ambiguities having to do with what to do with
6154 # overlapping code points. And there just isn't a need for those things;
6155 # what one wants to do is just query, add, replace, or delete mappings, plus
6156 # write the final result.
6157 # However, there is a method to get the list of possible ranges that aren't in
6158 # this table to use for defaulting missing code point mappings. And,
6159 # map_add_or_replace_non_nulls() does allow one to add another table to this
6160 # one, but it is clearly very specialized, and defined that the other's
6161 # non-null values replace this one's if there is any overlap.
6163 sub trace { return main::trace(@_); }    # Package-local wrapper: passes its arguments straight to main::trace()
6167 main::setup_package();
6170 # Many input files omit some entries; this gives what the mapping for the
6171 # missing entries should be
6172 main::set_access('default_map', \%default_map, 'r');
6174 my %anomalous_entries;
6175 # Things that go in the body of the table which don't fit the normal
6176 # scheme of things, like having a range. Not much can be done with these
6177 # once there except to output them. This was created to handle named
6179 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6180 main::set_access('anomalous_entries', # Append singular, read plural
6181 \%anomalous_entries,
6184 my %replacement_property;
6185 # Certain files are unused by Perl itself, and are kept only for backwards
6186 # compatibility for programs that used them before Unicode::UCD existed.
6187 # These are termed legacy properties. At some point they may be removed,
6188 # but for now mark them as legacy. If non empty, this is the name of the
6189 # property to use instead (i.e., the modern equivalent).
6190 main::set_access('replacement_property', \%replacement_property, 'r');
6193 # Enum as to whether or not to write out this map table, and how:
6195 # $EXTERNAL_MAP means its existence is noted in the documentation, and
6196 # it should not be removed nor its format changed. This
6197 # is done for those files that have traditionally been
6198 # output. Maps of legacy-only properties default to
6200 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
6202 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6203 # outputting the actual mappings as-is, we adjust things
6204 # to create a much more compact table. Only those few
6205 # tables where the mapping is convertible at least to an
6206 # integer and compacting makes a big difference should
6207 # have this. Hence, the default is to not do this
6208 # unless the table's default mapping is to $CODE_POINT,
6209 # and the range size is not 1.
6210 main::set_access('to_output_map', \%to_output_map, 's');
6218 # Optional initialization data for the table.
6219 my $initialize = delete $args{'Initialize'};
6221 my $default_map = delete $args{'Default_Map'};
6222 my $property = delete $args{'_Property'};
6223 my $full_name = delete $args{'Full_Name'};
6224 my $replacement_property = delete $args{'Replacement_Property'} // "";
6225 my $to_output_map = delete $args{'To_Output_Map'};
6227 # Rest of parameters passed on; legacy properties have several common
6229 if ($replacement_property) {
6230 $args{"Fate"} = $LEGACY_ONLY;
6231 $args{"Range_Size_1"} = 1;
6232 $args{"Perl_Extension"} = 1;
6236 my $range_list = Range_Map->new(Owner => $property);
6238 my $self = $class->SUPER::new(
6240 Complete_Name => $full_name,
6241 Full_Name => $full_name,
6242 _Property => $property,
6243 _Range_List => $range_list,
6246 my $addr = do { no overloading; pack 'J', $self; };
6248 $anomalous_entries{$addr} = [];
6249 $default_map{$addr} = $default_map;
6250 $replacement_property{$addr} = $replacement_property;
6251 $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6252 && $replacement_property;
6253 $to_output_map{$addr} = $to_output_map;
6255 $self->initialize($initialize) if defined $initialize;
6262 qw("") => "_operator_stringify",
6265 sub _operator_stringify {    # Returns the string describing this map table, built from its property's full name
6268 my $name = $self->property->full_name;
6269 $name = '""' if $name eq "";    # Make an empty name visible as a pair of double quotes
6270 return "Map table for Property '$name'";
6274 # Add a synonym for this table (which means the property itself)
6277 # Rest of parameters passed on.
6279 $self->SUPER::add_alias($name, $self->property, @_);
6284 # Add a range of code points to the list of specially-handled code
6285 # points. $MULTI_CP is assumed if the type of special is not passed
6294 my $type = delete $args{'Type'} || 0;
6295 # Rest of parameters passed on
6297 # Can't change the table if locked.
6298 return if $self->carp_if_locked;
6300 my $addr = do { no overloading; pack 'J', $self; };
6302 $self->_range_list->add_map($lower, $upper,
6309 sub append_to_body {
6310 # Returns the text to add to the written HERE document of the table's body:
6311 # the table's anomalous entries, one per line, or "" if there are none.
6314 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6316 my $addr = do { no overloading; pack 'J', $self; };    # Key into the per-object hashes
6318 return "" unless @{$anomalous_entries{$addr}};    # Nothing to append
6319 return join("\n", @{$anomalous_entries{$addr}}) . "\n";    # Each entry, including the last, ends with a newline
6322 sub map_add_or_replace_non_nulls {
6323 # This adds the mappings in the table $other to $self. Non-null
6324 # mappings from $other override those in $self. It essentially merges
6325 # the two tables, with the second having priority except for null
6330 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6332 return if $self->carp_if_locked;    # A locked table can't be changed
6334 if (! $other->isa(__PACKAGE__)) {    # $other must be an object of this same class (or a subclass)
6335 Carp::my_carp_bug("$other should be a "
6343 my $addr = do { no overloading; pack 'J', $self; };
6344 my $other_addr = do { no overloading; pack 'J', $other; };    # NOTE(review): neither $addr nor $other_addr appears to be used below -- confirm
6346 local $to_trace = 0 if main::DEBUG;    # Tracing is switched off for the dynamic scope of this sub
6348 my $self_range_list = $self->_range_list;
6349 my $other_range_list = $other->_range_list;
6350 foreach my $range ($other_range_list->ranges) {
6351 my $value = $range->value;
6352 next if $value eq "";    # Null mappings in $other never override anything in $self
6353 $self_range_list->_add_delete('+',
6357 Type => $range->type,
6358 Replace => $UNCONDITIONALLY);    # Whatever $self already has for these code points is replaced
6364 sub set_default_map {
6365 # Define what code points that are missing from the input files should
6370 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6372 my $addr = do { no overloading; pack 'J', $self; };    # Key into the per-object hashes
6374 # Convert the input to the standard equivalent, if any (won't have any
6375 # for $STRING properties)
6376 my $standard = $self->_find_table_from_alias->{$map};
6377 $map = $standard->name if defined $standard;
6379 # Warn if there already is a non-equivalent default map for this
6380 # property. Note that a default map can be a ref, which means that
6381 # what it actually means is delayed until later in the program, and it
6382 # IS permissible to override it here without a message.
6383 my $default_map = $default_map{$addr};
6384 if (defined $default_map    # There is an existing default,
6385 && ! ref($default_map)    # which isn't a deferred (ref) one,
6386 && $default_map ne $map    # and which differs from the new one as given
6387 && main::Standardize($map) ne $default_map)    # and from the new one after standardizing
6389 my $property = $self->property;
6390 my $map_table = $property->table($map);
6391 my $default_table = $property->table($default_map);
6392 if (defined $map_table    # Complain only if both names resolve to tables
6393 && defined $default_table
6394 && $map_table != $default_table)    # and those tables are different ones
6396 Carp::my_carp("Changing the default mapping for "
6398 . " from $default_map to $map'");    # NOTE(review): the "'" after $map has no opening partner; looks like a typo in the message
6402 $default_map{$addr} = $map;    # Record the (possibly standardized) new default
6404 # Don't also create any missing table for this map at this point,
6405 # because if we did, it could get done before the main table add is
6406 # done for PropValueAliases.txt; instead the caller will have to make
6407 # sure it exists, if desired.
6412 # Returns 0 if this map table shouldn't be written; else an enum value saying how to write it.
6415 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6417 my $addr = do { no overloading; pack 'J', $self; };
6419 # If overridden, use that
6420 return $to_output_map{$addr} if defined $to_output_map{$addr};
6422 my $full_name = $self->full_name;
6423 return $global_to_output_map{$full_name}
6424 if defined $global_to_output_map{$full_name};
6426 # If table says to output, do so; if says to suppress it, do so.
6427 my $fate = $self->fate;
6428 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6429 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6430 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6432 my $type = $self->property->type;
6434 # Don't want to output binary map tables even for debugging.
6435 return 0 if $type == $BINARY;
6437 # But do want to output string ones. All the ones that remain to
6438 # be dealt with (i.e. which haven't explicitly been set to external)
6439 # are for internal Perl use only. The default for those that map to
6440 # $CODE_POINT and haven't been restricted to a single element range
6441 # is to use the adjusted form.
6442 if ($type == $STRING) {
6443 return $INTERNAL_MAP if $self->range_size_1
6444 || $default_map{$addr} ne $CODE_POINT;
6445 return $OUTPUT_ADJUSTED;
6448 # Otherwise is an $ENUM, do output it, for Perl's purposes
6449 return $INTERNAL_MAP;
6453 # Returns a Range_List that is gaps of the current table. That is,
6457 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6459 my $current = Range_List->new(Initialize => $self->_range_list,
6460 Owner => $self->property);
6466 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6468 my $return = $self->SUPER::header();
6470 if ($self->to_output_map >= $INTERNAL_MAP) {
6471 $return .= $INTERNAL_ONLY_HEADER;
6474 my $property_name = $self->property->replacement_property;
6476 # The legacy-only properties were gotten above; but there are some
6477 # other properties whose files are in current use that have fixed
6479 $property_name = $self->property->full_name unless $property_name;
6483 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
6485 # This file is for internal use by core Perl only. It is retained for
6486 # backwards compatibility with applications that may have come to rely on it,
6487 # but its format and even its name or existence are subject to change without
6488 # notice in a future Perl version. Don't use it directly. Instead, its
6489 # contents are now retrievable through a stable API in the Unicode::UCD
6490 # module: Unicode::UCD::prop_invmap('$property_name').
6496 sub set_final_comment {
6497 # Just before output, create the comment that heads the file
6498 # containing this table.
6500 return unless $debugging_build;
6503 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6505 # No sense generating a comment if aren't going to write it out.
6506 return if ! $self->to_output_map;
6508 my $addr = do { no overloading; pack 'J', $self; };
6510 my $property = $self->property;
6512 # Get all the possible names for this property. Don't use any that
6513 # aren't ok for use in a file name, etc. This is perhaps causing that
6514 # flag to do double duty, and may have to be changed in the future to
6515 # have our own flag for just this purpose; but it works now to exclude
6516 # Perl generated synonyms from the lists for properties, where the
6517 # name is always the proper Unicode one.
6518 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6520 my $count = $self->count;
6521 my $default_map = $default_map{$addr};
6523 # The ranges that map to the default aren't output, so subtract that
6524 # to get those actually output. A property with matching tables
6525 # already has the information calculated.
6526 if ($property->type != $STRING) {
6527 $count -= $property->table($default_map)->count;
6529 elsif (defined $default_map) {
6531 # But for $STRING properties, must calculate now. Subtract the
6532 # count from each range that maps to the default.
6533 foreach my $range ($self->_range_list->ranges) {
6534 if ($range->value eq $default_map) {
6535 $count -= $range->end +1 - $range->start;
6541 # Get a string version of $count with underscores in large numbers,
6543 my $string_count = main::clarify_number($count);
6545 my $code_points = ($count == 1)
6546 ? 'single code point'
6547 : "$string_count code points";
6552 if (@property_aliases <= 1) {
6553 $mapping = 'mapping';
6554 $these_mappings = 'this mapping';
6558 $mapping = 'synonymous mappings';
6559 $these_mappings = 'these mappings';
6563 if ($count >= $MAX_UNICODE_CODEPOINTS) {
6564 $cp = "any code point in Unicode Version $string_version";
6568 if ($default_map eq "") {
6569 $map_to = 'the null string';
6571 elsif ($default_map eq $CODE_POINT) {
6575 $map_to = "'$default_map'";
6578 $cp = "the single code point";
6581 $cp = "one of the $code_points";
6583 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6588 my $status = $self->status;
6589 if ($status ne $NORMAL) {
6590 my $warn = uc $status_past_participles{$status};
6593 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6594 All property or property=value combinations contained in this file are $warn.
6595 See $unicode_reference_url for what this means.
6599 $comment .= "This file returns the $mapping:\n";
6601 my $ucd_accessible_name = "";
6602 my $full_name = $self->property->full_name;
6603 for my $i (0 .. @property_aliases - 1) {
6604 my $name = $property_aliases[$i]->name;
6605 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6606 if ($property_aliases[$i]->ucd) {
6607 if ($name eq $full_name) {
6608 $ucd_accessible_name = $full_name;
6610 elsif (! $ucd_accessible_name) {
6611 $ucd_accessible_name = $name;
6615 $comment .= "\nwhere 'cp' is $cp.";
6616 if ($ucd_accessible_name) {
6617 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6620 # And append any commentary already set from the actual property.
6621 $comment .= "\n\n" . $self->comment if $self->comment;
6622 if ($self->description) {
6623 $comment .= "\n\n" . join " ", $self->description;
6626 $comment .= "\n\n" . join " ", $self->note;
6630 if (! $self->perl_extension) {
6633 For information about what this property really means, see:
6634 $unicode_reference_url
6638 if ($count) { # Format differs for empty table
6639 $comment.= "\nThe format of the ";
6640 if ($self->range_size_1) {
6642 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6643 is in hex; MAPPING is what CODE_POINT maps to.
6648 # There are tables which end up only having one element per
6649 # range, but it is not worth keeping track of for making just
6650 # this comment a little better.
6652 non-comment portions of the main body of lines of this file is:
6653 START\\tSTOP\\tMAPPING where START is the starting code point of the
6654 range, in hex; STOP is the ending point, or if omitted, the range has just one
6655 code point; MAPPING is what each code point between START and STOP maps to.
6657 if ($self->output_range_counts) {
6659 Numbers in comments in [brackets] indicate how many code points are in the
6660 range (omitted when the range is a single code point or if the mapping is to
6666 $self->set_comment(main::join_lines($comment));
6670 my %swash_keys; # Makes sure don't duplicate swash names.
6672 # The remaining variables are temporaries used while writing each table,
6673 # to output special ranges.
6674 my @multi_code_point_maps; # Map is to more than one code point.
6676 sub handle_special_range {
6677 # Called in the middle of write when it finds a range it doesn't know
6682 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6684 my $addr = do { no overloading; pack 'J', $self; };
6686 my $type = $range->type;
6688 my $low = $range->start;
6689 my $high = $range->end;
6690 my $map = $range->value;
6692 # No need to output the range if it maps to the default.
6693 return if $map eq $default_map{$addr};
6695 my $property = $self->property;
6697 # Switch based on the map type...
6698 if ($type == $HANGUL_SYLLABLE) {
6700 # These are entirely algorithmically determinable based on
6701 # some constants furnished by Unicode; for now, just set a
6702 # flag to indicate that have them. After everything is figured
6703 # out, we will output the code that does the algorithm. (Don't
6704 # output them if not needed because we are suppressing this
6706 $has_hangul_syllables = 1 if $property->to_output_map;
6708 elsif ($type == $CP_IN_NAME) {
6710 # Code points whose name ends in their code point are also
6711 # algorithmically determinable, but need information about the map
6712 # to do so. Both the map and its inverse are stored in data
6713 # structures output in the file. They are stored in the mean time
# in global lists. The lists will be written out later into Name.pm,
6715 # which is created only if needed. In order to prevent duplicates
6716 # in the list, only add to them for one property, should multiple
6718 if ($needing_code_points_ending_in_code_point == 0) {
6719 $needing_code_points_ending_in_code_point = $property;
6721 if ($property == $needing_code_points_ending_in_code_point) {
6722 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6723 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6725 my $squeezed = $map =~ s/[-\s]+//gr;
6726 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6728 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6731 push @code_points_ending_in_code_point, { low => $low,
6737 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6739 # Multi-code point maps and null string maps have an entry
6740 # for each code point in the range. They use the same
6742 for my $code_point ($low .. $high) {
6744 # The pack() below can't cope with surrogates. XXX This may
6746 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6747 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
6751 # Generate the hash entries for these in the form that
6752 # utf8.c understands.
6756 foreach my $to (split " ", $map) {
6757 if ($to !~ /^$code_point_re$/) {
6758 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6761 $tostr .= sprintf "\\x{%s}", $to;
6762 $to = CORE::hex $to;
6764 $to_name .= " + " if $to_name;
6765 $to_chr .= chr($to);
6766 main::populate_char_info($to)
6767 if ! defined $viacode[$to];
6768 $to_name .= $viacode[$to];
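# Build the text of one hash entry. The key is the sequence of bytes
# that make up the UTF-8 encoding of $code_point (pack "U" creates the
# character; unpack "U0C*" yields its encoded bytes), each written as
# a \xHH escape. The value is $tostr, built in the loop above.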
6774 my $utf8 = sprintf(qq["%s" => "$tostr",],
6775 join("", map { sprintf "\\x%02X", $_ }
6776 unpack("U0C*", pack("U", $code_point))));
6778 # Add a comment so that a human reader can more easily
6779 # see what's going on.
6780 push @multi_code_point_maps,
6781 sprintf("%-45s # U+%04X", $utf8, $code_point);
6783 $multi_code_point_maps[-1] .= " => $map";
6786 main::populate_char_info($code_point)
6787 if ! defined $viacode[$code_point];
6788 $multi_code_point_maps[-1] .= " '"
6790 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
# Report the unrecognized map type. A method call is not interpolated
# inside a double-quoted string: "$range->type" would stringify $range and
# then append the literal text '->type'. So use $type, which was fetched
# from $range->type at the top of this subroutine.
Carp::my_carp("Unrecognized map type '$type' in '$range' in $self. Not written");
6802 # Returns the string that should be output in the file before the main
6803 # body of this table. It isn't called until the main body is
6804 # calculated, saving a pass. The string includes some hash entries
6805 # identifying the format of the body, and what the single value should
6806 # be for all ranges missing from it. It also includes any code points
6807 # which have map_types that don't go in the main table.
6810 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6812 my $addr = do { no overloading; pack 'J', $self; };
6814 my $name = $self->property->swash_name;
6816 # Currently there is nothing in the pre_body unless a swash is being
6818 return unless defined $name;
6820 if (defined $swash_keys{$name}) {
6821 Carp::my_carp(main::join_lines(<<END
6822 Already created a swash name '$name' for $swash_keys{$name}. This means that
6823 the same name desired for $self shouldn't be used. Bad News. This must be
6824 fixed before production use, but proceeding anyway
6828 $swash_keys{$name} = "$self";
6832 # Here we assume we were called after have gone through the whole
6833 # file. If we actually generated anything for each map type, add its
6834 # respective header and trailer
6835 my $specials_name = "";
6836 if (@multi_code_point_maps) {
6837 $specials_name = "utf8::ToSpec$name";
6840 # Some code points require special handling because their mappings are each to
6841 # multiple code points. These do not appear in the main body, but are defined
6842 # in the hash below.
6844 # Each key is the string of N bytes that together make up the UTF-8 encoding
6845 # for the code point. (i.e. the same as looking at the code point's UTF-8
6846 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
6847 \%$specials_name = (
6849 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6852 my $format = $self->format;
6856 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6857 if ($output_adjusted) {
6858 if ($specials_name) {
6860 # The mappings in the non-hash portion of this file must be modified to get the
6861 # correct values by adding the code point ordinal number to each one that is
6867 # The mappings must be modified to get the correct values by adding the code
6868 # point ordinal number to each one that is numeric.
6875 # The name this swash is to be known by, with the format of the mappings in
6876 # the main body of the table, and what all code points missing from this file
6878 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6880 if ($specials_name) {
6882 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6885 my $default_map = $default_map{$addr};
6887 # For $CODE_POINT default maps and using adjustments, instead the default
6889 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6890 . (($output_adjusted && $default_map eq $CODE_POINT)
6895 if ($default_map eq $CODE_POINT) {
6896 $return .= ' # code point maps to itself';
6898 elsif ($default_map eq "") {
6899 $return .= ' # code point maps to the null string';
6903 $return .= $pre_body;
6909 # Write the table to the file.
6912 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6914 my $addr = do { no overloading; pack 'J', $self; };
6916 # Clear the temporaries
6917 undef @multi_code_point_maps;
6919 # Calculate the format of the table if not already done.
6920 my $format = $self->format;
6921 my $type = $self->property->type;
6922 my $default_map = $self->default_map;
6923 if (! defined $format) {
6924 if ($type == $BINARY) {
6926 # Don't bother checking the values, because we elsewhere
6927 # verify that a binary table has only 2 values.
6928 $format = $BINARY_FORMAT;
6931 my @ranges = $self->_range_list->ranges;
6933 # default an empty table based on its type and default map
6936 # But it turns out that the only one we can say is a
6937 # non-string (besides binary, handled above) is when the
6938 # table is a string and the default map is to a code point
6939 if ($type == $STRING && $default_map eq $CODE_POINT) {
6940 $format = $HEX_FORMAT;
6943 $format = $STRING_FORMAT;
6948 # Start with the most restrictive format, and as we find
6949 # something that doesn't fit with that, change to the next
6950 # most restrictive, and so on.
6951 $format = $DECIMAL_FORMAT;
6952 foreach my $range (@ranges) {
6953 next if $range->type != 0; # Non-normal ranges don't
6954 # affect the main body
6955 my $map = $range->value;
6956 if ($map ne $default_map) {
6957 last if $format eq $STRING_FORMAT; # already at
6960 $format = $INTEGER_FORMAT
6961 if $format eq $DECIMAL_FORMAT
6962 && $map !~ / ^ [0-9] $ /x;
6963 $format = $FLOAT_FORMAT
6964 if $format eq $INTEGER_FORMAT
6965 && $map !~ / ^ -? [0-9]+ $ /x;
6966 $format = $RATIONAL_FORMAT
6967 if $format eq $FLOAT_FORMAT
6968 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6969 $format = $HEX_FORMAT
6970 if ($format eq $RATIONAL_FORMAT
6972 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6973 # Assume a leading zero means hex,
6974 # even if all digits are 0-9
6975 || ($format eq $INTEGER_FORMAT
6976 && $map =~ /^0[0-9A-F]/);
6977 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6978 && $map =~ /[^0-9A-F]/;
6983 } # end of calculating format
6985 if ($default_map eq $CODE_POINT
6986 && $format ne $HEX_FORMAT
6987 && ! defined $self->format) # manual settings are always
6990 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6993 # If the output is to be adjusted, the format of the table that gets
6994 # output is actually 'a' or 'ax' instead of whatever it is stored
6996 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6997 if ($output_adjusted) {
6998 if ($default_map eq $CODE_POINT) {
6999 $format = $HEX_ADJUST_FORMAT;
7002 $format = $ADJUST_FORMAT;
7006 $self->_set_format($format);
7008 return $self->SUPER::write(
7010 ($self->property == $block)
7011 ? 7 # block file needs more tab stops
7013 $default_map); # don't write defaulteds
7016 # Accessors for the underlying list that should fail if locked.
7026 return if $self->carp_if_locked;
7027 return $self->_range_list->$sub(@_);
7030 } # End closure for Map_Table
7032 package Match_Table;
7033 use parent '-norequire', '_Base_Table';
7035 # A Match table is one which is a list of all the code points that have
7036 # the same property and property value, for use in \p{property=value}
7037 # constructs in regular expressions. It adds very little data to the base
7038 # structure, but many methods, as these lists can be combined in many ways to
7040 # There are only a few concepts added:
7041 # 1) Equivalents and Relatedness.
7042 # Two tables can match the identical code points, but have different names.
7043 # This always happens when there is a perl single form extension
7044 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
7045 # tables are set to be related, with the Perl extension being a child, and
7046 # the Unicode property being the parent.
7048 # It may be that two tables match the identical code points and we don't
7049 # know if they are related or not. This happens most frequently when the
7050 # Block and Script properties have the exact range. But note that a
7051 # revision to Unicode could add new code points to the script, which would
7052 # now have to be in a different block (as the block was filled, or there
7053 # would have been 'Unknown' script code points in it and they wouldn't have
7054 # been identical). So we can't rely on any two properties from Unicode
7055 # always matching the same code points from release to release, and thus
7056 # these tables are considered coincidentally equivalent--not related. When
7057 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
7058 # 'leader', and the others are 'equivalents'. This concept is useful
7059 # to minimize the number of tables written out. Only one file is used for
7060 # any identical set of code points, with entries in Heavy.pl mapping all
7061 # the involved tables to it.
7063 # Related tables will always be identical; we set them up to be so. Thus
7064 # if the Unicode one is deprecated, the Perl one will be too. Not so for
7065 # unrelated tables. Relatedness makes generating the documentation easier.
7068 # Like equivalents, two tables may be the inverses of each other, the
7069 # intersection between them is null, and the union is every Unicode code
7070 # point. The two tables that occupy a binary property are necessarily like
7071 # this. By specifying one table as the complement of another, we can avoid
7072 # storing it on disk (using the other table and performing a fast
7073 # transform), and some memory and calculations.
7075 # 3) Conflicting. It may be that there will eventually be name clashes, with
7076 # the same name meaning different things. For a while, there actually were
7077 # conflicts, but they have so far been resolved by changing Perl's or
7078 # Unicode's definitions to match the other, but when this code was written,
7079 # it wasn't clear that that was what was going to happen. (Unicode changed
7080 # because of protests during their beta period.) Name clashes are warned
7081 # about during compilation, and the documentation. The generated tables
7082 # are sane, free of name clashes, because the code suppresses the Perl
7083 # version. But manual intervention to decide what the actual behavior
7084 # should be may be required should this happen. The introductory comments
7085 # have more to say about this.
sub standardize {
    # Package-local shorthand: lets code in this package call
    # standardize() unqualified. Passes its first argument on to
    # main::standardize() and returns whatever that returns.
    return main::standardize($_[0]);
}
sub trace {
    # Package-local shorthand: lets code in this package call trace()
    # unqualified. Passes all of its arguments on to main::trace() and
    # returns whatever that returns.
    return main::trace(@_);
}
7093 main::setup_package();
7096 # The leader table of this one; initially $self.
7097 main::set_access('leader', \%leader, 'r');
7100 # An array of any tables that have this one as their leader
7101 main::set_access('equivalents', \%equivalents, 'readable_array');
7104 # The parent table to this one, initially $self. This allows us to
7105 # distinguish between equivalent tables that are related (for which this
7106 # is set to), and those which may not be, but share the same output file
7107 # because they match the exact same set of code points in the current
7109 main::set_access('parent', \%parent, 'r');
7112 # An array of any tables that have this one as their parent
7113 main::set_access('children', \%children, 'readable_array');
7116 # Array of any tables that would have the same name as this one with
7117 # a different meaning. This is used for the generated documentation.
7118 main::set_access('conflicting', \%conflicting, 'readable_array');
7121 # Set in the constructor for tables that are expected to match all code
7123 main::set_access('matches_all', \%matches_all, 'r');
7126 # Points to the complement that this table is expressed in terms of; 0 if
7128 main::set_access('complement', \%complement, 'r');
7135 # The property for which this table is a listing of property values.
7136 my $property = delete $args{'_Property'};
7138 my $name = delete $args{'Name'};
7139 my $full_name = delete $args{'Full_Name'};
7140 $full_name = $name if ! defined $full_name;
7143 my $initialize = delete $args{'Initialize'};
7144 my $matches_all = delete $args{'Matches_All'} || 0;
7145 my $format = delete $args{'Format'};
7146 # Rest of parameters passed on.
7148 my $range_list = Range_List->new(Initialize => $initialize,
7149 Owner => $property);
7151 my $complete = $full_name;
7152 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
7153 # but this helps debug if it
# The complete name for a match table includes its property in a
7156 # compound form 'property=table', except if the property is the
7157 # pseudo-property, perl, in which case it is just the single form,
7158 # 'table' (If you change the '=' must also change the ':' in lots of
7159 # places in this program that assume an equal sign)
7160 $complete = $property->full_name . "=$complete" if $property != $perl;
7162 my $self = $class->SUPER::new(%args,
7164 Complete_Name => $complete,
7165 Full_Name => $full_name,
7166 _Property => $property,
7167 _Range_List => $range_list,
7168 Format => $EMPTY_FORMAT,
7170 my $addr = do { no overloading; pack 'J', $self; };
7172 $conflicting{$addr} = [ ];
7173 $equivalents{$addr} = [ ];
7174 $children{$addr} = [ ];
7175 $matches_all{$addr} = $matches_all;
7176 $leader{$addr} = $self;
7177 $parent{$addr} = $self;
7178 $complement{$addr} = 0;
7180 if (defined $format && $format ne $EMPTY_FORMAT) {
7181 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
7187 # See this program's beginning comment block about overloading these.
7190 qw("") => "_operator_stringify",
7194 return if $self->carp_if_locked;
7202 return $self->_range_list + $other;
7208 return $self->_range_list & $other;
7213 my $reversed = shift;
7216 Carp::my_carp_bug("Bad news. Can't cope with '"
7220 . "'. undef returned.");
7224 return if $self->carp_if_locked;
7226 my $addr = do { no overloading; pack 'J', $self; };
7230 # Change the range list of this table to be the
7232 $self->_set_range_list($self->_range_list
7235 else { # $other is just a simple value
7236 $self->add_range($other, $other);
7243 my $reversed = shift;
7246 Carp::my_carp_bug("Bad news. Can't cope with '"
7250 . "'. undef returned.");
7254 return if $self->carp_if_locked;
7255 $self->_set_range_list($self->_range_list & $other);
7258 '-' => sub { my $self = shift;
7260 my $reversed = shift;
7262 Carp::my_carp_bug("Bad news. Can't cope with '"
7266 . "'. undef returned.");
7270 return $self->_range_list - $other;
7272 '~' => sub { my $self = shift;
7273 return ~ $self->_range_list;
7277 sub _operator_stringify {
7280 my $name = $self->complete_name;
7281 return "Table '$name'";
7285 # Returns the range list associated with this table, which will be the
7286 # complement's if it has one.
7290 if (($complement = $self->complement) != 0) {
7291 return ~ $complement->_range_list;
7294 return $self->SUPER::_range_list;
7299 # Add a synonym for this table. See the comments in the base class
7303 # Rest of parameters passed on.
7305 $self->SUPER::add_alias($name, $self, @_);
7309 sub add_conflicting {
7310 # Add the name of some other object to the list of ones that name
7311 # clash with this match table.
7314 my $conflicting_name = shift; # The name of the conflicting object
7315 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
7316 my $conflicting_object = shift; # Optional, the conflicting object
7317 # itself. This is used to
7318 # disambiguate the text if the input
7319 # name is identical to any of the
7320 # aliases $self is known by.
7321 # Sometimes the conflicting object is
7322 # merely hypothetical, so this has to
7323 # be an optional parameter.
7324 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7326 my $addr = do { no overloading; pack 'J', $self; };
7328 # Check if the conflicting name is exactly the same as any existing
7329 # alias in this table (as long as there is a real object there to
7330 # disambiguate with).
7331 if (defined $conflicting_object) {
7332 foreach my $alias ($self->aliases) {
7333 if ($alias->name eq $conflicting_name) {
7335 # Here, there is an exact match. This results in
7336 # ambiguous comments, so disambiguate by changing the
7337 # conflicting name to its object's complete equivalent.
7338 $conflicting_name = $conflicting_object->complete_name;
7344 # Convert to the \p{...} final name
7345 $conflicting_name = "\\$p" . "{$conflicting_name}";
7348 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7350 push @{$conflicting{$addr}}, $conflicting_name;
7355 sub is_set_equivalent_to {
7356 # Return boolean of whether or not the other object is a table of this
7357 # type and has been marked equivalent to this one.
7361 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7363 return 0 if ! defined $other; # Can happen for incomplete early
7365 unless ($other->isa(__PACKAGE__)) {
7366 my $ref_other = ref $other;
7367 my $ref_self = ref $self;
7368 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
7372 # Two tables are equivalent if they have the same leader.
7374 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7378 sub set_equivalent_to {
7379 # Set $self equivalent to the parameter table.
7380 # The required Related => 'x' parameter is a boolean indicating
7381 # whether these tables are related or not. If related, $other becomes
7382 # the 'parent' of $self; if unrelated it becomes the 'leader'
7384 # Related tables share all characteristics except names; equivalents
7385 # not quite so many.
7386 # If they are related, one must be a perl extension. This is because
7387 # we can't guarantee that Unicode won't change one or the other in a
7388 # later release even if they are identical now.
7394 my $related = delete $args{'Related'};
7396 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7398 return if ! defined $other; # Keep on going; happens in some early
7401 if (! defined $related) {
7402 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7406 # If already are equivalent, no need to re-do it; if subroutine
7407 # returns null, it found an error, also do nothing
7408 my $are_equivalent = $self->is_set_equivalent_to($other);
7409 return if ! defined $are_equivalent || $are_equivalent;
7411 my $addr = do { no overloading; pack 'J', $self; };
7412 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7415 if ($current_leader->perl_extension) {
7416 if ($other->perl_extension) {
7417 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7420 } elsif ($self->property != $other->property # Depending on
7426 && ! $other->perl_extension)
7428 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
7433 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7434 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
7438 my $leader = do { no overloading; pack 'J', $current_leader; };
7439 my $other_addr = do { no overloading; pack 'J', $other; };
7441 # Any tables that are equivalent to or children of this table must now
7442 # instead be equivalent to or (children) to the new leader (parent),
7443 # still equivalent. The equivalency includes their matches_all info,
7444 # and for related tables, their fate and status.
7445 # All related tables are of necessity equivalent, but the converse
7446 # isn't necessarily true
# Capture the characteristics of $other that are propagated below to each
# table being made equivalent to it.
my $status = $other->status;
my $status_info = $other->status_info;
my $fate = $other->fate;

# %matches_all is keyed by object address, so the subscript must be the
# variable $other_addr. A bareword subscript 'other_addr' is auto-quoted
# as a constant string key that is never populated, which would leave
# $matches_all always undefined and lose the leader's setting.
my $matches_all = $matches_all{$other_addr};
my $caseless_equivalent = $other->caseless_equivalent;
7452 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7453 next if $table == $other;
7454 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7456 my $table_addr = do { no overloading; pack 'J', $table; };
7457 $leader{$table_addr} = $other;
7458 $matches_all{$table_addr} = $matches_all;
7459 $self->_set_range_list($other->_range_list);
7460 push @{$equivalents{$other_addr}}, $table;
7462 $parent{$table_addr} = $other;
7463 push @{$children{$other_addr}}, $table;
7464 $table->set_status($status, $status_info);
7466 # This reason currently doesn't get exposed outside; otherwise
7467 # would have to look up the parent's reason and use it instead.
7468 $table->set_fate($fate, "Parent's fate");
7470 $self->set_caseless_equivalent($caseless_equivalent);
7474 # Now that we've declared these to be equivalent, any changes to one
7475 # of the tables would invalidate that equivalency.
7481 sub set_complement {
7482 # Set $self to be the complement of the parameter table. $self is
7483 # locked, as what it contains should all come from the other table.
7489 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7491 if ($other->complement != 0) {
7492 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7495 my $addr = do { no overloading; pack 'J', $self; };
7496 $complement{$addr} = $other;
7501 sub add_range { # Add a range to the list for this table.
7503 # Rest of parameters passed on
7505 return if $self->carp_if_locked;
7506 return $self->_range_list->add_range(@_);
7511 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7513 # All match tables are to be used only by the Perl core.
7514 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7517 sub pre_body { # Does nothing for match tables.
7521 sub append_to_body { # Does nothing for match tables.
7529 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7531 $self->SUPER::set_fate($fate, $reason);
7533 # All children share this fate
7534 foreach my $child ($self->children) {
7535 $child->set_fate($fate, $reason);
7542 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7544 return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7547 sub set_final_comment {
7548 # This creates a comment for the file that is to hold the match table
7549 # $self. It is somewhat convoluted to make the English read nicely,
7550 # but, heh, it's just a comment.
7551 # This should be called only with the leader match table of all the
7552 # ones that share the same file. It lists all such tables, ordered so
7553 # that related ones are together.
7555 return unless $debugging_build;
7557 my $leader = shift; # Should only be called on the leader table of
7558 # an equivalent group
7559 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7561 my $addr = do { no overloading; pack 'J', $leader; };
7563 if ($leader{$addr} != $leader) {
7564 Carp::my_carp_bug(<<END
7565 set_final_comment() must be called on a leader table, which $leader is not.
7566 It is equivalent to $leader{$addr}. No comment created
7572 # Get the number of code points matched by each of the tables in this
7573 # file, and add underscores for clarity.
7574 my $count = $leader->count;
7575 my $string_count = main::clarify_number($count);
7577 my $loose_count = 0; # how many aliases loosely matched
7578 my $compound_name = ""; # ? Are any names compound?, and if so, an
7580 my $properties_with_compound_names = 0; # count of these
7583 my %flags; # The status flags used in the file
7584 my $total_entries = 0; # number of entries written in the comment
7585 my $matches_comment = ""; # The portion of the comment about the
7587 my @global_comments; # List of all the tables' comments that are
7588 # there before this routine was called.
7589 my $has_ucd_alias = 0; # If there is an alias that is accessible via
7590 # Unicode::UCD. If not, then don't say it is
7593 # Get list of all the parent tables that are equivalent to this one
7594 # (including itself).
7595 my @parents = grep { $parent{main::objaddr $_} == $_ }
7596 main::uniques($leader, @{$equivalents{$addr}});
7597 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
7600 for my $parent (@parents) {
7602 my $property = $parent->property;
7604 # Special case 'N' tables in properties with two match tables when
7605 # the other is a 'Y' one. These are likely to be binary tables,
7606 # but not necessarily. In either case, \P{} will match the
7607 # complement of \p{}, and so if something is a synonym of \p, the
7608 # complement of that something will be the synonym of \P. This
7609 # would be true of any property with just two match tables, not
7610 # just those whose values are Y and N; but that would require a
7611 # little extra work, and there are none such so far in Unicode.
7612 my $perl_p = 'p'; # which is it? \p{} or \P{}
7613 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
7615 if (scalar $property->tables == 2
7616 && $parent == $property->table('N')
7617 && defined (my $yes = $property->table('Y')))
7619 my $yes_addr = do { no overloading; pack 'J', $yes; };
7621 = grep { $_->property == $perl }
7624 $parent{$yes_addr}->children);
# But these synonyms are \P{}, not \p{}
7630 my @description; # Will hold the table description
7631 my @note; # Will hold the table notes.
7632 my @conflicting; # Will hold the table conflicts.
7634 # Look at the parent, any yes synonyms, and all the children
7635 my $parent_addr = do { no overloading; pack 'J', $parent; };
7636 for my $table ($parent,
7638 @{$children{$parent_addr}})
7640 my $table_addr = do { no overloading; pack 'J', $table; };
7641 my $table_property = $table->property;
7643 # Tables are separated by a blank line to create a grouping.
7644 $matches_comment .= "\n" if $matches_comment;
7646 # The table is named based on the property and value
7647 # combination it is for, like script=greek. But there may be
7648 # a number of synonyms for each side, like 'sc' for 'script',
7649 # and 'grek' for 'greek'. Any combination of these is a valid
7650 # name for this table. In this case, there are three more,
7651 # 'sc=grek', 'sc=greek', and 'script=grek'. Rather than
7652 # listing all possible combinations in the comment, we make
7653 # sure that each synonym occurs at least once, and add
7654 # commentary that the other combinations are possible.
7655 # Because regular expressions don't recognize things like
7656 # \p{jsn=}, only look at non-null right-hand-sides
7657 my @property_aliases = $table_property->aliases;
7658 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7660 # The alias lists above are already ordered in the order we
7661 # want to output them. To ensure that each synonym is listed,
7662 # we must use the max of the two numbers. But if there are no
7663 # legal synonyms (nothing in @table_aliases), then we don't
7665 my $listed_combos = (@table_aliases)
7666 ? main::max(scalar @table_aliases,
7667 scalar @property_aliases)
7669 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7672 my $property_had_compound_name = 0;
7674 for my $i (0 .. $listed_combos - 1) {
7677 # The current alias for the property is the next one on
7678 # the list, or if beyond the end, start over. Similarly
7679 # for the table (\p{prop=table})
7680 my $property_alias = $property_aliases
7681 [$i % @property_aliases]->name;
7682 my $table_alias_object = $table_aliases
7683 [$i % @table_aliases];
7684 my $table_alias = $table_alias_object->name;
7685 my $loose_match = $table_alias_object->loose_match;
7686 $has_ucd_alias |= $table_alias_object->ucd;
7688 if ($table_alias !~ /\D/) { # Clarify large numbers.
7689 $table_alias = main::clarify_number($table_alias)
7692 # Add a comment for this alias combination
7693 my $current_match_comment;
7694 if ($table_property == $perl) {
7695 $current_match_comment = "\\$perl_p"
7699 $current_match_comment
7700 = "\\p{$property_alias=$table_alias}";
7701 $property_had_compound_name = 1;
7704 # Flag any abnormal status for this table.
7705 my $flag = $property->status
7707 || $table_alias_object->status;
7708 if ($flag && $flag ne $PLACEHOLDER) {
7709 $flags{$flag} = $status_past_participles{$flag};
7714 # Pretty up the comment. Note the \b; it says don't make
7715 # this line a continuation.
7716 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7719 $current_match_comment);
7720 } # End of generating the entries for this table.
7722 # Save these for output after this group of related tables.
7723 push @description, $table->description;
7724 push @note, $table->note;
7725 push @conflicting, $table->conflicting;
7727 # And this for output after all the tables.
7728 push @global_comments, $table->comment;
7730 # Compute an alternate compound name using the final property
7731 # synonym and the first table synonym with a colon instead of
7732 # the equal sign used elsewhere.
7733 if ($property_had_compound_name) {
7734 $properties_with_compound_names ++;
7735 if (! $compound_name || @property_aliases > 1) {
7736 $compound_name = $property_aliases[-1]->name
7738 . $table_aliases[0]->name;
7741 } # End of looping through all children of this table
7743 # Here have assembled in $matches_comment all the related tables
7744 # to the current parent (preceded by the same info for all the
7745 # previous parents). Put out information that applies to all of
7746 # the current family.
7749 # But output the conflicting information now, as it applies to
7751 my $conflicting = join ", ", @conflicting;
7753 $matches_comment .= <<END;
7755 Note that contrary to what you might expect, the above is NOT the same as
7757 $matches_comment .= "any of: " if @conflicting > 1;
7758 $matches_comment .= "$conflicting\n";
7762 $matches_comment .= "\n Meaning: "
7763 . join('; ', @description)
7767 $matches_comment .= "\n Note: "
7768 . join("\n ", @note)
7771 } # End of looping through all tables
7779 $code_points = 'single code point';
7783 $code_points = "$string_count code points";
7788 if ($total_entries == 1) {
7791 $any_of_these = 'this'
7794 $synonyms = " any of the following regular expression constructs";
7795 $entries = 'entries';
7796 $any_of_these = 'any of these'
7800 if ($has_ucd_alias) {
7801 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7803 if ($has_unrelated) {
7805 This file is for tables that are not necessarily related: To conserve
7806 resources, every table that matches the identical set of code points in this
7807 version of Unicode uses this file. Each one is listed in a separate group
7808 below. It could be that the tables will match the same set of code points in
7809 other Unicode releases, or it could be purely coincidence that they happen to
7810 be the same in Unicode $string_version, and hence may not in other versions.
7816 foreach my $flag (sort keys %flags) {
7818 '$flag' below means that this form is $flags{$flag}.
7819 Consult $pod_file.pod
7825 if ($total_entries == 0) {
7826 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7828 This file returns the $code_points in Unicode Version $string_version for
7829 $leader, but it is inaccessible through Perl regular expressions, as
7830 "\\p{prop=}" is not recognized.
7835 This file returns the $code_points in Unicode Version $string_version that
7839 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7840 including if adding or subtracting white space, underscore, and hyphen
7841 characters matters or doesn't matter, and other permissible syntactic
7842 variants. Upper/lower case distinctions never matter.
7846 if ($compound_name) {
7849 A colon can be substituted for the equals sign, and
7851 if ($properties_with_compound_names > 1) {
7853 within each group above,
7856 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7858 # Note the \b below, it says don't make that line a continuation.
7860 anything to the left of the equals (or colon) can be combined with anything to
7861 the right. Thus, for example,
7867 # And append any comment(s) from the actual tables. They are all
7868 # gathered here, so may not read all that well.
7869 if (@global_comments) {
7870 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7873 if ($count) { # The format differs if no code points, and needs no
7874 # explanation in that case
7877 The format of the lines of this file is:
7880 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7881 STOP is the ending point, or if omitted, the range has just one code point.
7883 if ($leader->output_range_counts) {
7885 Numbers in comments in [brackets] indicate how many code points are in the
7891 $leader->set_comment(main::join_lines($comment));
7895 # Accessors for the underlying list
7897 get_valid_code_point
7898 get_invalid_code_point
7906 return $self->_range_list->$sub(@_);
7909 } # End closure for Match_Table
7913 # The Property class represents a Unicode property, or the $perl
7914 # pseudo-property. It contains a map table initialized empty at construction
7915 # time, and for properties accessible through regular expressions, various
7916 # match tables, created through the add_match_table() method, and referenced
7917 # by the table('NAME') or tables() methods, the latter returning a list of all
7918 # of the match tables. Otherwise table operations implicitly are for the map
7921 # Most of the data in the property is actually about its map table, so it
7922 # mostly just uses that table's accessors for most methods. The two could
7923 # have been combined into one object, but for clarity because of their
7924 # differing semantics, they have been kept separate. It could be argued that
7925 # the 'file' and 'directory' fields should be kept with the map table.
7927 # Each property has a type. This can be set in the constructor, or in the
7928 # set_type accessor, but mostly it is figured out by the data. Every property
7929 # starts with unknown type, overridden by a parameter to the constructor, or
7930 # as match tables are added, or ranges added to the map table, the data is
7931 # inspected, and the type changed. After the table is mostly or entirely
7932 # filled, compute_type() should be called to finalize the analysis.
7934 # There are very few operations defined. One can safely remove a range from
7935 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7936 # table to this one, replacing any in the intersection of the two.
7938 sub standardize { return main::standardize($_[0]); }
7939 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7943 # This hash will contain as keys, all the aliases of all properties, and
7944 # as values, pointers to their respective property objects. This allows
7945 # quick look-up of a property from any of its names.
7946 my %alias_to_property_of;
7948 sub dump_alias_to_property_of {
7951 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7956 # This is a package subroutine, not called as a method.
7957 # If the single parameter is a literal '*' it returns a list of all
7958 # defined properties.
7959 # Otherwise, the single parameter is a name, and it returns a pointer
7960 # to the corresponding property object, or undef if none.
7962 # Properties can have several different names. The 'standard' form of
7963 # each of them is stored in %alias_to_property_of as they are defined.
7964 # But it's possible that this subroutine will be called with some
7965 # variant, so if the initial lookup fails, it is repeated with the
7966 # standardized form of the input name. If found, besides returning the
7967 # result, the input name is added to the list so future calls won't
7968 # have to do the conversion again.
7972 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7974 if (! defined $name) {
7975 Carp::my_carp_bug("Undefined input property. No action taken.");
7979 return main::uniques(values %alias_to_property_of) if $name eq '*';
7981 # Return cached result if have it.
7982 my $result = $alias_to_property_of{$name};
7983 return $result if defined $result;
7985 # Convert the input to standard form.
7986 my $standard_name = standardize($name);
7988 $result = $alias_to_property_of{$standard_name};
7989 return unless defined $result; # Don't cache undefs
7991 # Cache the result before returning it.
7992 $alias_to_property_of{$name} = $result;
7997 main::setup_package();
8000 # A pointer to the map table object for this property
8001 main::set_access('map', \%map);
8004 # The property's full name. This is a duplicate of the copy kept in the
8005 # map table, but is needed because stringify needs it during
8006 # construction of the map table, and then would have a chicken before egg
8008 main::set_access('full_name', \%full_name, 'r');
8011 # This hash will contain as keys, all the aliases of any match tables
8012 # attached to this property, and as values, the pointers to their
8013 # respective tables. This allows quick look-up of a table from any of its
8015 main::set_access('table_ref', \%table_ref);
8018 # The type of the property, $ENUM, $BINARY, etc
8019 main::set_access('type', \%type, 'r');
8022 # The filename where the map table will go (if actually written).
8023 # Normally defaulted, but can be overridden.
8024 main::set_access('file', \%file, 'r', 's');
8027 # The directory where the map table will go (if actually written).
8028 # Normally defaulted, but can be overridden.
8029 main::set_access('directory', \%directory, 's');
8031 my %pseudo_map_type;
8032 # This is used to affect the calculation of the map types for all the
8033 # ranges in the table. It should be set to one of the values that signify
8034 # to alter the calculation.
8035 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8037 my %has_only_code_point_maps;
8038 # A boolean used to help in computing the type of data in the map table.
8039 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8042 # A list of the first few distinct mappings this property has. This is
8043 # used to disambiguate between binary and enum property types, so don't
8044 # have to keep more than three.
8045 main::set_access('unique_maps', \%unique_maps);
8047 my %pre_declared_maps;
8048 # A boolean that gives whether the input data should declare all the
8049 # tables used, or not. If the former, unknown ones raise a warning.
8050 main::set_access('pre_declared_maps',
8051 \%pre_declared_maps, 'r', 's');
8054 # The only required parameter is the positionally first, name. All
8055 # other parameters are key => value pairs. See the documentation just
8056 # above for the meanings of the ones not passed directly on to the map
8057 # table constructor.
8060 my $name = shift || "";
8062 my $self = property_ref($name);
8063 if (defined $self) {
8064 my $options_string = join ", ", @_;
8065 $options_string = ". Ignoring options $options_string" if $options_string;
8066 Carp::my_carp("$self is already in use. Using existing one$options_string;");
8072 $self = bless \do { my $anonymous_scalar }, $class;
8073 my $addr = do { no overloading; pack 'J', $self; };
8075 $directory{$addr} = delete $args{'Directory'};
8076 $file{$addr} = delete $args{'File'};
8077 $full_name{$addr} = delete $args{'Full_Name'} || $name;
8078 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8079 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8080 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8081 # Starting in this release, property
8082 # values should be defined for all
8083 # properties, except those overriding this
8084 // $v_version ge v5.1.0;
8086 # Rest of parameters passed on.
8088 $has_only_code_point_maps{$addr} = 1;
8089 $table_ref{$addr} = { };
8090 $unique_maps{$addr} = { };
8092 $map{$addr} = Map_Table->new($name,
8093 Full_Name => $full_name{$addr},
8094 _Alias_Hash => \%alias_to_property_of,
8100 # See this program's beginning comment block about overloading the copy
8101 # constructor. Few operations are defined on properties, but a couple are
8102 # useful. It is safe to take the inverse of a property, and to remove a
8103 # single code point from it.
8106 qw("") => "_operator_stringify",
8107 "." => \&main::_operator_dot,
8108 ".=" => \&main::_operator_dot_equal,
8109 '==' => \&main::_operator_equal,
8110 '!=' => \&main::_operator_not_equal,
8111 '=' => sub { return shift },
8112 '-=' => "_minus_and_equal",
# Handler named for the stringify entry in this package's overload list.
# It renders the object as the text Property followed by the quoted value
# of the invocant's full_name accessor.
8115 sub _operator_stringify {
8116 return "Property '" . shift->full_name . "'";
8119 sub _minus_and_equal {
8120 # Remove a single code point from the map table of a property.
8124 my $reversed = shift;
8125 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8128 Carp::my_carp_bug("Bad news. Can't cope with a "
8130 . " argument to '-='. Subtraction ignored.");
8133 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
8134 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
8136 . " from a non-object. undef returned.");
8141 $map{pack 'J', $self}->delete_range($other, $other);
8146 sub add_match_table {
8147 # Add a new match table for this property, with name given by the
8148 # parameter. It returns a pointer to the table.
8154 my $addr = do { no overloading; pack 'J', $self; };
# Look the name up exactly as given first, and then in its standardized
# form, to see whether a table by this name already exists.
8156 my $table = $table_ref{$addr}{$name};
8157 my $standard_name = main::standardize($name);
8159 || (defined ($table = $table_ref{$addr}{$standard_name})))
8161 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
# Remember the as-given spelling too, so it is found directly next time.
8162 $table_ref{$addr}{$name} = $table;
8167 # See if this is a perl extension, if not passed in.
8168 my $perl_extension = delete $args{'Perl_Extension'};
8170 = $self->perl_extension if ! defined $perl_extension;
8172 $table = Match_Table->new(
8174 Perl_Extension => $perl_extension,
8175 _Alias_Hash => $table_ref{$addr},
8178 # gets property's fate and status by default,
8179 # except if the name begins with an
8180 # underscore, default it to internal
8181 Fate => ($name =~ /^_/)
8184 Status => $self->status,
8185 _Status_Info => $self->status_info,
8187 return unless defined $table;
8190 # Save the names for quick look up
8191 $table_ref{$addr}{$standard_name} = $table;
8192 $table_ref{$addr}{$name} = $table;
8194 # Perhaps we can figure out the type of this property based on the
8195 # fact of adding this match table. First, string properties don't
8196 # have match tables; second, a binary property can't have 3 match
8198 if ($type{$addr} == $UNKNOWN) {
8199 $type{$addr} = $NON_STRING;
8201 elsif ($type{$addr} == $STRING) {
8202 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
8203 $type{$addr} = $NON_STRING;
8205 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
# Several names in the table_ref hash can point at the same table object,
# so the distinct tables are counted, not the hash entries.
8206 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8207 if ($type{$addr} == $BINARY) {
8208 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
8210 $type{$addr} = $ENUM;
8217 sub delete_match_table {
8218 # Delete the table referred to by $2 from the property $1.
# That is: drop every name that this property ($self) has recorded in its
# table_ref hash for the table object passed as the argument, and then
# call that table's DESTROY method.
8221 my $table_to_remove = shift;
8222 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8224 my $addr = do { no overloading; pack 'J', $self; };
8226 # Remove all names that refer to it.
# keys() produces the whole key list before the loop body runs, so deleting
# entries of the same hash inside the loop is safe.
8227 foreach my $key (keys %{$table_ref{$addr}}) {
8228 delete $table_ref{$addr}{$key}
8229 if $table_ref{$addr}{$key} == $table_to_remove;
# DESTROY is invoked explicitly here rather than left to reference counting.
8232 $table_to_remove->DESTROY;
8237 # Return a pointer to the match table (with name given by the
8238 # parameter) associated with this property; undef if none.
8242 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8244 my $addr = do { no overloading; pack 'J', $self; };
8246 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
8248 # If quick look-up failed, try again using the standard form of the
8249 # input name. If that succeeds, cache the result before returning so
8250 # won't have to standardize this input name again.
8251 my $standard_name = main::standardize($name);
8252 return unless defined $table_ref{$addr}{$standard_name};
8254 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
8255 return $table_ref{$addr}{$name};
8259 # Return a list of pointers to all the match tables attached to this
8263 return main::uniques(values %{$table_ref{pack 'J', shift}});
8267 # Returns the directory the map table for this property should be
8268 # output in. If a specific directory has been specified, that has
8269 # priority; 'undef' is returned if the type is still $UNKNOWN;
8270 # or $map_directory for everything else.
# The invocant is taken straight off @_ to form the key into the per-object
# field hashes.
8272 my $addr = do { no overloading; pack 'J', shift; };
8274 return $directory{$addr} if defined $directory{$addr};
# An explicit undef, not a bare return, is handed back in this case.
8275 return undef if $type{$addr} == $UNKNOWN;
8276 return $map_directory;
8280 # Return the name that is used to both:
8281 # 1) Name the file that the map table is written to.
8282 # 2) The name of swash related stuff inside that file.
8283 # The reason for this is that the Perl core historically has used
8284 # certain names that aren't the same as the Unicode property names.
8285 # To continue using these, $file is hard-coded in this file for those,
8286 # but otherwise the standard name is used. This is different from the
8287 # external_name, so that the rest of the files, like in lib can use
8288 # the standard name always, without regard to historical precedent.
8291 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8293 my $addr = do { no overloading; pack 'J', $self; };
8295 # Swash names are used only on either
8296 # 1) legacy-only properties, because the formats for these are
8297 # unchangeable, and they have had these lines in them; or
8298 # 2) regular map tables; otherwise there should be no access to the
8299 # property map table from other parts of Perl.
8300 return if $map{$addr}->fate != $ORDINARY
8301 && $map{$addr}->fate != $LEGACY_ONLY;
8303 return $file{$addr} if defined $file{$addr};
8304 return $map{$addr}->external_name;
8307 sub to_create_match_tables {
8308 # Returns a boolean as to whether or not match tables should be
8309 # created for this property.
8312 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8314 # The whole point of this pseudo property is match tables.
8315 return 1 if $self == $perl;
8317 my $addr = do { no overloading; pack 'J', $self; };
8319 # Don't generate tables of code points that match the property values
8320 # of a string property. Such a list would most likely have many
8321 # property values, each with just one or very few code points mapping
8323 return 0 if $type{$addr} == $STRING;
8325 # Don't generate anything for unimplemented properties.
8326 return 0 if grep { $self->complete_name eq $_ }
8327 @unimplemented_properties;
8332 sub property_add_or_replace_non_nulls {
8333 # This adds the mappings in the property $other to $self. Non-null
8334 # mappings from $other override those in $self. It essentially merges
8335 # the two properties, with the second having priority except for null
8340 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8342 if (! $other->isa(__PACKAGE__)) {
8343 Carp::my_carp_bug("$other should be a "
8352 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8356 # Certain tables are not generally written out to files, but
8357 # Unicode::UCD has the intelligence to know that the file for $self
8358 # can be used to reconstruct those tables. This routine just changes
8359 # things so that UCD pod entries for those suppressed tables are
8360 # generated, so the fact that a proxy is used is invisible to the
# Each element of @_ at this point is treated as a property name. A
# property whose to_output_map is true is left alone; every other one has
# its fate set to $MAP_PROXIED.
8365 foreach my $property_name (@_) {
# NOTE(review): property_ref() returns undef for a name it does not know,
# and $ref is used unchecked on the following line -- presumably callers
# only pass names of existing properties; confirm.
8366 my $ref = property_ref($property_name);
8367 next if $ref->to_output_map;
8368 $ref->set_fate($MAP_PROXIED);
8373 # Set the type of the property. Mostly this is figured out by the
8374 # data in the table. But this is used to set it explicitly. The
8375 # reason it is not a standard accessor is that when setting a binary
8376 # property, we need to make sure that all the true/false aliases are
8377 # present, as they were omitted in early Unicode releases.
8381 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8385 && $type != $FORCED_BINARY
8386 && $type != $STRING)
8388 Carp::my_carp("Unrecognized type '$type'. Type not set");
8392 { no overloading; $type{pack 'J', $self} = $type; }
8393 return if $type != $BINARY && $type != $FORCED_BINARY;
8395 my $yes = $self->table('Y');
8396 $yes = $self->table('Yes') if ! defined $yes;
8397 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8400 # Add aliases in order wanted, duplicates will be ignored. We use a
8401 # binary property present in all releases for its ordered lists of
8402 # true/false aliases. Note, that could run into problems in
8403 # outputting things in that we don't distinguish between the name and
8404 # full name of these. Hopefully, if the table was already created
8405 # before this code is executed, it was done with these set properly.
8406 my $bm = property_ref("Bidi_Mirrored");
8407 foreach my $alias ($bm->table("Y")->aliases) {
8408 $yes->add_alias($alias->name);
8410 my $no = $self->table('N');
8411 $no = $self->table('No') if ! defined $no;
8412 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8413 foreach my $alias ($bm->table("N")->aliases) {
8414 $no->add_alias($alias->name);
8421 # Add a map to the property's map table. This also keeps
8422 # track of the maps so that the property type can be determined from
8426 my $start = shift; # First code point in range
8427 my $end = shift; # Final code point in range
8428 my $map = shift; # What the range maps to.
8429 # Rest of parameters passed on.
8431 my $addr = do { no overloading; pack 'J', $self; };
8433 # If haven't the type of the property, gather information to figure it
8435 if ($type{$addr} == $UNKNOWN) {
8437 # If the map contains an interior blank or dash, or most other
8438 # nonword characters, it will be a string property. This
8439 # heuristic may actually miss some string properties. If so, they
8440 # may need to have explicit set_types called for them. This
8441 # happens in the Unihan properties.
8442 if ($map =~ / (?<= . ) [ -] (?= . ) /x
8443 || $map =~ / [^\w.\/\ -] /x)
8445 $self->set_type($STRING);
8447 # $unique_maps is used for disambiguating between ENUM and
8448 # BINARY later; since we know the property is not going to be
8449 # one of those, no point in keeping the data around
8450 undef $unique_maps{$addr};
8454 # Not necessarily a string. The final decision has to be
8455 # deferred until all the data are in. We keep track of if all
8456 # the values are code points for that eventual decision.
8457 $has_only_code_point_maps{$addr} &=
8458 $map =~ / ^ $code_point_re $/x;
8460 # For the purposes of disambiguating between binary and other
8461 # enumerations at the end, we keep track of the first three
8462 # distinct property values. Once we get to three, we know
8463 # it's not going to be binary, so no need to track more.
8464 if (scalar keys %{$unique_maps{$addr}} < 3) {
8465 $unique_maps{$addr}{main::standardize($map)} = 1;
8470 # Add the mapping by calling our map table's method
8471 return $map{$addr}->add_map($start, $end, $map, @_);
8475 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
8476 # should be called after the property is mostly filled with its maps.
8477 # We have been keeping track of what the property values have been,
8478 # and now have the necessary information to figure out the type.
8481 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8483 my $addr = do { no overloading; pack 'J', $self; };
8485 my $type = $type{$addr};
8487 # If already have figured these out, no need to do so again, but we do
8488 # a double check on ENUMS to make sure that a string property hasn't
8489 # improperly been classified as an ENUM, so continue on with those.
8490 return if $type == $STRING
8492 || $type == $FORCED_BINARY;
8494 # If every map is to a code point, is a string property.
8495 if ($type == $UNKNOWN
8496 && ($has_only_code_point_maps{$addr}
8497 || (defined $map{$addr}->default_map
8498 && $map{$addr}->default_map eq "")))
8500 $self->set_type($STRING);
8504 # Otherwise, it is to some sort of enumeration. (The case where
8505 # it is a Unicode miscellaneous property, and treated like a
8506 # string in this program is handled in add_map()). Distinguish
8507 # between binary and some other enumeration type. Of course, if
8508 # there are more than two values, it's not binary. But more
8509 # subtle is the test that the default mapping is defined means it
8510 # isn't binary. This in fact may change in the future if Unicode
8511 # changes the way its data is structured. But so far, no binary
8512 # properties ever have @missing lines for them, so the default map
8513 # isn't defined for them. The few properties that are two-valued
8514 # and aren't considered binary have the default map defined
8515 # starting in Unicode 5.0, when the @missing lines appeared; and
8516 # this program has special code to put in a default map for them
8517 # for earlier than 5.0 releases.
8519 || scalar keys %{$unique_maps{$addr}} > 2
8520 || defined $self->default_map)
8522 my $tables = $self->tables;
8523 my $count = $self->count;
8524 if ($verbosity && $tables > 500 && $tables/$count > .1) {
8525 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
8527 $self->set_type($ENUM);
8530 $self->set_type($BINARY);
8533 undef $unique_maps{$addr}; # Garbage collect
# Sets the fate of this property's map table and, unless the fate is
# $MAP_PROXIED, of all its match tables as well. When the fate is
# $SUPPRESSED, the reason is also recorded in %why_suppressed under the
# property's complete name.
8540 my $reason = shift; # Ignored unless suppressing
8541 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8543 my $addr = do { no overloading; pack 'J', $self; };
8544 if ($fate == $SUPPRESSED) {
8545 $why_suppressed{$self->complete_name} = $reason;
8548 # Each table shares the property's fate, except that MAP_PROXIED
8549 # doesn't affect match tables
8550 $map{$addr}->set_fate($fate, $reason);
8551 if ($fate != $MAP_PROXIED) {
# NOTE(review): the list below begins with the map table, which already had
# set_fate called on it just above, so it is set a second time here --
# presumably harmless; confirm.
8552 foreach my $table ($map{$addr}, $self->tables) {
8553 $table->set_fate($fate, $reason);
8560 # Most of the accessors for a property actually apply to its map table.
8561 # Set up accessor functions for those, referring to %map
8586 replacement_property
8611 # 'property' above is for symmetry, so that one can take
8612 # the property of a property and get itself, and so don't
8613 # have to distinguish between properties and tables in
8621 return $map{pack 'J', $self}->$sub(@_);
8631 # Returns lines of the input joined together, so that they can be folded
8633 # This causes continuation lines to be joined together into one long line
8634 # for folding. A continuation line is any line that doesn't begin with a
8635 # space or "\b" (the latter is stripped from the output). This is so
8636 # lines can be in a HERE document so as to fit nicely in the terminal
8637 # width, but be joined together in one long line, and then folded with
8638 # indents, '#' prefixes, etc, properly handled.
8639 # A blank separates the joined lines except if there is a break; an extra
8640 # blank is inserted after a period ending a line.
8642 # Initialize the return with the first line.
# NOTE(review): for an empty input string, split yields an empty list, so
# $return would be undefined in the comparison just below (an
# 'uninitialized' warning if warnings are enabled) -- presumably this is
# never called with an empty string; confirm.
8643 my ($return, @lines) = split "\n", shift;
8645 # If the first line is null, it was an empty line, add the \n back in
8646 $return = "\n" if $return eq "";
8648 # Now join the remainder of the physical lines.
8649 for my $line (@lines) {
8651 # An empty line means wanted a blank line, so add two \n's to get that
8652 # effect, and go to the next line.
8653 if (length $line == 0) {
8658 # Look at the last character of what we have so far.
8659 my $previous_char = substr($return, -1, 1);
8661 # And at the next char to be output.
8662 my $next_char = substr($line, 0, 1);
8664 if ($previous_char ne "\n") {
8666 # Here didn't end with a nl. If the next char is a blank or \b, it
8667 # means that here there is a break anyway. So add a nl to the
8669 if ($next_char eq " " || $next_char eq "\b") {
8670 $previous_char = "\n";
8671 $return .= $previous_char;
8674 # Add an extra space after periods.
8675 $return .= " " if $previous_char eq '.';
8678 # Here $previous_char is still the latest character to be output. If
8679 # it isn't a nl, it means that the next line is to be a continuation
8680 # line, with a blank inserted between them.
8681 $return .= " " if $previous_char ne "\n";
# The leading \b was only a marker meaning this line is not a continuation;
# remove it so that it doesn't appear in the output.
8684 substr($line, 0, 1) = "" if $next_char eq "\b";
8686 # And append this next line.
8693 sub simple_fold($;$$$) {
8694 # Returns a string of the input (string or an array of strings) folded
8695 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8697 # This is tailored for the kind of text written by this program,
8698 # especially the pod file, which can have very long names with
8699 # underscores in the middle, or words like AbcDefgHij.... We allow
8700 # breaking in the middle of such constructs if the line won't fit
8701 # otherwise. The break in such cases will come either just after an
8702 # underscore, or just before one of the Capital letters.
8704 local $to_trace = 0 if main::DEBUG;
8707 my $prefix = shift; # Optional string to prepend to each output
8709 $prefix = "" unless defined $prefix;
8711 my $hanging_indent = shift; # Optional number of spaces to indent
8712 # continuation lines
8713 $hanging_indent = 0 unless $hanging_indent;
8715 my $right_margin = shift; # Optional number of spaces to narrow the
8717 $right_margin = 0 unless defined $right_margin;
8719 # Call carp with the 'nofold' option to keep it from trying to call us
8721 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8723 # The space available doesn't include what's automatically prepended
8724 # to each line, or what's reserved on the right.
8725 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8726 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8728 if (DEBUG && $hanging_indent >= $max) {
8729 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8730 $hanging_indent = 0;
8733 # First, split into the current physical lines.
8735 if (ref $line) { # Better be an array, because not bothering to
8737 foreach my $line (@{$line}) {
8738 push @line, split /\n/, $line;
8742 @line = split /\n/, $line;
8745 #local $to_trace = 1 if main::DEBUG;
8746 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8748 # Look at each current physical line.
8749 for (my $i = 0; $i < @line; $i++) {
8750 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8751 #local $to_trace = 1 if main::DEBUG;
8752 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8754 # Remove prefix, because will be added back anyway, don't want
8756 $line[$i] =~ s/^$prefix//;
8758 # Remove trailing space
8759 $line[$i] =~ s/\s+\Z//;
8761 # If the line is too long, fold it.
8762 if (length $line[$i] > $max) {
8765 # Here needs to fold. Save the leading space in the line for
8767 $line[$i] =~ /^ ( \s* )/x;
8768 my $leading_space = $1;
8769 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8771 # If character at final permissible position is white space,
8772 # fold there, which will delete that white space
8773 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8774 $remainder = substr($line[$i], $max);
8775 $line[$i] = substr($line[$i], 0, $max - 1);
8779 # Otherwise fold at an acceptable break char closest to
8780 # the max length. Look at just the maximal initial
8781 # segment of the line
8782 my $segment = substr($line[$i], 0, $max - 1);
8784 /^ ( .{$hanging_indent} # Don't look before the
8786 \ * # Don't look in leading
8787 # blanks past the indent
8788 [^ ] .* # Find the right-most
8789 (?: # acceptable break:
8790 [ \s = ] # space or equal
8791 | - (?! [.0-9] ) # or non-unary minus.
8792 ) # $1 includes the character
8795 # Split into the initial part that fits, and remaining
8797 $remainder = substr($line[$i], length $1);
8799 trace $line[$i] if DEBUG && $to_trace;
8800 trace $remainder if DEBUG && $to_trace;
8803 # If didn't find a good breaking spot, see if there is a
8804 # not-so-good breaking spot. These are just after
8805 # underscores or where the case changes from lower to
8806 # upper. Use \a as a soft hyphen, but give up
8807 # and don't break the line if there is actually a \a
8808 # already in the input. We use an ascii character for the
8809 # soft-hyphen to avoid any attempt by miniperl to try to
8810 # access the files that this program is creating.
8811 elsif ($segment !~ /\a/
8812 && ($segment =~ s/_/_\a/g
8813 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8815 # Here were able to find at least one place to insert
8816 # our substitute soft hyphen. Find the right-most one
8817 # and replace it by a real hyphen.
8818 trace $segment if DEBUG && $to_trace;
8820 rindex($segment, "\a"),
8823 # Then remove the soft hyphen substitutes.
8824 $segment =~ s/\a//g;
8825 trace $segment if DEBUG && $to_trace;
8827 # And split into the initial part that fits, and
8828 # remainder of the line
8829 my $pos = rindex($segment, '-');
8830 $remainder = substr($line[$i], $pos);
8831 trace $remainder if DEBUG && $to_trace;
8832 $line[$i] = substr($segment, 0, $pos + 1);
8836 # Here we know if we can fold or not. If we can, $remainder
8837 # is what remains to be processed in the next iteration.
8838 if (defined $remainder) {
8839 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8841 # Insert the folded remainder of the line as a new element
8842 # of the array. (It may still be too long, but we will
8843 # deal with that next time through the loop.) Omit any
8844 # leading space in the remainder.
8845 $remainder =~ s/^\s+//;
8846 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8848 # But then indent by whichever is larger of:
8849 # 1) the leading space on the input line;
8850 # 2) the hanging indent.
8851 # This preserves indentation in the original line.
8852 my $lead = ($leading_space)
8853 ? length $leading_space
8855 $lead = max($lead, $hanging_indent);
8856 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8860 # Ready to output the line. Get rid of any trailing space
8861 # And prefix by the required $prefix passed in.
8862 $line[$i] =~ s/\s+$//;
8863 $line[$i] = "$prefix$line[$i]\n";
8864 } # End of looping through all the lines.
8866 return join "", @line;
# Convenience wrapper: passes its whole argument list through to
# Property::property_ref() and returns whatever that call returns.
8869 sub property_ref { # Returns a reference to a property object.
8870 return Property::property_ref(@_);
# force_unlink(FILENAME)
# Removes FILENAME, trying harder than a plain unlink: it does nothing when
# file_exists() says the file isn't there; otherwise it unlinks, and if that
# fails, sets the mode to 0777 and tries once more.  A failure of the second
# attempt is reported through Carp::my_carp (with $!) and is not fatal.
8873 sub force_unlink ($) {
8874 my $filename = shift;
8875 return unless file_exists($filename);
8876 return if CORE::unlink($filename);
8878 # We might need write permission
# (the result of chmod is not checked; the retry below reports any failure)
8879 chmod 0777, $filename;
8880 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8885 # Given a filename and references to arrays of lines, write the lines of
8886 # each array to the file
8887 # Filename can be given as an arrayref of directory names
# At least three arguments are required (checked only under DEBUG).  Besides
# the file name there is a flag saying whether the ':utf8' layer is to be put
# on the output handle; every argument after that is taken to be a reference
# to an array of lines, and the arrays are written in the order given.
8889 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8892 my $use_utf8 = shift;
8894 # Get into a single string if an array, and get rid of, in Unix terms, any
8896 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8897 $file = File::Spec->canonpath($file);
8899 # If has directories, make sure that they all exist
8900 (undef, my $directories, undef) = File::Spec->splitpath($file);
8901 File::Path::mkpath($directories) if $directories && ! -d $directories;
# Record the name in @files_actually_output; this happens before the file is
# opened, so the name is recorded even if the open below fails.
8903 push @files_actually_output, $file;
# Get rid of any existing file of this name before creating the new one.
8905 force_unlink ($file);
8908 if (not open $OUT, ">", $file) {
8909 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8913 binmode $OUT, ":utf8" if $use_utf8;
# Each remaining argument is a reference to an array of lines.  An empty
# array only draws a warning.  A failed print or close is fatal.
8915 while (defined (my $lines_ref = shift)) {
8916 unless (@$lines_ref) {
8917 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8920 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8922 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8924 print "$file written.\n" if $verbosity >= $VERBOSE;
# Standardize(NAME)
# Puts a name into this program's canonical mixed-case form: runs of interior
# blanks/hyphens/underscores become a single underscore, the character after
# each underscore is upper-cased, and so is the first character (except for
# names that look like the cjk 'kXxx' ones).  An undefined NAME is reported
# through Carp::my_carp_bug.
8930 sub Standardize($) {
8931 # This converts the input name string into a standardized equivalent to
8935 unless (defined $name) {
8936 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8940 # Remove any leading or trailing white space
8944 # Convert interior white space and hyphens into underscores.
# The look-behind requires some character before the run, and the capture
# requires one after it, so only interior runs are converted; the following
# character is kept via $1.
8945 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8947 # Capitalize the letter following an underscore, and convert a sequence of
8948 # multiple underscores to a single one
8949 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8951 # And capitalize the first letter, but not for the special cjk ones.
# (those are recognized as a lower case 'k' immediately followed by an
# upper case ASCII letter)
8952 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
# standardize(NAME)
# Builds on Standardize(): passes NAME through it, returns nothing if that
# yields undef, and then removes the interior underscores from the result.
8956 sub standardize ($) {
8957 # Returns a lower-cased standardized name, without underscores. This form
8958 # is chosen so that it can distinguish between any real versus superficial
8959 # Unicode name differences. It relies on the fact that Unicode doesn't
8960 # have interior underscores, white space, nor dashes in any
8961 # stricter-matched name. It should not be used on Unicode code point
8962 # names (the Name property), as they mostly, but not always follow these
8965 my $name = Standardize(shift);
8966 return if !defined $name;
# Delete each underscore that has a character on both sides of it; a leading
# or trailing underscore is therefore kept.
8968 $name =~ s/ (?<= .) _ (?= . ) //xg;
# utf8_heavy_name(TABLE, ALIAS)
# Builds the lookup name from the table's property and one of the table's
# aliases:
#   * the property part is empty when the property is $perl, otherwise it is
#     the standardize()d property name followed by '=';
#   * for a loosely matched alias the alias name is standardize()d and
#     appended; otherwise the property part and the alias name are joined and
#     the whole thing is lower-cased with lc.
8972 sub utf8_heavy_name ($$) {
8973 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8974 # perhaps this function should be placed somewhere, like Heavy.pl so that
8975 # utf8_heavy can use it directly without duplicating code that can get
8980 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8982 my $property = $table->property;
# From here on $property holds the string prefix, not the property object.
8983 $property = ($property == $perl)
8984 ? "" # 'perl' is never explicitly stated
8985 : standardize($property->name) . '=';
8986 if ($alias->loose_match) {
8987 return $property . standardize($alias->name);
8990 return lc ($property . $alias->name);
# State used by simple_dumper():  $indent_increment is what gets added to the
# indent for each level of nesting (two blanks on a -DDEBUGGING build, nothing
# otherwise); $main::simple_dumper_nesting is the current recursion depth;
# %already_output remembers, by address, the references already dumped so
# that cyclic structures don't cause infinite recursion.
#
# simple_dumper(ITEM [, INDENT]) produces the text dump of ITEM, which may be
# a plain scalar, an array or hash reference, a CODE or GLOB reference, or an
# object that has a 'dump' method.
8998 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
9001 $main::simple_dumper_nesting = 0;
9004 # Like Simple Data::Dumper. Good enough for our needs. We can't use
9005 # the real thing as we have to run under miniperl.
9007 # It is designed so that on input it is at the beginning of a line,
9008 # and the final thing output in any call is a trailing ",\n".
# Indentation is used only on DEBUGGING builds.
9012 $indent = "" if ! $debugging_build || ! defined $indent;
9014 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9016 # nesting level is localized, so that as the call stack pops, it goes
9017 # back to the prior value.
9018 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
# A nesting of 0 means this is an outermost call, so start afresh on what has
# been seen.
9019 undef %already_output if $main::simple_dumper_nesting == 0;
9020 $main::simple_dumper_nesting++;
9021 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9023 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9025 # Determine the indent for recursive calls.
9026 my $next_indent = $indent . $indent_increment;
9031 # Dump of scalar: just output it in quotes if not a number. To do
9032 # so we must escape certain characters, and therefore need to
9033 # operate on a copy to avoid changing the original
9035 $copy = $UNDEF unless defined $copy;
9037 # Quote non-integers (integers also have optional leading '-')
9038 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9040 # Escape apostrophe and backslash
9041 $copy =~ s/ ( ['\\] ) /\\$1/xg;
9044 $output = "$indent$copy,\n";
9048 # Keep track of cycles in the input, and refuse to infinitely loop
# The key is the reference packed with 'J' while overloading is turned off,
# which is how this program gets an object's address.
9049 my $addr = do { no overloading; pack 'J', $item; };
9050 if (defined $already_output{$addr}) {
9051 return "${indent}ALREADY OUTPUT: $item\n";
9053 $already_output{$addr} = $item;
9055 if (ref $item eq 'ARRAY') {
9058 if ($main::simple_dumper_nesting > 1) {
9060 $using_brackets = 1;
9063 $using_brackets = 0;
9066 # If the array is empty, put the closing bracket on the same
9067 # line. Otherwise, recursively add each array element
9073 for (my $i = 0; $i < @$item; $i++) {
9075 # Indent array elements one level
9076 $output .= &simple_dumper($item->[$i], $next_indent);
# Only DEBUGGING builds get the trailing index comment on each element.
9077 next if ! $debugging_build;
9078 $output =~ s/\n$//; # Remove any trailing nl so
9079 $output .= " # [$i]\n"; # as to add a comment giving
9082 $output .= $indent; # Indent closing ']' to orig level
9084 $output .= ']' if $using_brackets;
9087 elsif (ref $item eq 'HASH') {
9092 # No surrounding braces at top level
9094 if ($main::simple_dumper_nesting > 1) {
9097 $body_indent = $next_indent;
9098 $next_indent .= $indent_increment;
9103 $body_indent = $indent;
9107 # Output hashes sorted alphabetically instead of apparently
9108 # random. Use caseless alphabetic sort
9109 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9111 if ($is_first_line) {
9115 $output .= "$body_indent";
9118 # The key must be a scalar, but this recursive call quotes
9120 $output .= &simple_dumper($key);
9122 # And change the trailing comma and nl to the hash fat
9123 # comma for clarity, and so the value can be on the same
9125 $output =~ s/,\n$/ => /;
9127 # Recursively call to get the value's dump.
9128 my $next = &simple_dumper($item->{$key}, $next_indent);
9130 # If the value is all on one line, remove its indent, so
9131 # will follow the => immediately. If it takes more than
9132 # one line, start it on a new line.
# (the pattern matches only when there are at least two newlines)
9133 if ($next !~ /\n.*\n/) {
9142 $output .= "$indent},\n" if $using_braces;
9144 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
# For these just the type name is output, not the contents.
9145 $output = $indent . ref($item) . "\n";
9146 # XXX see if blessed
9148 elsif ($item->can('dump')) {
9150 # By convention in this program, objects furnish a 'dump'
9151 # method. Since not doing any output at this level, just pass
9152 # on the input indent
9153 $output = $item->dump($indent);
9156 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
# dump_inside_out(OBJECT, FIELDS_REF)
# FIELDS_REF is a reference to a hash whose keys are field names and whose
# values are the per-field inside-out hashes, each indexed by object address.
# An ordinary hash of 'field name => this object's value' is built from it
# and handed to simple_dumper(), whose result is returned.
9163 sub dump_inside_out {
9164 # Dump inside-out hashes in an object's state by converting them to a
9165 # regular hash and then calling simple_dumper on that.
9168 my $fields_ref = shift;
9169 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# The object's address is the index into each of the fields' hashes.
9171 my $addr = do { no overloading; pack 'J', $object; };
9174 foreach my $key (keys %$fields_ref) {
9175 $hash{$key} = $fields_ref->{$key}{$addr};
# Any arguments still in @_ are passed along to simple_dumper.
9178 return simple_dumper(\%hash, @_);
9182 # Overloaded '.' method that is common to all packages. It uses the
9183 # package's stringify method.
# The third argument is the overload 'reversed' flag.
9187 my $reversed = shift;
9188 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# An undefined operand is treated as the empty string.
9190 $other = "" unless defined $other;
# Turn each operand that is a reference into a string, in place:  by its
# _operator_stringify method if it has one; otherwise as its class name
# followed by its address in parentheses.
9192 foreach my $which (\$self, \$other) {
9193 next unless ref $$which;
9194 if ($$which->can('_operator_stringify')) {
9195 $$which = $$which->_operator_stringify;
9198 my $ref = ref $$which;
9199 my $addr = do { no overloading; pack 'J', $$which; };
9200 $$which = "$ref ($addr)";
# Both operands are used in stringified form; an undefined other operand
# counts as the empty string.
9208 sub _operator_dot_equal {
9209 # Overloaded '.=' method that is common to all packages.
9213 my $reversed = shift;
9214 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9216 $other = "" unless defined $other;
# Two outcomes follow:  either the stringified $self is appended to $other,
# or the two stringified operands are joined with $self first.
# NOTE(review): the test choosing between them is presumably on $reversed --
# confirm.
9219 return $other .= "$self";
9222 return "$self" . "$other";
9226 sub _operator_equal {
9227 # Generic overloaded '==' routine. To be equal, they must be the exact
# Something undefined, or that isn't a reference, can't be equal to an
# object.
9233 return 0 unless defined $other;
9234 return 0 unless ref $other;
# NOTE(review): this comparison has to be the built-in numeric '==' on the
# two references.  If overloading were in effect here, it would re-enter this
# routine without end -- confirm that 'no overloading' is in scope.
9236 return $self == $other;
# Generic overloaded '!=' routine:  simply the logical negation of what
# _operator_equal() returns for the same two operands.
9239 sub _operator_not_equal {
9243 return ! _operator_equal($self, $other);
# process_PropertyAliases(FILE)
# FILE is the input-file object for PropertyAliases.txt.  Each line read from
# it creates a Property object named by field 0, with field 1 as its
# Full_Name, and any further fields added as aliases.
9246 sub process_PropertyAliases($) {
9247 # This reads in the PropertyAliases.txt file, which contains almost all
9248 # the character properties in Unicode and their equivalent aliases:
9249 # scf ; Simple_Case_Folding ; sfc
9251 # Field 0 is the preferred short name for the property.
9252 # Field 1 is the full name.
9253 # Any succeeding ones are other accepted names.
9256 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9258 # This whole file was non-existent in early releases, so use our own
9260 $file->insert_lines(get_old_property_aliases())
9261 if ! -e 'PropertyAliases.txt';
9263 # Add any cjk properties that may have been defined.
9264 $file->insert_lines(@cjk_properties);
9266 while ($file->next_line) {
# The fields are separated by semi-colons, with optional surrounding white
# space.  The split operates on $_ (presumably set to the current line by
# next_line -- confirm).
9268 my @data = split /\s*;\s*/;
9270 my $full = $data[1];
9272 my $this = Property->new($data[0], Full_Name => $full);
9274 # Start looking for more aliases after these two.
9275 for my $i (2 .. @data - 1) {
9276 $this->add_alias($data[$i]);
# Make sure that Simple_Case_Folding has both 'scf' and 'sfc' as aliases.
9281 my $scf = property_ref("Simple_Case_Folding");
9282 $scf->add_alias("scf");
9283 $scf->add_alias("sfc");
# Takes no arguments.  Does the set-up that can't be done until the property
# objects exist:  creates properties that some releases lack, sets the
# globals $gc, $block, and $script, overrides some output file names, and
# installs the default mappings, including the release-dependent ones for
# Bidi_Class, Joining_Type, and Line_Break.
#
# Note on the version tests below:  a bare numeric literal containing two or
# more dots, such as 3.1.0, is a v-string in Perl, so it compares against
# $v_version the same way v3.1.0 would.
9288 sub finish_property_setup {
9289 # Finishes setting up after PropertyAliases.
9292 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9294 # This entry was missing from this file in earlier Unicode versions
9295 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9296 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9299 # These two properties must be defined in all releases so we can generate
9300 # the tables from them to make regex \X work, but suppress their output so
9301 # aren't application visible prior to releases where they should be
9302 if (! defined property_ref('GCB')) {
9303 Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9304 Fate => $PLACEHOLDER);
9306 if (! defined property_ref('hst')) {
9307 Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9308 Fate => $PLACEHOLDER);
9311 # These are used so much, that we set globals for them.
9312 $gc = property_ref('General_Category');
9313 $block = property_ref('Block');
9314 $script = property_ref('Script');
9316 # Perl adds this alias.
9317 $gc->add_alias('Category');
9319 # Unicode::Normalize expects this file with this name and directory.
9320 my $ccc = property_ref('Canonical_Combining_Class');
9322 $ccc->set_file('CombiningClass');
9323 $ccc->set_directory(File::Spec->curdir());
9326 # These two properties aren't actually used in the core, but unfortunately
9327 # the names just above that are in the core interfere with these, so
9328 # choose different names. These aren't a problem unless the map tables
9329 # for these files get written out.
9330 my $lowercase = property_ref('Lowercase');
9331 $lowercase->set_file('IsLower') if defined $lowercase;
9332 my $uppercase = property_ref('Uppercase');
9333 $uppercase->set_file('IsUpper') if defined $uppercase;
9335 # Set up the hard-coded default mappings, but only on properties defined
9337 foreach my $property (keys %default_mapping) {
9338 my $property_object = property_ref($property);
9339 next if ! defined $property_object;
9340 my $default_map = $default_mapping{$property};
9341 $property_object->set_default_map($default_map);
9343 # A map of <code point> implies the property is string.
9344 if ($property_object->type == $UNKNOWN
9345 && $default_map eq $CODE_POINT)
9347 $property_object->set_type($STRING);
9351 # The following use the Multi_Default class to create objects for
9354 # Bidi class has a complicated default, but the derived file takes care of
9355 # the complications, leaving just 'L'.
9356 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9357 property_ref('Bidi_Class')->set_default_map('L');
9362 # The derived file was introduced in 3.1.1. The values below are
9363 # taken from table 3-8, TUS 3.0
9365 'my $default = Range_List->new;
9366 $default->add_range(0x0590, 0x05FF);
9367 $default->add_range(0xFB1D, 0xFB4F);'
9370 # The defaults apply only to unassigned characters
9371 $default_R .= '$gc->table("Unassigned") & $default;';
9373 if ($v_version lt v3.0.0) {
9374 $default = Multi_Default->new(R => $default_R, 'L');
9378 # AL apparently not introduced until 3.0: TUS 2.x references are
9379 # not on-line to check it out
9381 'my $default = Range_List->new;
9382 $default->add_range(0x0600, 0x07BF);
9383 $default->add_range(0xFB50, 0xFDFF);
9384 $default->add_range(0xFE70, 0xFEFF);'
9387 # Non-character code points introduced in this release; aren't AL
9388 if ($v_version ge 3.1.0) {
9389 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9391 $default_AL .= '$gc->table("Unassigned") & $default';
9392 $default = Multi_Default->new(AL => $default_AL,
9396 property_ref('Bidi_Class')->set_default_map($default);
9399 # Joining type has a complicated default, but the derived file takes care
9400 # of the complications, leaving just 'U' (or Non_Joining), except the file
9402 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9403 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9404 property_ref('Joining_Type')->set_default_map('Non_Joining');
9408 # Otherwise, there are not one, but two possibilities for the
9409 # missing defaults: T and U.
9410 # The missing defaults that evaluate to T are given by:
9411 # T = Mn + Cf - ZWNJ - ZWJ
9412 # where Mn and Cf are the general category values. In other words,
9413 # any non-spacing mark or any format control character, except
9414 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9415 # WIDTH JOINER (joining type C).
9416 my $default = Multi_Default->new(
9417 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9419 property_ref('Joining_Type')->set_default_map($default);
9423 # Line break has a complicated default in early releases. It is 'Unknown'
9424 # for non-assigned code points; 'AL' for assigned.
9425 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9426 my $lb = property_ref('Line_Break');
9427 if ($v_version gt 3.2.0) {
9428 $lb->set_default_map('Unknown');
9431 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9433 $lb->set_default_map($default);
9436 # If has the URS property, make sure that the standard aliases are in
9437 # it, since not in the input tables in some versions.
9438 my $urs = property_ref('Unicode_Radical_Stroke');
9440 $urs->add_alias('cjkRSUnicode');
9441 $urs->add_alias('kRSUnicode');
9445 # For backwards compatibility with applications that may read the mapping
9446 # file directly (it was documented in 5.12 and 5.14 as being thusly
9447 # usable), keep it from being adjusted. (range_size_1 is
9448 # used to force the traditional format.)
9449 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9450 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9451 $nfkc_cf->set_range_size_1(1);
9453 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9454 $bmg->set_to_output_map($EXTERNAL_MAP);
9455 $bmg->set_range_size_1(1);
# Unlike the two above, this one's map is output in the adjusted form.
9458 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
# Takes no arguments.  Accumulates in @return lines of the form
# 'short ; Full_Name', as they would appear in PropertyAliases.txt.  Which
# lines are included depends on which Unicode data files are present in the
# current directory (tested with -e) and on $v_version.
9463 sub get_old_property_aliases() {
9464 # Returns what would be in PropertyAliases.txt if it existed in very old
9465 # versions of Unicode. It was derived from the one in 3.2, and pared
9466 # down based on the data that was actually in the older releases.
9467 # An attempt was made to use the existence of files to mean inclusion or
9468 # not of various aliases, but if this was not sufficient, using version
9469 # numbers was resorted to.
9473 # These are to be used in all versions (though some are constructed by
9474 # this program if missing)
9475 push @return, split /\n/, <<'END';
9477 Bidi_M ; Bidi_Mirrored
9479 ccc ; Canonical_Combining_Class
9480 dm ; Decomposition_Mapping
9481 dt ; Decomposition_Type
9482 gc ; General_Category
9484 lc ; Lowercase_Mapping
9486 na1 ; Unicode_1_Name
9489 scf ; Simple_Case_Folding
9490 slc ; Simple_Lowercase_Mapping
9491 stc ; Simple_Titlecase_Mapping
9492 suc ; Simple_Uppercase_Mapping
9493 tc ; Titlecase_Mapping
9494 uc ; Uppercase_Mapping
9497 if (-e 'Blocks.txt') {
9498 push @return, "blk ; Block\n";
9500 if (-e 'ArabicShaping.txt') {
9501 push @return, split /\n/, <<'END';
9506 if (-e 'PropList.txt') {
9508 # This first set is in the original old-style proplist.
9509 push @return, split /\n/, <<'END';
9510 Bidi_C ; Bidi_Control
9518 Join_C ; Join_Control
9520 QMark ; Quotation_Mark
9521 Term ; Terminal_Punctuation
9522 WSpace ; White_Space
9524 # The next sets were added later
9525 if ($v_version ge v3.0.0) {
9526 push @return, split /\n/, <<'END';
9531 if ($v_version ge v3.0.1) {
9532 push @return, split /\n/, <<'END';
9533 NChar ; Noncharacter_Code_Point
9536 # The next sets were added in the new-style
9537 if ($v_version ge v3.1.0) {
9538 push @return, split /\n/, <<'END';
9539 OAlpha ; Other_Alphabetic
9540 OLower ; Other_Lowercase
9542 OUpper ; Other_Uppercase
9545 if ($v_version ge v3.1.1) {
9546 push @return, "AHex ; ASCII_Hex_Digit\n";
9549 if (-e 'EastAsianWidth.txt') {
9550 push @return, "ea ; East_Asian_Width\n";
9552 if (-e 'CompositionExclusions.txt') {
9553 push @return, "CE ; Composition_Exclusion\n";
9555 if (-e 'LineBreak.txt') {
9556 push @return, "lb ; Line_Break\n";
9558 if (-e 'BidiMirroring.txt') {
9559 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
9561 if (-e 'Scripts.txt') {
9562 push @return, "sc ; Script\n";
9564 if (-e 'DNormalizationProps.txt') {
9565 push @return, split /\n/, <<'END';
9566 Comp_Ex ; Full_Composition_Exclusion
9567 FC_NFKC ; FC_NFKC_Closure
9568 NFC_QC ; NFC_Quick_Check
9569 NFD_QC ; NFD_Quick_Check
9570 NFKC_QC ; NFKC_Quick_Check
9571 NFKD_QC ; NFKD_Quick_Check
9572 XO_NFC ; Expands_On_NFC
9573 XO_NFD ; Expands_On_NFD
9574 XO_NFKC ; Expands_On_NFKC
9575 XO_NFKD ; Expands_On_NFKD
9578 if (-e 'DCoreProperties.txt') {
9579 push @return, split /\n/, <<'END';
9585 # These can also appear in some versions of PropList.txt
9586 push @return, "Lower ; Lowercase\n"
9587 unless grep { $_ =~ /^Lower\b/} @return;
9588 push @return, "Upper ; Uppercase\n"
9589 unless grep { $_ =~ /^Upper\b/} @return;
9592 # This flag requires the DAge.txt file to be copied into the directory.
9593 if (DEBUG && $compare_versions) {
9594 push @return, 'age ; Age';
# Operates on $file, the input-file object for PropValueAliases.txt.  Lines
# are first inserted for releases in which the file, or some of its entries,
# didn't exist.  Then each line read creates a match table in the property
# named in field 0, with the remaining fields as the table's names and
# aliases.  Finally, the default map of each property that has a @missing
# entry is set from what $file->get_missings returns.
9600 sub process_PropValueAliases {
9601 # This file contains values that properties look like:
9602 # bc ; AL ; Arabic_Letter
9603 # blk; n/a ; Greek_And_Coptic ; Greek
9605 # Field 0 is the property.
9606 # Field 1 is the short name of a property value or 'n/a' if no
9607 # short name exists;
9608 # Field 2 is the full property value name;
9609 # Any other fields are more synonyms for the property value.
9610 # Purely numeric property values are omitted from the file; as are some
9611 # others, fewer and fewer in later releases
9613 # Entries for the ccc property have an extra field before the
9615 # ccc; 0; NR ; Not_Reordered
9616 # It is the numeric value that the names are synonyms for.
9618 # There are comment entries for values missing from this file:
9619 # # @missing: 0000..10FFFF; ISO_Comment; <none>
9620 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9623 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9625 # This whole file was non-existent in early releases, so use our own
9626 # internal one if necessary.
9627 if (! -e 'PropValueAliases.txt') {
9628 $file->insert_lines(get_old_property_value_aliases());
9631 if ($v_version lt 4.0.0) {
9632 $file->insert_lines(split /\n/, <<'END'
9633 hst; L ; Leading_Jamo
9634 hst; LV ; LV_Syllable
9635 hst; LVT ; LVT_Syllable
9636 hst; NA ; Not_Applicable
9637 hst; T ; Trailing_Jamo
9642 if ($v_version lt 4.1.0) {
9643 $file->insert_lines(split /\n/, <<'END'
9659 # Add any explicit cjk values
9660 $file->insert_lines(@cjk_property_values);
9662 # This line is used only for testing the code that checks for name
9663 # conflicts. There is a script Inherited, and when this line is executed
9664 # it causes there to be a name conflict with the 'Inherited' that this
9665 # program generates for this block property value
9666 #$file->insert_lines('blk; n/a; Herited');
9669 # Process each line of the file ...
9670 while ($file->next_line) {
9672 # Fix typo in input file
9673 s/CCC133/CCC132/g if $v_version eq v6.1.0;
9675 my ($property, @data) = split /\s*;\s*/;
9677 # The ccc property has an extra field at the beginning, which is the
9678 # numeric value. Move it to be after the other two, mnemonic, fields,
9679 # so that those will be used as the property value's names, and the
9680 # number will be an extra alias. (Rightmost splice removes field 1-2,
9681 # returning them in a slice; left splice inserts that before anything,
9682 # thus shifting the former field 0 to after them.)
9683 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9685 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
9686 # there is no short name, use the full one in element 1
9687 if ($data[0] eq "n/a") {
9688 $data[0] = $data[1];
9690 elsif ($data[0] ne $data[1]
9691 && standardize($data[0]) eq standardize($data[1])
9692 && $data[1] !~ /[[:upper:]]/)
9694 # Also, there is a bug in the file in which "n/a" is omitted, and
9695 # the two fields are identical except for case, and the full name
9696 # is all lower case. Copy the "short" name onto the full one to
9697 # give it some upper case.
9699 $data[1] = $data[0];
9702 # Earlier releases had the pseudo property 'qc' that should expand to
9703 # the ones that replace it below.
9704 if ($property eq 'qc') {
9705 if (lc $data[0] eq 'y') {
9706 $file->insert_lines('NFC_QC; Y ; Yes',
9712 elsif (lc $data[0] eq 'n') {
9713 $file->insert_lines('NFC_QC; N ; No',
9719 elsif (lc $data[0] eq 'm') {
9720 $file->insert_lines('NFC_QC; M ; Maybe',
9721 'NFKC_QC; M ; Maybe',
# NOTE(review): the message below opens an apostrophe before $data[0] but
# never closes it.
9725 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9730 # The first field is the short name, 2nd is the full one.
9731 my $property_object = property_ref($property);
9732 my $table = $property_object->add_match_table($data[0],
9733 Full_Name => $data[1]);
9735 # Start looking for more aliases after these two.
9736 for my $i (2 .. @data - 1) {
9737 $table->add_alias($data[$i]);
9739 } # End of looping through the file
9741 # As noted in the comments early in the program, it generates tables for
9742 # the default values for all releases, even those for which the concept
9743 # didn't exist at the time. Here we add those if missing.
9744 my $age = property_ref('age');
9745 if (defined $age && ! defined $age->table('Unassigned')) {
9746 $age->add_match_table('Unassigned');
9748 $block->add_match_table('No_Block') if -e 'Blocks.txt'
9749 && ! defined $block->table('No_Block');
9752 # Now set the default mappings of the properties from the file. This is
9753 # done after the loop because a number of properties have only @missings
9754 # entries in the file, and may not show up until the end.
9755 my @defaults = $file->get_missings;
# Each entry is an array reference:  element 0 is the default value, and
# element 1 is the name of the property it applies to.
9756 foreach my $default_ref (@defaults) {
9757 my $default = $default_ref->[0];
9758 my $property = property_ref($default_ref->[1]);
9759 $property->set_default_map($default);
9764 sub get_old_property_value_aliases () {
9765 # Returns what would be in PropValueAliases.txt if it existed in very old
9766 # versions of Unicode. It was derived from the one in 3.2, and pared
9767 # down. An attempt was made to use the existence of files to mean
9768 # inclusion or not of various aliases, but if this was not sufficient,
9769 # using version numbers was resorted to.
9771 my @return = split /\n/, <<'END';
9772 bc ; AN ; Arabic_Number
9773 bc ; B ; Paragraph_Separator
9774 bc ; CS ; Common_Separator
9775 bc ; EN ; European_Number
9776 bc ; ES ; European_Separator
9777 bc ; ET ; European_Terminator
9778 bc ; L ; Left_To_Right
9779 bc ; ON ; Other_Neutral
9780 bc ; R ; Right_To_Left
9781 bc ; WS ; White_Space
9783 Bidi_M; N; No; F; False
9784 Bidi_M; Y; Yes; T; True
9786 # The standard combining classes are very much different in v1, so only use
9787 # ones that look right (not checked thoroughly)
9788 ccc; 0; NR ; Not_Reordered
9789 ccc; 1; OV ; Overlay
9791 ccc; 8; KV ; Kana_Voicing
9793 ccc; 202; ATBL ; Attached_Below_Left
9794 ccc; 216; ATAR ; Attached_Above_Right
9795 ccc; 218; BL ; Below_Left
9797 ccc; 222; BR ; Below_Right
9799 ccc; 228; AL ; Above_Left
9801 ccc; 232; AR ; Above_Right
9802 ccc; 234; DA ; Double_Above
9804 dt ; can ; canonical
9818 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9820 gc ; Cn ; Unassigned
9821 gc ; Co ; Private_Use
9822 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9823 gc ; LC ; Cased_Letter # Ll | Lt | Lu
9824 gc ; Ll ; Lowercase_Letter
9825 gc ; Lm ; Modifier_Letter
9826 gc ; Lo ; Other_Letter
9827 gc ; Lu ; Uppercase_Letter
9828 gc ; M ; Mark # Mc | Me | Mn
9829 gc ; Mc ; Spacing_Mark
9830 gc ; Mn ; Nonspacing_Mark
9831 gc ; N ; Number # Nd | Nl | No
9832 gc ; Nd ; Decimal_Number
9833 gc ; No ; Other_Number
9834 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9835 gc ; Pd ; Dash_Punctuation
9836 gc ; Pe ; Close_Punctuation
9837 gc ; Po ; Other_Punctuation
9838 gc ; Ps ; Open_Punctuation
9839 gc ; S ; Symbol # Sc | Sk | Sm | So
9840 gc ; Sc ; Currency_Symbol
9841 gc ; Sm ; Math_Symbol
9842 gc ; So ; Other_Symbol
9843 gc ; Z ; Separator # Zl | Zp | Zs
9844 gc ; Zl ; Line_Separator
9845 gc ; Zp ; Paragraph_Separator
9846 gc ; Zs ; Space_Separator
9854 if (-e 'ArabicShaping.txt') {
9855 push @return, split /\n/, <<'END';
9862 jg ; n/a ; NO_JOINING_GROUP
9870 jt ; C ; Join_Causing
9871 jt ; D ; Dual_Joining
9872 jt ; L ; Left_Joining
9873 jt ; R ; Right_Joining
9874 jt ; U ; Non_Joining
9875 jt ; T ; Transparent
9877 if ($v_version ge v3.0.0) {
9878 push @return, split /\n/, <<'END';
9882 jg ; n/a ; DALATH_RISH
9885 jg ; n/a ; FINAL_SEMKATH
9888 jg ; n/a ; HAMZA_ON_HEH_GOAL
9895 jg ; n/a ; KNOTTED_HEH
9902 jg ; n/a ; REVERSED_PE
9906 jg ; n/a ; SWASH_KAF
9908 jg ; n/a ; TEH_MARBUTA
9911 jg ; n/a ; YEH_BARREE
9912 jg ; n/a ; YEH_WITH_TAIL
9921 if (-e 'EastAsianWidth.txt') {
9922 push @return, split /\n/, <<'END';
9932 if (-e 'LineBreak.txt') {
9933 push @return, split /\n/, <<'END';
9935 lb ; AL ; Alphabetic
9936 lb ; B2 ; Break_Both
9937 lb ; BA ; Break_After
9938 lb ; BB ; Break_Before
9939 lb ; BK ; Mandatory_Break
9940 lb ; CB ; Contingent_Break
9941 lb ; CL ; Close_Punctuation
9942 lb ; CM ; Combining_Mark
9943 lb ; CR ; Carriage_Return
9944 lb ; EX ; Exclamation
9947 lb ; ID ; Ideographic
9948 lb ; IN ; Inseperable
9949 lb ; IS ; Infix_Numeric
9951 lb ; NS ; Nonstarter
9953 lb ; OP ; Open_Punctuation
9954 lb ; PO ; Postfix_Numeric
9955 lb ; PR ; Prefix_Numeric
9957 lb ; SA ; Complex_Context
9960 lb ; SY ; Break_Symbols
9966 if (-e 'DNormalizationProps.txt') {
9967 push @return, split /\n/, <<'END';
9974 if (-e 'Scripts.txt') {
9975 push @return, split /\n/, <<'END';
9977 sc ; Armn ; Armenian
9979 sc ; Bopo ; Bopomofo
9980 sc ; Cans ; Canadian_Aboriginal
9981 sc ; Cher ; Cherokee
9982 sc ; Cyrl ; Cyrillic
9983 sc ; Deva ; Devanagari
9985 sc ; Ethi ; Ethiopic
9986 sc ; Geor ; Georgian
9989 sc ; Gujr ; Gujarati
9990 sc ; Guru ; Gurmukhi
9994 sc ; Hira ; Hiragana
9995 sc ; Ital ; Old_Italic
9996 sc ; Kana ; Katakana
10001 sc ; Mlym ; Malayalam
10002 sc ; Mong ; Mongolian
10003 sc ; Mymr ; Myanmar
10006 sc ; Qaai ; Inherited
10008 sc ; Sinh ; Sinhala
10014 sc ; Tibt ; Tibetan
10020 if ($v_version ge v2.0.0) {
10021 push @return, split /\n/, <<'END';
10025 dt ; vert ; vertical
10029 gc ; Cs ; Surrogate
10030 gc ; Lt ; Titlecase_Letter
10031 gc ; Me ; Enclosing_Mark
10032 gc ; Nl ; Letter_Number
10033 gc ; Pc ; Connector_Punctuation
10034 gc ; Sk ; Modifier_Symbol
10037 if ($v_version ge v2.1.2) {
10038 push @return, "bc ; S ; Segment_Separator\n";
10040 if ($v_version ge v2.1.5) {
10041 push @return, split /\n/, <<'END';
10042 gc ; Pf ; Final_Punctuation
10043 gc ; Pi ; Initial_Punctuation
10046 if ($v_version ge v2.1.8) {
10047 push @return, "ccc; 240; IS ; Iota_Subscript\n";
10050 if ($v_version ge v3.0.0) {
10051 push @return, split /\n/, <<'END';
10052 bc ; AL ; Arabic_Letter
10053 bc ; BN ; Boundary_Neutral
10054 bc ; LRE ; Left_To_Right_Embedding
10055 bc ; LRO ; Left_To_Right_Override
10056 bc ; NSM ; Nonspacing_Mark
10057 bc ; PDF ; Pop_Directional_Format
10058 bc ; RLE ; Right_To_Left_Embedding
10059 bc ; RLO ; Right_To_Left_Override
10061 ccc; 233; DB ; Double_Below
10065 if ($v_version ge v3.1.0) {
10066 push @return, "ccc; 226; R ; Right\n";
10072 sub process_NormalizationsTest {
# Turns each line of the normalization test input into the text of one
# "Test_N(...)" call and appends that text to @normalization_tests.
# NOTE(review): $file is used below but its declaration is not among the
# lines visible here; presumably it is the input-file object passed to
# this routine -- confirm against the complete source.
10074 # Each line looks like:
10075 # source code point; NFC; NFD; NFKC; NFKD
10077 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
10080 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10082 # Process each line of the file ...
10083 while ($file->next_line) {
# With no explicit string argument, split operates on $_; only the first
# five semicolon-separated fields are kept.
10087 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
# Each field is a space-separated list of hex code points. Replace it, in
# place through the reference, with the string made of those code points,
# then double every backslash (presumably so the value survives being
# embedded in the q-quoted literal assembled below).
10089 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10090 $$var = pack "U0U*", map { hex } split " ", $$var;
10091 $$var =~ s/(\\)/$1$1/g;
# Queue the generated call; the five converted fields become its arguments.
10094 push @normalization_tests,
10095 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
10096 } # End of looping through the file
10099 sub output_perl_charnames_line ($$) {
# Formats one entry and returns it as a string; nothing is printed here.
#   $_[0]  rendered with "%05X": upper-case hex, zero-padded to at least
#          five digits
#   $_[1]  rendered with "%s"
# The two are separated by a tab and the result ends with a newline.
# NOTE(review): the ($$) prototype only affects direct, non-& calls; it is
# not applied when the sub is invoked through a code reference.
10101 # Output the entries in Perl_charnames specially, using 5 digits instead
10102 # of four. This makes the entries a constant length, and simplifies
10103 # charnames.pm which this table is for. Unicode can have 6 digit
10104 # ordinals, but they are all private use or noncharacters which do not
10105 # have names, so won't be in this table.
10107 return sprintf "%05X\t%s\n", $_[0], $_[1];
10111 # This is used to store the range list of all the code points usable when
10112 # the little used $compare_versions feature is enabled.
# It starts out undefined and is filled in by process_generic_property_file()
# the first time that routine needs it.
10113 my $compare_versions_range_list;
10115 # These are constants to the $property_info hash in this subroutine, to
10116 # avoid using a quoted-string which might have a typo.
# They are used as the second-level keys of %property_info in
# process_generic_property_file(), i.e. $property_info{$property_addr}{KEY}.
# Key under which a property's default map is cached.
10118 my $DEFAULT_MAP = 'default_map';
# Key under which the table corresponding to that default map is cached.
10119 my $DEFAULT_TABLE = 'default_table';
# Key under which the value of the property's pseudo_map_type() is cached.
10120 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
# Key under which the default taken from an @missings line is stored.
10121 my $MISSINGS = 'missings';
10123 sub process_generic_property_file {
10124 # This processes a file containing property mappings and puts them
10125 # into internal map tables. It should be used to handle any property
10126 # files that have mappings from a code point or range thereof to
10127 # something else. This means almost all the UCD .txt files.
10128 # each_line_handlers() should be set to adjust the lines of these
10129 # files, if necessary, to what this routine understands:
10132 # 003C..003E ; Math
10134 # the fields are: "codepoint-range ; property; map"
10136 # meaning the codepoints in the range all have the value 'map' under
10138 # Beginning and trailing white space in each field are not significant.
10139 # Note there is not a trailing semi-colon in the above. A trailing
10140 # semi-colon means the map is a null-string. An omitted map, as
10141 # opposed to a null-string, is assumed to be 'Y', based on Unicode
10142 # table syntax. (This could have been hidden from this routine by
10143 # doing it in the $file object, but that would require parsing of the
10144 # line there, so would have to parse it twice, or change the interface
10145 # to pass this an array. So not done.)
10147 # The map field may begin with a sequence of commands that apply to
10148 # this range. Each such command begins and ends with $CMD_DELIM.
10149 # These are used to indicate, for example, that the mapping for a
10150 # range has a non-default type.
10152 # This loops through the file, calling its next_line() method, and
10153 # then taking the map and adding it to the property's table.
10154 # Complications arise because any number of properties can be in the
10155 # file, in any order, interspersed in any way. The first time a
10156 # property is seen, it gets information about that property and
10157 # caches it for quick retrieval later. It also normalizes the maps
10158 # so that only one of many synonyms is stored. The Unicode input
10159 # files do use some multiple synonyms.
# NOTE(review): $file, on which next_line(), carp_bad_line(),
# insert_adjusted_lines(), get_missings() and has_missings_defaults() are
# called below, is not declared in the lines visible here; presumably it is
# the input-file object passed as this routine's argument -- confirm.
10162 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10164 my %property_info; # To keep track of what properties
10165 # have already had entries in the
10166 # current file, and info about each,
10167 # so don't have to recompute.
10168 my $property_name; # property currently being worked on
10169 my $property_type; # and its type
10170 my $previous_property_name = ""; # name from last time through loop
10171 my $property_object; # pointer to the current property's
10173 my $property_addr; # the address of that object
10174 my $default_map; # the string that code points missing
10175 # from the file map to
10176 my $default_table; # For non-string properties, a
10177 # reference to the match table that
10178 # will contain the list of code
10179 # points that map to $default_map.
10181 # Get the next real non-comment line
10183 while ($file->next_line) {
10185 # Default replacement type; means that if parts of the range have
10186 # already been stored in our tables, the new map overrides them if
10187 # they differ more than cosmetically
10188 my $replace = $IF_NOT_EQUIVALENT;
10189 my $map_type; # Default type for the map of this range
10191 #local $to_trace = 1 if main::DEBUG;
10192 trace $_ if main::DEBUG && $to_trace;
10194 # Split the line into components
# NOTE(review): the 'my' below creates a new $property_name scoped to the
# loop body, masking the $property_name declared before the loop; that outer
# variable is never assigned in the lines visible here.
10195 my ($range, $property_name, $map, @remainder)
10196 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10198 # If more or less on the line than we are expecting, warn and skip
10201 $file->carp_bad_line('Extra fields');
10204 elsif ( ! defined $property_name) {
10205 $file->carp_bad_line('Missing property');
10209 # Examine the range.
10210 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10212 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
# $2 is only defined when the 'CP1..CP2' form matched; otherwise the range
# is a single code point and the high end equals the low end.
10216 my $high = (defined $2) ? hex $2 : $low;
10218 # For the very specialized case of comparing two Unicode
10220 if (DEBUG && $compare_versions) {
10221 if ($property_name eq 'Age') {
10223 # Only allow code points at least as old as the version
10225 my $age = pack "C*", split(/\./, $map); # v string
10226 next LINE if $age gt $compare_versions;
10230 # Again, we throw out code points younger than those of
10231 # the specified version. By now, the Age property is
10232 # populated. We use the intersection of each input range
10233 # with this property to find what code points in it are
10234 # valid. To do the intersection, we have to convert the
10235 # Age property map to a Range_list. We only have to do
10237 if (! defined $compare_versions_range_list) {
10238 my $age = property_ref('Age');
10239 if (! -e 'DAge.txt') {
10240 croak "Need to have 'DAge.txt' file to do version comparison";
10242 elsif ($age->count == 0) {
10243 croak "The 'Age' table is empty, but its file exists";
10245 $compare_versions_range_list
10246 = Range_List->new(Initialize => $age);
10249 # An undefined map is always 'Y'
10250 $map = 'Y' if ! defined $map;
10252 # Calculate the intersection of the input range with the
10253 # code points that are known in the specified version
10254 my @ranges = ($compare_versions_range_list
10255 & Range->new($low, $high))->ranges;
10257 # If the intersection is empty, throw away this range
10258 next LINE unless @ranges;
10260 # Only examine the first range this time through the loop.
10261 my $this_range = shift @ranges;
10263 # Put any remaining ranges in the queue to be processed
10264 # later. Note that there is unnecessary work here, as we
10265 # will do the intersection again for each of these ranges
10266 # during some future iteration of the LINE loop, but this
10267 # code is not used in production. The later intersections
10268 # are guaranteed to not splinter, so this will not become
10269 # an infinite loop.
10270 my $line = join ';', $property_name, $map;
10271 foreach my $range (@ranges) {
10272 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
10278 # And process the first range, like any other.
10279 $low = $this_range->start;
10280 $high = $this_range->end;
10282 } # End of $compare_versions
10284 # If changing to a new property, get the things constant per
10286 if ($previous_property_name ne $property_name) {
10288 $property_object = property_ref($property_name);
10289 if (! defined $property_object) {
10290 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
# The packed object address is what keys %property_info; 'no overloading'
# keeps any overloaded operators on the object from being invoked by pack.
10293 { no overloading; $property_addr = pack 'J', $property_object; }
10295 # Defer changing names until have a line that is acceptable
10296 # (reaching the 'next' statement above means the line is unacceptable)
10297 $previous_property_name = $property_name;
10299 # If not the first time for this property, retrieve info about
10300 # it from the cache
10301 if (defined ($property_info{$property_addr}{$TYPE})) {
10302 $property_type = $property_info{$property_addr}{$TYPE};
10303 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10305 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10307 = $property_info{$property_addr}{$DEFAULT_TABLE};
10311 # Here, is the first time for this property. Set up the
10313 $property_type = $property_info{$property_addr}{$TYPE}
10314 = $property_object->type;
10316 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10317 = $property_object->pseudo_map_type;
10319 # The Unicode files are set up so that if the map is not
10320 # defined, it is a binary property
10321 if (! defined $map && $property_type != $BINARY) {
10322 if ($property_type != $UNKNOWN
10323 && $property_type != $NON_STRING)
10325 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
10328 $property_object->set_type($BINARY);
10330 = $property_info{$property_addr}{$TYPE}
10335 # Get any @missings default for this property. This
10336 # should precede the first entry for the property in the
10337 # input file, and is located in a comment that has been
10338 # stored by the Input_file class until we access it here.
10339 # It's possible that there is more than one such line
10340 # waiting for us; collect them all, and parse
# NOTE(review): 'my ... if COND' is a declaration with a statement modifier,
# whose behavior perlsyn documents as undefined. It appears to rely on
# @missings_list being empty when the condition is false; declaring the
# array first and assigning conditionally would make that explicit.
10341 my @missings_list = $file->get_missings
10342 if $file->has_missings_defaults;
10343 foreach my $default_ref (@missings_list) {
10344 my $default = $default_ref->[0];
10345 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10347 # For string properties, the default is just what the
10348 # file says, but non-string properties should already
10349 # have set up a table for the default property value;
10350 # use the table for these, so can resolve synonyms
10351 # later to a single standard one.
# NOTE(review): $addr comes from the property named in the @missings entry,
# but the test and the table lookup below use $property_type and
# $property_object of the property on the current input line. These agree
# only if the entry is for that same property -- confirm that is intended.
10352 if ($property_type == $STRING
10353 || $property_type == $UNKNOWN)
10355 $property_info{$addr}{$MISSINGS} = $default;
10358 $property_info{$addr}{$MISSINGS}
10359 = $property_object->table($default);
10363 # Finished storing all the @missings defaults in the input
10364 # file so far. Get the one for the current property.
10365 my $missings = $property_info{$property_addr}{$MISSINGS};
10367 # But we likely have separately stored what the default
10368 # should be. (This is to accommodate versions of the
10369 # standard where the @missings lines are absent or
10370 # incomplete.) Hopefully the two will match. But check
10372 $default_map = $property_object->default_map;
10374 # If the map is a ref, it means that the default won't be
10375 # processed until later, so undef it, so next few lines
10376 # will redefine it to something that nothing will match
10377 undef $default_map if ref $default_map;
10379 # Create a $default_map if don't have one; maybe a dummy
10380 # that won't match anything.
10381 if (! defined $default_map) {
10383 # Use any @missings line in the file.
10384 if (defined $missings) {
10385 if (ref $missings) {
10386 $default_map = $missings->full_name;
10387 $default_table = $missings;
10390 $default_map = $missings;
10393 # And store it with the property for outside use.
10394 $property_object->set_default_map($default_map);
10398 # Neither an @missings nor a default map. Create
10399 # a dummy one, so won't have to test definedness
10400 # in the main loop.
10401 $default_map = '_Perl This will never be in a file
10406 # Here, we have $default_map defined, possibly in terms of
10407 # $missings, but maybe not, and possibly is a dummy one.
10408 if (defined $missings) {
10410 # Make sure there is no conflict between the two.
10411 # $missings has priority.
10412 if (ref $missings) {
10414 = $property_object->table($default_map);
10415 if (! defined $default_table
10416 || $default_table != $missings)
10418 if (! defined $default_table) {
10419 $default_table = $UNDEF;
10421 $file->carp_bad_line(<<END
10422 The \@missings line for $property_name in $file says that missings default to
10423 $missings, but we expect it to be $default_table. $missings used.
10426 $default_table = $missings;
10427 $default_map = $missings->full_name;
10429 $property_info{$property_addr}{$DEFAULT_TABLE}
10432 elsif ($default_map ne $missings) {
10433 $file->carp_bad_line(<<END
10434 The \@missings line for $property_name in $file says that missings default to
10435 $missings, but we expect it to be $default_map. $missings used.
10438 $default_map = $missings;
10442 $property_info{$property_addr}{$DEFAULT_MAP}
10445 # If haven't done so already, find the table corresponding
10446 # to this map for non-string properties.
10447 if (! defined $default_table
10448 && $property_type != $STRING
10449 && $property_type != $UNKNOWN)
10451 $default_table = $property_info{$property_addr}
10453 = $property_object->table($default_map);
10455 } # End of is first time for this property
10456 } # End of switching properties.
10458 # Ready to process the line.
10459 # The Unicode files are set up so that if the map is not defined,
10460 # it is a binary property with value 'Y'
10461 if (! defined $map) {
10466 # If the map begins with a special command to us (enclosed in
10467 # delimiters), extract the command(s).
10468 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10470 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
10473 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
10477 $file->carp_bad_line("Unknown command line: '$1'");
10483 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10486 # Here, we have a map to a particular code point, and the
10487 # default map is to a code point itself. If the range
10488 # includes the particular code point, change that portion of
10489 # the range to the default. This makes sure that in the final
10490 # table only the non-defaults are listed.
10491 my $decimal_map = hex $map;
10492 if ($low <= $decimal_map && $decimal_map <= $high) {
10494 # If the range includes stuff before or after the map
10495 # we're changing, split it and process the split-off parts
10497 if ($low < $decimal_map) {
10498 $file->insert_adjusted_lines(
10499 sprintf("%04X..%04X; %s; %s",
10505 if ($high > $decimal_map) {
10506 $file->insert_adjusted_lines(
10507 sprintf("%04X..%04X; %s; %s",
10513 $low = $high = $decimal_map;
10514 $map = $CODE_POINT;
10518 # If we can tell that this is a synonym for the default map, use
10519 # the default one instead.
10520 if ($property_type != $STRING
10521 && $property_type != $UNKNOWN)
10523 my $table = $property_object->table($map);
10524 if (defined $table && $table == $default_table) {
10525 $map = $default_map;
10529 # And figure out the map type if not known.
10530 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10531 if ($map eq "") { # Nulls are always $NULL map type
10533 } # Otherwise, non-strings, and those that don't allow
10534 # $MULTI_CP, and those that aren't multiple code points are
10537 (($property_type != $STRING && $property_type != $UNKNOWN)
10538 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10539 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
10544 $map_type = $MULTI_CP;
10548 $property_object->add_map($low, $high,
10551 Replace => $replace);
10552 } # End of loop through file's lines
10558 { # Closure for UnicodeData.txt handling
10560 # This file was the first one in the UCD; its design leads to some
10561 # awkwardness in processing. Here is a sample line:
10562 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10563 # The fields in order are:
10564 my $i = 0; # The code point is in field 0, and is shifted off.
10565 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
10566 my $CATEGORY = $i++; # category (e.g. "Lu")
10567 my $CCC = $i++; # Canonical combining class (e.g. "230")
10568 my $BIDI = $i++; # directional class (e.g. "L")
10569 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
10570 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
10571 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10572 # Dual-use in this program; see below
10573 my $NUMERIC = $i++; # numeric value
10574 my $MIRRORED = $i++; # ? mirrored
10575 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10576 my $COMMENT = $i++; # iso comment
10577 my $UPPER = $i++; # simple uppercase mapping
10578 my $LOWER = $i++; # simple lowercase mapping
10579 my $TITLE = $i++; # simple titlecase mapping
10580 my $input_field_count = $i;
10582 # This routine in addition outputs these extra fields:
10584 my $DECOMP_TYPE = $i++; # Decomposition type
10586 # These fields are modifications of ones above, and are usually
10587 # suppressed; they must come last, as for speed, the loop upper bound is
10588 # normally set to ignore them
10589 my $NAME = $i++; # This is the strict name field, not the one that
10591 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
10592 # by Unicode::Normalize
10593 my $last_field = $i - 1;
10595 # All these are read into an array for each line, with the indices defined
10596 # above. The empty fields in the example line above indicate that the
10597 # value is defaulted. The handler called for each line of the input
10598 # changes these to their defaults.
10600 # Here are the official names of the properties, in a parallel array:
10602 $field_names[$BIDI] = 'Bidi_Class';
10603 $field_names[$CATEGORY] = 'General_Category';
10604 $field_names[$CCC] = 'Canonical_Combining_Class';
10605 $field_names[$CHARNAME] = 'Perl_Charnames';
10606 $field_names[$COMMENT] = 'ISO_Comment';
10607 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10608 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10609 $field_names[$LOWER] = 'Lowercase_Mapping';
10610 $field_names[$MIRRORED] = 'Bidi_Mirrored';
10611 $field_names[$NAME] = 'Name';
10612 $field_names[$NUMERIC] = 'Numeric_Value';
10613 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10614 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10615 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10616 $field_names[$TITLE] = 'Titlecase_Mapping';
10617 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10618 $field_names[$UPPER] = 'Uppercase_Mapping';
10620 # Some of these need a little more explanation:
10621 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10622 # property, but is used in calculating the Numeric_Type. Perl however,
10623 # creates a file from this field, so a Perl property is created from it.
10624 # Similarly, the Other_Digit field is used only for calculating the
10625 # Numeric_Type, and so it can be safely re-used as the place to store
10626 # the value for Numeric_Type; hence it is referred to as
10627 # $NUMERIC_TYPE_OTHER_DIGIT.
10628 # The input field named $PERL_DECOMPOSITION is a combination of both the
10629 # decomposition mapping and its type. Perl creates a file containing
10630 # exactly this field, so it is used for that. The two properties are
10631 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10632 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
10633 # output it), as Perl doesn't use it directly.
10634 # The input field named here $CHARNAME is used to construct the
10635 # Perl_Charnames property, which is a combination of the Name property
10636 # (which the input field contains), and the Unicode_1_Name property, and
10637 # others from other files. Since the strict Name property is not used
10638 # by Perl, this field is used for the table that Perl does use. The
10639 # strict Name property table is usually suppressed (unless the lists are
10640 # changed to output it), so it is accumulated in a separate field,
10641 # $NAME, which to save time is discarded unless the table is actually to
10644 # This file is processed like most in this program. Control is passed to
10645 # process_generic_property_file() which calls filter_UnicodeData_line()
10646 # for each input line. This filter converts the input into line(s) that
10647 # process_generic_property_file() understands. There is also a setup
10648 # routine called before any of the file is processed, and a handler for
10649 # EOF processing, all in this closure.
10651 # A huge speed-up occurred at the cost of some added complexity when these
10652 # routines were altered to buffer the outputs into ranges. Almost all the
10653 # lines of the input file apply to just one code point, and for most
10654 # properties, the map for the next code point up is the same as the
10655 # current one. So instead of creating a line for each property for each
10656 # input line, filter_UnicodeData_line() remembers what the previous map
10657 # of a property was, and doesn't generate a line to pass on until it has
10658 # to, as when the map changes; and that passed-on line encompasses the
10659 # whole contiguous range of code points that have the same map for that
10660 # property. This means a slight amount of extra setup, and having to
10661 # flush these buffers on EOF, testing if the maps have changed, plus
10662 # remembering state information in the closure. But it means a lot less
10663 # real time in not having to change the data base for each property on
10666 # Another complication is that there are already a few ranges designated
10667 # in the input. There are two lines for each, with the same maps except
10668 # the code point and name on each line. This was actually the hardest
10669 # thing to design around. The code points in those ranges may actually
10670 # have real maps not given by these two lines. These maps will either
10671 # be algorithmically determinable, or be in the extracted files furnished
10672 # with the UCD. In the event of conflicts between these extracted files,
10673 # and this one, Unicode says that this one prevails. But it shouldn't
10674 # prevail for conflicts that occur in these ranges. The data from the
10675 # extracted files prevails in those cases. So, this program is structured
10676 # so that those files are processed first, storing maps. Then the other
10677 # files are processed, generally overwriting what the extracted files
10678 # stored. But just the range lines in this input file are processed
10679 # without overwriting. This is accomplished by adding a special string to
10680 # the lines output to tell process_generic_property_file() to turn off the
10681 # overwriting for just this one line.
10682 # A similar mechanism is used to tell it that the map is of a non-default
10685 sub setup_UnicodeData { # Called before any lines of the input are read
# Creates three properties -- 'Perl_Charnames', 'Perl_Decomposition_Mapping'
# and 'Perl_Decimal_Digit' -- registers the first two as proxies for the
# official properties they stand in for, attaches explanatory comments to
# the latter two, and finally sets $last_field according to whether the
# 'Decomposition_Mapping' and 'Name' properties are to be output.
10687 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10689 # Create a new property specially located that is a combination of the
10690 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10691 # Name_Alias properties. (The final duplicates elements of the
10692 # first.) A comment for it will later be constructed based on the
10693 # actual properties present and used
# NOTE(review): $perl_charname is assigned without 'my', so it must be
# declared outside this routine; that declaration is not in the lines
# visible here.
10694 $perl_charname = Property->new('Perl_Charnames',
10696 Directory => File::Spec->curdir(),
10698 Fate => $INTERNAL_ONLY,
10699 Perl_Extension => 1,
10700 Range_Size_1 => \&output_perl_charnames_line,
10703 $perl_charname->set_proxy_for('Name');
10705 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10706 Directory => File::Spec->curdir(),
10707 File => 'Decomposition',
10708 Format => $DECOMP_STRING_FORMAT,
10709 Fate => $INTERNAL_ONLY,
10710 Perl_Extension => 1,
10711 Default_Map => $CODE_POINT,
10713 # normalize.pm can't cope with these
10714 Output_Range_Counts => 0,
10716 # This is a specially formatted table
10717 # explicitly for normalize.pm, which
10718 # is expecting a particular format,
10719 # which means that mappings containing
10720 # multiple code points are in the main
10721 # body of the table
10722 Map_Type => $COMPUTE_NO_MULTI_CP,
10724 To_Output_Map => $INTERNAL_MAP,
10726 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10727 $Perl_decomp->add_comment(join_lines(<<END
10728 This mapping is a combination of the Unicode 'Decomposition_Type' and
10729 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
10730 identical to the official Unicode 'Decomposition_Mapping' property except for
10732 1) It omits the algorithmically determinable Hangul syllable decompositions,
10733 which normalize.pm handles algorithmically.
10734 2) It contains the decomposition type as well. Non-canonical decompositions
10735 begin with a word in angle brackets, like <super>, which denotes the
10736 compatible decomposition type. If the map does not begin with the <angle
10737 brackets>, the decomposition is canonical.
10741 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10743 Perl_Extension => 1,
10744 Directory => $map_directory,
10746 To_Output_Map => $OUTPUT_ADJUSTED,
10748 $Decimal_Digit->add_comment(join_lines(<<END
10749 This file gives the mapping of all code points which represent a single
10750 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10751 points, and the mapping of each non-initial element of each range is actually
10752 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10753 These code points are those that have Numeric_Type=Decimal; not special
10754 things, like subscripts nor Roman numerals.
10758 # These properties are not used for generating anything else, and are
10759 # usually not output. By making them last in the list, we can just
10760 # change the high end of the loop downwards to avoid the work of
10761 # generating a table(s) that is/are just going to get thrown away.
10762 if (! property_ref('Decomposition_Mapping')->to_output_map
10763 && ! property_ref('Name')->to_output_map)
10765 $last_field = min($NAME, $DECOMP_MAP) - 1;
10766 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10767 $last_field = $DECOMP_MAP;
10768 } elsif (property_ref('Name')->to_output_map) {
10769 $last_field = $NAME;
10774 my $first_time = 1; # ? Is this the first line of the file
10775 my $in_range = 0; # ? Are we in one of the file's ranges
10776 my $previous_cp; # hex code point of previous line
10777 my $decimal_previous_cp = -1; # And its decimal equivalent
10778 my @start; # For each field, the current starting
10779 # code point in hex for the range
10780 # being accumulated.
10781 my @fields; # The input fields;
10782 my @previous_fields; # And those from the previous call
10784 sub filter_UnicodeData_line {
10785 # Handle a single input line from UnicodeData.txt; see comments above
10786 # Conceptually this takes a single line from the file containing N
10787 # properties, and converts it into N lines with one property per line,
10788 # which is what the final handler expects. But there are
10789 # complications due to the quirkiness of the input file, and to save
10790 # time, it accumulates ranges where the property values don't change
10791 # and only emits lines when necessary. This is about an order of
10792 # magnitude fewer lines emitted.
10795 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10797 # $_ contains the input line.
10798 # -1 in split means retain trailing null fields
10799 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10801 #local $to_trace = 1 if main::DEBUG;
10802 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10803 if (@fields > $input_field_count) {
10804 $file->carp_bad_line('Extra fields');
10809 my $decimal_cp = hex $cp;
10811 # We have to output all the buffered ranges when the next code point
10812 # is not exactly one after the previous one, which means there is a
10813 # gap in the ranges.
10814 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10816 # The decomposition mapping field requires special handling. It looks
10819 # <compat> 0032 0020
10822 # The decomposition type is enclosed in <brackets>; if missing, it
10823 # means the type is canonical. There are two decomposition mapping
10824 # tables: the one for use by Perl's normalize.pm has a special format
10825 # which is this field intact; the other, for general use is of
10826 # standard format. In either case we have to find the decomposition
10827 # type. Empty fields have None as their type, and map to the code
10829 if ($fields[$PERL_DECOMPOSITION] eq "") {
10830 $fields[$DECOMP_TYPE] = 'None';
10831 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10834 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10835 =~ / < ( .+? ) > \s* ( .+ ) /x;
10836 if (! defined $fields[$DECOMP_TYPE]) {
10837 $fields[$DECOMP_TYPE] = 'Canonical';
10838 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10841 $fields[$DECOMP_MAP] = $map;
10845 # The 3 numeric fields also require special handling. The 2 digit
10846 # fields must be either empty or match the number field. This means
10847 # that if it is empty, they must be as well, and the numeric type is
10848 # None, and the numeric value is 'NaN'.
10849 # The decimal digit field must be empty or match the other digit
10850 # field. If the decimal digit field is non-empty, the code point is
10851 # a decimal digit, and the other two fields will have the same value.
10852 # If it is empty, but the other digit field is non-empty, the code
10853 # point is an 'other digit', and the number field will have the same
10854 # value as the other digit field. If the other digit field is empty,
10855 # but the number field is non-empty, the code point is a generic
10857 if ($fields[$NUMERIC] eq "") {
10858 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10859 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10861 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10863 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10864 $fields[$NUMERIC] = 'NaN';
10867 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10868 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10869 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10870 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
10871 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10873 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10874 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10875 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10878 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10880 # Rationals require extra effort.
10881 register_fraction($fields[$NUMERIC])
10882 if $fields[$NUMERIC] =~ qr{/};
10886 # For the properties that have empty fields in the file, and which
10887 # mean something different from empty, change them to that default.
10888 # Certain fields just haven't been empty so far in any Unicode
10889 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10890 # $CATEGORY. This leaves just the two fields, and so we hard-code in
10891 # the defaults; which are very unlikely to ever change.
10892 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10893 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10895 # UAX44 says that if title is empty, it is the same as whatever upper
10897 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10899 # There are a few pairs of lines like:
10900 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10901 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10902 # that define ranges. These should be processed after the fields are
10903 # adjusted above, as they may override some of them; but mostly what
10904 # is left is to possibly adjust the $CHARNAME field. The names of all the
10905 # paired lines start with a '<', but this is also true of '<control>',
10906 # which isn't one of these special ones.
10907 if ($fields[$CHARNAME] eq '<control>') {
10909 # Some code points in this file have the pseudo-name
10910 # '<control>', but the official name for such ones is the null
10912 $fields[$NAME] = $fields[$CHARNAME] = "";
10914 # We had better not be in between range lines.
10916 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10920 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10922 # Here is a non-range line. We had better not be in between range
10925 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10928 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10930 # These are code points whose names end in their code points,
10931 # which means the names are algorithmically derivable from the
10932 # code points. To shorten the output Name file, the algorithm
10933 # for deriving these is placed in the file instead of each
10934 # code point, so they have map type $CP_IN_NAME
10935 $fields[$CHARNAME] = $CMD_DELIM
10940 . $fields[$CHARNAME];
10942 $fields[$NAME] = $fields[$CHARNAME];
10944 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10945 $fields[$CHARNAME] = $fields[$NAME] = $1;
10947 # Here we are at the beginning of a range pair.
10949 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
10953 # Because the properties in the range do not overwrite any already
10954 # in the db, we must flush the buffers of what's already there, so
10955 # they get handled in the normal scheme.
10959 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10960 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
10964 else { # Here, we are at the last line of a range pair.
10967 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
10973 $fields[$NAME] = $fields[$CHARNAME];
10975 # Check that the input is valid: that the closing of the range is
10976 # the same as the beginning.
10977 foreach my $i (0 .. $last_field) {
10978 next if $fields[$i] eq $previous_fields[$i];
10979 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10982 # The processing differs depending on the type of range,
10983 # determined by its $CHARNAME
10984 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10986 # Check that the data looks right.
10987 if ($decimal_previous_cp != $SBase) {
10988 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10990 if ($decimal_cp != $SBase + $SCount - 1) {
10991 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10994 # The Hangul syllable range has a somewhat complicated name
10995 # generation algorithm. Each code point in it has a canonical
10996 # decomposition also computable by an algorithm. The
10997 # perl decomposition map table built from these is used only
10998 # by normalize.pm, which has the algorithm built in it, so the
10999 # decomposition maps are not needed, and are large, so are
11000 # omitted from it. If the full decomposition map table is to
11001 # be output, the decompositions are generated for it, in the
11002 # EOF handling code for this input file.
11004 $previous_fields[$DECOMP_TYPE] = 'Canonical';
11006 # This range is stored in our internal structure with its
11007 # own map type, different from all others.
11008 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11014 . $fields[$CHARNAME];
11016 elsif ($fields[$CHARNAME] =~ /^CJK/) {
11018 # The name for these contains the code point itself, and all
11019 # are defined to have the same base name, regardless of what
11020 # is in the file. They are stored in our internal structure
11021 # with a map type of $CP_IN_NAME
11022 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11028 . 'CJK UNIFIED IDEOGRAPH';
11031 elsif ($fields[$CATEGORY] eq 'Co'
11032 || $fields[$CATEGORY] eq 'Cs')
11034 # The names of all the code points in these ranges are set to
11035 # null, as there are no names for the private use and
11036 # surrogate code points.
11038 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11041 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
11044 # The first line of the range caused everything else to be output,
11045 # and then its values were stored as the beginning values for the
11046 # next set of ranges, which this one ends. Now, for each value,
11047 # add a command to tell the handler that these values should not
11048 # replace any existing ones in our database.
11049 foreach my $i (0 .. $last_field) {
11050 $previous_fields[$i] = $CMD_DELIM
11055 . $previous_fields[$i];
11058 # And change things so it looks like the entire range has been
11059 # gone through with this being the final part of it. Adding the
11060 # command above to each field will cause this range to be flushed
11061 # during the next iteration, as it is guaranteed that the stored
11062 # field won't match whatever value the next one has.
11063 $previous_cp = $cp;
11064 $decimal_previous_cp = $decimal_cp;
11066 # We are now set up for the next iteration; so skip the remaining
11067 # code in this subroutine that does the same thing, but doesn't
11068 # know about these ranges.
11074 # On the very first line, we fake it so the code below thinks there is
11075 # nothing to output, and initialize so that when it does get output it
11076 # uses the first line's values for the lowest part of the range.
11077 # (One could avoid this by using peek(), but then one would need to
11078 # know the adjustments done above and do the same ones in the setup
11079 # routine; not worth it)
11082 @previous_fields = @fields;
11083 @start = ($cp) x scalar @fields;
11084 $decimal_previous_cp = $decimal_cp - 1;
11087 # For each field, output the stored up ranges that this code point
11088 # doesn't fit in. Earlier we figured out if all ranges should be
11089 # terminated because of changing the replace or map type styles, or if
11090 # there is a gap between this new code point and the previous one, and
11091 # that is stored in $force_output. But even if those aren't true, we
11092 # need to output the range if this new code point's value for the
11093 # given property doesn't match the stored range's.
11094 #local $to_trace = 1 if main::DEBUG;
11095 foreach my $i (0 .. $last_field) {
11096 my $field = $fields[$i];
11097 if ($force_output || $field ne $previous_fields[$i]) {
11099 # Flush the buffer of stored values.
11100 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11102 # Start a new range with this code point and its value
11104 $previous_fields[$i] = $field;
11108 # Set the values for the next time.
11109 $previous_cp = $cp;
11110 $decimal_previous_cp = $decimal_cp;
11112 # The input line has generated whatever adjusted lines are needed, and
11113 # should not be looked at further.
11118 sub EOF_UnicodeData {
11119 # Called upon EOF to flush the still-buffered range of every field, and
11120 # to create the Hangul decomposition mappings if that table is wanted.
11123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11125 # Flush the buffers: emit the final pending range for each field.
11126 foreach my $i (0 .. $last_field) {
11127 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11130 if (-e 'Jamo.txt') {
11132 # The algorithm is published by Unicode, based on values in
11133 # Jamo.txt (which should have been processed before this
11134 # subroutine), and the results left in %Jamo
11136 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
11140 # If the full decomposition map table is being output, insert
11141 # into it the Hangul syllable mappings. This is to avoid having
11142 # to publish a subroutine in it to compute them (which would
11143 # essentially be this code). This uses the algorithm published by
11144 # Unicode. (There are no Hangul syllables in version 1.)
11145 if ($v_version ge v2.0.0
11146 && property_ref('Decomposition_Mapping')->to_output_map) {
11147 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11149 my $SIndex = $S - $SBase; # offset of this syllable from the first one
11150 my $L = $LBase + $SIndex / $NCount; # NOTE(review): needs integer division; presumably 'use integer' is in effect on a line not shown here -- confirm
11151 my $V = $VBase + ($SIndex % $NCount) / $TCount;
11152 my $T = $TBase + $SIndex % $TCount;
11154 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11155 my $decomposition = sprintf("%04X %04X", $L, $V);
11156 $decomposition .= sprintf(" %04X", $T) if $T != $TBase; # $T == $TBase: no third part to append
11157 $file->insert_adjusted_lines(
11158 sprintf("%04X; Decomposition_Mapping; %s",
11168 sub filter_v1_ucd {
11169 # Fix UCD lines in version 1. This is probably overkill, but this
11170 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
11171 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
11172 # removed. This program retains them.
11173 # 2) didn't include ranges, which it should have, and which are now
11174 # added in @corrected_lines below. It was hand populated by
11175 # taking the data from Version 2, verified by analyzing
11177 # 3) There is a syntax error in the entry for U+09F8 which could
11178 # cause problems for utf8_heavy, and so is changed. Its
11179 # numeric value was simply a minus sign, without any number.
11180 # (Eventually Unicode changed the code point to non-numeric.)
11181 # 4) The decomposition types often don't match later versions
11182 # exactly, and the whole syntax of that field is different; so
11183 # the syntax is changed as well as the types to their later
11184 # terminology. Otherwise normalize.pm would be very unhappy.
11185 # 5) Many ccc classes are different. These are left intact.
11186 # 6) U+FF10..U+FF19 are missing their numeric values in all three
11187 # fields. These are unchanged because it doesn't really cause
11188 # problems for Perl.
11189 # 7) A number of code points, such as controls, don't have their
11190 # Unicode Version 1 Names in this file. These are added.
11191 # 8) A number of Symbols were marked as Lm. This changes those in
11192 # the Latin1 range, so that regexes work.
11193 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
11194 # referred to by their lc equivalents. Not fixed.
11196 my @corrected_lines = split /\n/, <<'END';
11197 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11198 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11199 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11200 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11201 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11202 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11206 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11208 #local $to_trace = 1 if main::DEBUG;
11209 trace $_ if main::DEBUG && $to_trace;
11211 # -1 => retain trailing null fields
11212 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11214 # At the first place that is wrong in the input, insert all the
11215 # corrections, replacing the wrong line.
11216 if ($code_point eq '4E00') {
11217 my @copy = @corrected_lines;
11219 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11221 $file->insert_lines(@copy);
11223 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11225 # There are no Lm characters in Latin1; these should be 'Sk', but
11226 # there isn't that in V1.
11227 $fields[$CATEGORY] = 'So';
11230 if ($fields[$NUMERIC] eq '-') {
11231 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
11234 if ($fields[$PERL_DECOMPOSITION] ne "") {
11236 # Several entries have this change to superscript 2 or 3 in the
11237 # middle. Convert these to the modern version, which is to use
11238 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11239 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11240 # 'HHHH HHHH 00B3 HHHH'.
11241 # It turns out that all of these that don't have another
11242 # decomposition defined at the beginning of the line have the
11243 # <square> decomposition in later releases.
11244 if ($code_point ne '00B2' && $code_point ne '00B3') {
11245 if ($fields[$PERL_DECOMPOSITION]
11246 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11248 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11249 $fields[$PERL_DECOMPOSITION] = '<square> '
11250 . $fields[$PERL_DECOMPOSITION];
11255 # If it is like '<+circled> 0052 <-circled>', convert to
11257 $fields[$PERL_DECOMPOSITION] =~
11258 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11260 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11261 $fields[$PERL_DECOMPOSITION] =~
11262 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11263 or $fields[$PERL_DECOMPOSITION] =~
11264 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11265 or $fields[$PERL_DECOMPOSITION] =~
11266 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11267 or $fields[$PERL_DECOMPOSITION] =~
11268 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11270 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH HHHH', etc.
11271 $fields[$PERL_DECOMPOSITION] =~
11272 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11274 # Change names to modern form. (These run after the conversions
11275 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11276 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11277 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11278 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11280 # One entry has weird braces
11281 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11283 # One entry at U+2116 has an extra <sup>
11284 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x; # drops the last additional <...> tag and any spaces after it
11287 $_ = join ';', $code_point, @fields;
11288 trace $_ if main::DEBUG && $to_trace;
11292 sub filter_bad_Nd_ucd {
11293 # Early versions specified a value in the decimal digit field even
11294 # though the code point wasn't a decimal digit. Clear the field in
11295 # that situation, so that the main code doesn't think it is a decimal
11298 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11299 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11300 $fields[$PERL_DECIMAL_DIGIT] = "";
11301 $_ = join ';', $code_point, @fields; # $_ is rebuilt only when the field was cleared
11306 my @U1_control_names = split /\n/, <<'END'; # one name per line; filter_early_U1_names() shifts them off in order
11311 END OF TRANSMISSION
11316 HORIZONTAL TABULATION
11318 VERTICAL TABULATION
11326 DEVICE CONTROL THREE
11327 DEVICE CONTROL FOUR
11328 NEGATIVE ACKNOWLEDGE
11330 END OF TRANSMISSION BLOCK
11340 BREAK PERMITTED HERE
11344 START OF SELECTED AREA
11345 END OF SELECTED AREA
11346 CHARACTER TABULATION SET
11347 CHARACTER TABULATION WITH JUSTIFICATION
11348 LINE TABULATION SET
11354 DEVICE CONTROL STRING
11360 START OF GUARDED AREA
11361 END OF GUARDED AREA
11363 SINGLE CHARACTER INTRODUCER
11364 CONTROL SEQUENCE INTRODUCER
11366 OPERATING SYSTEM COMMAND
11368 APPLICATION PROGRAM COMMAND
11371 sub filter_early_U1_names {
11372 # Very early versions did not have the Unicode_1_name field specified.
11373 # They differed in which ones were present; make sure a U1 name
11374 # exists, so that Unicode::UCD::charinfo will work.
11376 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11379 # @U1_control_names above is entirely positional, so we pull them out
11380 # in the exact order required, with gaps for the ones that don't have
11382 if ($code_point =~ /^00[01]/ # U+0000..U+001F
11383 || $code_point eq '007F'
11384 || $code_point =~ /^008[2-9A-F]/ # U+0082..U+008F
11385 || $code_point =~ /^009[0-8A-F]/) # U+0090..U+0098, U+009A..U+009F
11387 my $u1_name = shift @U1_control_names; # consumed even if unused below, to stay in step with the list
11388 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME]; # keep a name the file already supplies
11389 $_ = join ';', $code_point, @fields;
11394 sub filter_v2_1_5_ucd {
11395 # A dozen entries in this 2.1.5 file had the mirrored and numeric
11396 # columns swapped. These all had mirrored be 'N'. So if the numeric
11397 # column appears to be N, swap it back.
11399 my ($code_point, @fields) = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11400 if ($fields[$NUMERIC] eq 'N') {
11401 $fields[$NUMERIC] = $fields[$MIRRORED]; # the real numeric value was sitting in the mirrored column
11402 $fields[$MIRRORED] = 'N';
11403 $_ = join ';', $code_point, @fields; # $_ is rebuilt only when a swap was made
11408 sub filter_v6_ucd {
11410 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11411 # it wasn't accepted, to allow for some deprecation cycles. This
11412 # function is not called after 5.16.
11414 return if $_ !~ /^(?:0007|1F514|070F);/; # only these three code points are adjusted
11416 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11417 if ($code_point eq '0007') {
11418 $fields[$CHARNAME] = ""; # blank out the name of U+0007
11420 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11421 # http://www.unicode.org/versions/corrigendum8.html
11422 $fields[$BIDI] = "AL";
11424 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11425 $fields[$CHARNAME] = ""; # only U+1F514 can reach this branch
11428 $_ = join ';', $code_point, @fields;
11432 } # End closure for UnicodeData
11434 sub process_GCB_test { # Saves each line of the file, unparsed, for later use
11437 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11439 while ($file->next_line) { # each line is available in $_
11440 push @backslash_X_tests, $_;
11446 sub process_NamedSequences {
11447 # NamedSequences.txt entries are just added to an array. Because these
11448 # don't look like the other tables, they have their own handler.
11450 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11452 # This just adds the sequence to an array for later handling.
11455 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11457 while ($file->next_line) {
11458 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11460 $file->carp_bad_line(
11461 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11465 # Note the single \t, in keeping with the special output format of
11466 # Perl_charnames. But it turns out that the code points don't have to
11467 # be 5 digits long, like the rest, based on the internal workings of
11468 # charnames.pm. This could be easily changed for consistency.
11469 push @named_sequences, "$sequence\t$name"; # stored as: code points, tab, name
11478 sub filter_early_ea_lb {
11479 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
11480 # third field be the name of the code point, which can be ignored in
11481 # most cases. But it can be meaningful if it marks a range:
11482 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11483 # 3400;W;<CJK Ideograph Extension A, First>
11485 # We need to see the First in the example above to know it's a range.
11486 # They did not use the later range syntaxes. This routine changes it
11487 # to use the modern syntax.
11488 # $1 is the Input_file object.
11490 my @fields = split /\s*;\s*/;
11491 if ($fields[2] =~ /^<.*, First>/) {
11492 $first_range = $fields[0]; # remember where the range starts
11495 elsif ($fields[2] =~ /^<.*, Last>/) {
11496 $_ = "$first_range..$fields[0]; $fields[1]"; # modern 'start..end; value' form
11499 undef $first_range;
11500 $_ = "$fields[0]; $fields[1]"; # ordinary line: just drop the name field
11507 sub filter_old_style_arabic_shaping {
11508 # Early versions used '<no shaping>' where later ones use No_Joining_Group.
11510 my @fields = split /\s*;\s*/;
11511 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11512 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
11513 $_ = join ';', @fields;
11518 my $lc; # Table for lowercase mapping
11521 my %special_casing_code_points;
11523 sub setup_special_casing {
11524 # SpecialCasing.txt contains the non-simple case change mappings. The
11525 # simple ones are in UnicodeData.txt, which should already have been
11526 # read in to the full property data structures, so as to initialize
11527 # these with the simple ones. Then the SpecialCasing.txt entries
11528 # add or overwrite the ones which have different full mappings.
11530 # This routine sees if the simple mappings are to be output, and if
11531 # so, copies what has already been put into the full mapping tables,
11532 # while they still contain only the simple mappings.
11534 # The reason it is done this way is that the simple mappings are
11535 # probably not going to be output, so it saves work to initialize the
11536 # full tables with the simple mappings, and then overwrite those
11537 # relatively few entries in them that have different full mappings,
11538 # and thus skip the simple mapping tables altogether.
11541 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11543 $lc = property_ref('lc');
11544 $tc = property_ref('tc');
11545 $uc = property_ref('uc');
11547 # For each of the case change mappings...
11548 foreach my $full_casing_table ($lc, $tc, $uc) {
11549 my $full_casing_name = $full_casing_table->name; # NOTE(review): methods are called here before the defined() test below, so an undefined table would die first -- confirm
11550 my $full_casing_full_name = $full_casing_table->full_name;
11551 unless (defined $full_casing_table
11552 && ! $full_casing_table->is_empty)
11554 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
11557 # Create a table in the old-style format and with the original
11558 # file name for backwards compatibility with applications that
11559 # read it directly. The new tables contain both the simple and
11560 # full maps, and the old are missing simple maps when there is a
11561 # conflicting full one. Probably it would have been ok to add
11562 # those to the legacy version, as was already done in 5.14 to the
11563 # case folding one, but this was not done, out of an abundance of
11564 # caution. The tables are set up here before we deal with the
11565 # full maps so that as we handle those, we can override the simple
11566 # maps for them in the legacy table, and merely add them in the
11568 my $legacy = Property->new("Legacy_" . $full_casing_full_name,
11569 File => $full_casing_full_name
11570 =~ s/case_Mapping//r,
11571 Format => $HEX_FORMAT,
11572 Default_Map => $CODE_POINT,
11573 Initialize => $full_casing_table,
11574 Replacement_Property => $full_casing_full_name,
11577 $full_casing_table->add_comment(join_lines( <<END
11578 This file includes both the simple and full case changing maps. The simple
11579 ones are in the main body of the table below, and the full ones adding to or
11580 overriding them are in the hash.
11584 # The simple version's name in each mapping merely has an 's' in
11585 # front of the full one's.
11586 my $simple_name = 's' . $full_casing_name;
11587 my $simple = property_ref($simple_name);
11588 $simple->initialize($full_casing_table) if $simple->to_output_map(); # copied only when the simple map is itself to be output
11594 sub filter_2_1_8_special_casing_line {
11596 # This version had duplicate entries in this file. Delete all but the
11598 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11600 if (exists $special_casing_code_points{$fields[0]}) { # this code point has been seen before
11605 $special_casing_code_points{$fields[0]} = 1; # remember that this code point has now been seen
11606 filter_special_casing_line(@_); # hand the line on to the regular filter
11609 sub filter_special_casing_line {
11610 # Change the format of $_ from SpecialCasing.txt into something that
11611 # the generic handler understands. Each input line contains three
11612 # case mappings. This will generate three lines to pass to the
11613 # generic handler for each of those.
11615 # The input syntax (after stripping comments and trailing white space)
11616 # is like one of the following (with the final two being entries that
11618 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11619 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11620 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11621 # Note the trailing semi-colon, unlike many of the input files. That
11622 # means that there will be an extra null field generated by the split
11625 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11627 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11630 # Field #4 is when this mapping is conditional. If any of these get
11631 # implemented, it would be by hard-coding in the casing functions in
11632 # the Perl core, not through tables. But if there is a new condition
11633 # we don't know about, output a warning. We know about all the
11634 # conditions through 6.0.
11635 if ($fields[4] ne "") {
11636 my @conditions = split ' ', $fields[4];
11637 if ($conditions[0] ne 'tr' # We know that these languages have
11638 # conditions, and some are multiple
11639 && $conditions[0] ne 'az'
11640 && $conditions[0] ne 'lt'
11642 # And, we know about a single condition Final_Sigma, but
11644 && ($v_version gt v5.2.0
11645 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11647 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11649 elsif ($conditions[0] ne 'Final_Sigma') {
11651 # Don't print out a message for Final_Sigma, because we
11652 # have hard-coded handling for it. (But the standard
11653 # could change what the rule should be, but it wouldn't
11654 # show up here anyway.)
11656 print "# SKIPPING Special Casing: $_\n"
11657 if $verbosity >= $VERBOSE;
11662 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11663 $file->carp_bad_line('Extra fields');
11668 my $decimal_code_point = hex $fields[0];
11670 # Loop to handle each of the three mappings in the input line, in
11671 # order, with $i indicating the current field number.
11673 for my $object ($lc, $tc, $uc) {
11674 $i++; # Field number of this mapping: 1 for lc, 2 for tc, 3 for uc (assuming $i starts at 0 -- confirm)
11676 my $value = $object->value_of($decimal_code_point);
11677 $value = ($value eq $CODE_POINT)
11678 ? $decimal_code_point
11681 # If this isn't a multi-character mapping, it should already have
11683 if ($fields[$i] !~ / /) {
11684 if ($value != hex $fields[$i]) {
11685 Carp::my_carp("Bad news. UnicodeData.txt thinks "
11687 . "(0x$fields[0]) is $value"
11688 . " and SpecialCasing.txt thinks it is "
11690 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
11695 # The mapping goes into both the legacy table, in which it
11696 # replaces the simple one...
11697 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11698 . $object->full_name
11699 . "; $fields[$i]");
11701 # ... and the regular table, in which it is additional,
11702 # beyond the simple mapping.
11703 $file->insert_adjusted_lines("$fields[0]; "
11707 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11713 # Everything has been handled by the insert_adjusted_lines()
11720 sub filter_old_style_case_folding {
11721 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11722 # and later style. Different letters were used in the earlier version.
11725 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11727 my @fields = split /\s*;\s*/;
11728 if ($fields[0] =~ /^ 013 [01] $/x) { # The two Turkish code points, U+0130 and U+0131
11731 elsif ($fields[1] eq 'L') {
11732 $fields[1] = 'C'; # L => C always
11734 elsif ($fields[1] eq 'E') {
11735 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
11743 $file->carp_bad_line("Expecting L or E in second field");
11747 $_ = join("; ", @fields) . ';'; # reassembled with a trailing ';'
11751 { # Closure for case folding
11753 # Create the map for simple only if are going to output it, for otherwise
11754 # it takes no part in anything we do.
11755 my $to_output_simple;
11757 sub setup_case_folding($) {
11758 # Set-up for handling CaseFolding.txt, which contains both the simple and
11759 # the full case foldings: note whether the simple map is wanted at all.
11762 = property_ref('Simple_Case_Folding')->to_output_map;
11764 if (! $to_output_simple) {
11765 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11768 # If we ever wanted to show that these tables were combined, a new
11769 # property method could be created, like set_combined_props()
11770 property_ref('Case_Folding')->add_comment(join_lines( <<END
11771 This file includes both the simple and full case folding maps. The simple
11772 ones are in the main body of the table below, and the full ones adding to or
11773 overriding them are in the hash.
11779 sub filter_case_folding_line {
11780 # Called for each line in CaseFolding.txt
11781 # Input lines look like:
11782 # 0041; C; 0061; # LATIN CAPITAL LETTER A
11783 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11784 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11786 # 'C' means that folding is the same for both simple and full
11787 # 'F' that it is only for full folding
11788 # 'S' that it is only for simple folding
11789 # 'T' is locale-dependent, and ignored
11790 # 'I' is a type of 'F' used in some early releases.
11791 # Note the trailing semi-colon, unlike many of the input files. That
11792 # means that there will be an extra null field generated by the split
11793 # below, which we ignore and hence is not an error.
11796 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11798 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11799 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11800 $file->carp_bad_line('Extra fields');
11805 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
11810 # C: common, F: full, or I: dotted uppercase I -> dotless lowercase
11811 # I are all full foldings; S is single-char. For S, there is always
11812 # an F entry, so we must allow multiple values for the same code
11813 # point. Fortunately this table doesn't need further manipulation
11814 # which would preclude using multiple-values. The S is now included
11815 # so that _swash_inversion_hash() is able to construct closures
11816 # without having to worry about F mappings.
11817 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11818 $_ = "$range; Case_Folding; "
11819 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11823 $file->carp_bad_line('Expecting C F I S or T in second field');
11826 # C and S are simple foldings, but simple case folding is not needed
11827 # unless we explicitly want its map table output. The parentheses
11828 if ($to_output_simple && ($type eq 'C' || $type eq 'S')) { # '&&' binds tighter than '||', so the type test must be grouped
11829 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11835 } # End case fold closure
11837 sub filter_jamo_line {
11838 # Filter Jamo.txt lines. This routine mainly is used to populate the hashes
11839 # (%Jamo, %Jamo_L, %Jamo_V and %Jamo_T) that are used in generating the Name
11840 # property for Jamo code points. But it is also used to convert early
11841 # versions' syntax into the modern form. Here are two examples:
11842 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
11843 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
11845 # The input is $_, the output is $_ filtered.
11847 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11849 # Let the caller handle unexpected input. In earlier versions, there was
11850 # a third field which is supposed to be a comment, but did not have a '#'
# in front of it; so three fields are tolerated up through 3.0.0, and only
# two after that. A line with more is returned with $_ untouched.
11852 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11854 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
# 'U+' in front of the code point.
11857 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
11858 $fields[1] = 'R' if $fields[0] eq '1105';
11860 # Add to structure so can generate Names from it.
11861 my $cp = hex $fields[0];
11862 my $short_name = $fields[1];
11863 $Jamo{$cp} = $short_name;
# Classify the code point by the first upper limit it does not exceed,
# testing the L range, then V, then T. What gets stored is the offset of the
# code point from the base of that range, keyed by the short name. Only upper
# limits are tested, so the order of these tests matters.
# NOTE(review): each limit is Base + Count, inclusive. If Count is the number
# of code points in the range, that is one past its end -- confirm.
11864 if ($cp <= $LBase + $LCount) {
11865 $Jamo_L{$short_name} = $cp - $LBase;
11867 elsif ($cp <= $VBase + $VCount) {
11868 $Jamo_V{$short_name} = $cp - $VBase;
11870 elsif ($cp <= $TBase + $TCount) {
11871 $Jamo_T{$short_name} = $cp - $TBase;
11874 Carp::my_carp_bug("Unexpected Jamo code point in $_");
11878 # Reassemble using just the first two fields to look like a typical
11879 # property file line
11880 $_ = "$fields[0]; $fields[1]";
11885 sub register_fraction($) {
11886 # This registers the input rational number so that it can be passed on to
11887 # utf8_heavy.pl, both in rational and floating forms.
# The single argument is the rational, as a string. It is recorded in
# %nv_floating_to_rational, keyed by its floating point value.
11889 my $rational = shift;
# String eval: the text of the rational is evaluated as a Perl expression to
# get its numeric value. Neither the text nor $@ is checked here.
11891 my $float = eval $rational;
11892 $nv_floating_to_rational{$float} = $rational;
11896 sub filter_numeric_value_line {
11897 # DNumValues contains lines of a different syntax than the typical
# property file. An example line is:
11899 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
11901 # This routine transforms $_ containing the anomalous syntax to the
11902 # typical, by filtering out the extra columns, and converting early version
11903 # decimal numbers to strings that look like rational numbers.
# On success $_ becomes "code points; value", where value is either an
# integer or a rational of the form "n/d".
11906 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11908 # Starting in 5.1, there is a rational field. Just use that, omitting the
11909 # extra columns. Otherwise convert the decimal number in the second field
11910 # to a rational, and omit extraneous columns.
11911 my @fields = split /\s*;\s*/, $_, -1;
11914 if ($v_version ge v5.1.0) {
11915 if (@fields != 4) {
11916 $file->carp_bad_line('Not 4 semi-colon separated fields');
# Fields 0 and 3 are the code points and the rational, as in the example line
# shown at the top of this sub.
11920 $rational = $fields[3];
11921 $_ = join '; ', @fields[ 0, 3 ];
11925 # Here, is an older Unicode file, which has decimal numbers instead of
11926 # rationals in it. Use the fraction to calculate the denominator and
11927 # convert to rational.
11929 if (@fields != 2 && @fields != 3) {
11930 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11935 my $codepoints = $fields[0];
11936 my $decimal = $fields[1];
11937 if ($decimal =~ s/\.0+$//) {
11939 # Anything ending with a decimal followed by nothing but 0's is an
# integer; the substitution above has already removed the fraction part.
11941 $_ = "$codepoints; $decimal";
11942 $rational = $decimal;
# The fraction digits are matched against the few patterns that occur, to
# deduce the denominator.
11947 if ($decimal =~ /\.50*$/) { # .5 => halves
11951 # Here have the hardcoded repeating decimals in the fraction, and
11952 # the denominator they imply. There were only a few denominators
11953 # in the older Unicode versions of this file which this code
11954 # handles, so it is easy to convert them.
11956 # The 4 is because of a round-off error in the Unicode 3.2 files
11957 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { # thirds
11960 elsif ($decimal =~ /\.[27]50*$/) { # .25, .75 => quarters
11963 elsif ($decimal =~ /\.[2468]0*$/) { # .2 .4 .6 .8 => fifths
11966 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { # sixths
11969 elsif ($decimal =~ /\.(12|37|62|87)50*$/) { # eighths
11972 if ($denominator) {
11973 my $sign = ($decimal < 0) ? "-" : "";
# Adding .5 before truncating rounds to the nearest integer, which absorbs
# the inexactness of the decimal expansion.
11974 my $numerator = int((abs($decimal) * $denominator) + .5);
11975 $rational = "$sign$numerator/$denominator";
11976 $_ = "$codepoints; $rational";
11979 $file->carp_bad_line("Can't cope with number '$decimal'.");
# Only values containing a '/' are registered; integers are not.
11986 register_fraction($rational) if $rational =~ qr{/};
# Cache, keyed by Unihan property name, of what property_ref() returned for
# that name. It is filled in by filter_unihan_line(), which stores the result
# even when it is undefined.
11991 my %unihan_properties;
11994 # Do any special setup for Unihan properties.
# Each adjustment below is made only if the property exists in this run,
# i.e. property_ref() returned a defined value for it.
11996 # This property gives the wrong computed type, so override.
11997 my $usource = property_ref('kIRG_USource');
11998 $usource->set_type($STRING) if defined $usource;
12000 # This property is to be considered binary (it says so in
12001 # http://www.unicode.org/reports/tr38/)
12002 my $iicore = property_ref('kIICore');
12003 if (defined $iicore) {
12004 $iicore->set_type($FORCED_BINARY);
12005 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
12007 # Unicode doesn't include the maps for this property, so don't
12008 # warn that they are missing.
12009 $iicore->set_pre_declared_maps(0);
12010 $iicore->add_comment(join_lines( <<END
12011 This property contains enum values, but Unicode UAX #38 says it should be
12012 interpreted as binary, so Perl creates tables for both 1) its enum values,
12013 plus 2) true/false tables in which it is considered true for all code points
12014 that have a non-null value
12022 sub filter_unihan_line {
12023 # Change unihan db lines to look like the others in the db. Here is
# an example of an input line:
12025 # U+341C kCangjie IEKN
# The input is $_, and the result is left in $_.
12027 # Tabs are used instead of semi-colons to separate fields; therefore
12028 # they may have semi-colons embedded in them. Change these to periods
12029 # so won't screw up the rest of the code.
12032 # Remove lines that don't look like ones we accept.
# An acceptable line has at least one tab; the capture group is the second
# tab-separated field.
12033 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12038 # Extract the property, and save a reference to its object.
# 'exists' rather than 'defined' is tested, so that a name for which
# property_ref() returned undef is looked up only once.
12040 if (! exists $unihan_properties{$property}) {
12041 $unihan_properties{$property} = property_ref($property);
12044 # Don't do anything unless the property is one we're handling, which
12045 # we determine by seeing if there is an object defined for it or not
12046 if (! defined $unihan_properties{$property}) {
12051 # Convert the tab separators to our standard semi-colons, and convert
12052 # the U+HHHH notation to the rest of the standard's HHHH
# The look-ahead restricts the removal to a 'U+' that is immediately
# followed by something matching $code_point_re.
12054 s/\b U \+ (?= $code_point_re )//xg;
12056 #local $to_trace = 1 if main::DEBUG;
12057 trace $_ if main::DEBUG && $to_trace;
12063 sub filter_blocks_lines {
12064 # In the Blocks.txt file, the names of the blocks don't quite match the
12065 # names given in PropertyValueAliases.txt, so this changes them so they
12066 # do match: Blanks and hyphens are changed into underscores, and the first
12067 # letter of each word is capitalized. Also makes early release versions
# look like later ones.
12069 # $_ is transformed to the correct value.
12072 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12074 if ($v_version lt v3.2.0) {
12075 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12080 # Old versions used a different syntax to mark the range.
# Only the first semi-colon followed by white space is changed into '..'.
12081 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12084 my @fields = split /\s*;\s*/, $_, -1;
12085 if (@fields != 2) {
12086 $file->carp_bad_line("Expecting exactly two fields");
12091 # Change hyphens and blanks in the block name field only
12092 $fields[1] =~ s/[ -]/_/g;
# The /x is essential: the blanks in the pattern are there for legibility
# only. Without /x they would have to match literally, and could not, because
# the line above has just turned every blank into an underscore.
12093 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
12095 $_ = join("; ", @fields);
# The name of the property whose code points are currently being listed in
# the input, or undefined when not in a section that is being kept. It
# persists between calls to the sub below.
12100 my $current_property;
12102 sub filter_old_style_proplist {
12103 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
12104 # was in a completely different syntax. Ken Whistler of Unicode says
12105 # that it was something he used as an aid for his own purposes, but
12106 # was never an official part of the standard. Many of the properties
12107 # in it were incorporated into the later PropList.txt, but some were
12108 # not. This program uses this early file to generate property tables
12109 # that are otherwise not accessible in the early UCD's. It does this
12110 # for the ones that eventually became official, and don't appear to be
12111 # too different in their contents from the later official version, and
12112 # throws away the rest. It could be argued that the ones it generates
12113 # were probably not really official at that time, so should be
12114 # ignored. You can easily modify things to skip all of them by
12115 # changing this function to just set $_ to "", and return; and to skip
12116 # certain of them by simply removing their declarations from
12117 # get_old_property_aliases().
12119 # Here is a list of all the ones that are thrown away:
12120 # Alphabetic The definitions for this are very
12121 # defective, so better to not mislead
12122 # people into thinking it works.
12123 # Instead the Perl extension of the
12124 # same name is constructed from first
12126 # Bidi=* duplicates UnicodeData.txt
12127 # Combining never made into official property;
12129 # Composite never made into official property.
12130 # Currency Symbol duplicates UnicodeData.txt: gc=sc
12131 # Decimal Digit duplicates UnicodeData.txt: gc=nd
12132 # Delimiter never made into official property;
12134 # Format Control never made into official property;
12136 # High Surrogate duplicates Blocks.txt
12137 # Ignorable Control never made into official property;
12139 # ISO Control duplicates UnicodeData.txt: gc=cc
12140 # Left of Pair never made into official property;
12141 # Line Separator duplicates UnicodeData.txt: gc=zl
12142 # Low Surrogate duplicates Blocks.txt
12143 # Non-break was actually listed as a property
12144 # in 3.2, but without any code
12145 # points. Unicode denies that this
12146 # was ever an official property
12147 # Non-spacing duplicates UnicodeData.txt: gc=mn
12148 # Numeric duplicates UnicodeData.txt: gc=cc (NOTE(review): presumably gc=n was meant -- confirm)
12149 # Paired Punctuation never made into official property;
12150 # appears to be gc=ps + gc=pe
12151 # Paragraph Separator duplicates UnicodeData.txt: gc=zp
12152 # Private Use duplicates UnicodeData.txt: gc=co
12153 # Private Use High Surrogate duplicates Blocks.txt
12154 # Punctuation duplicates UnicodeData.txt: gc=p
12155 # Space different definition than eventual
12157 # Titlecase duplicates UnicodeData.txt: gc=lt
12158 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
12159 # Zero-width never made into official property;
12161 # Most of the properties have the same names in this file as in later
12162 # versions, but a couple do not.
12164 # This subroutine filters $_, converting it from the old style into
12165 # the new style. Here's a sample of the old-style
12167 # *******************************************
12169 # Property dump for: 0x100000A0 (Join Control)
12171 # 200C..200D (2 chars)
12173 # In the example, the property is "Join Control". It is kept in this
12174 # closure between calls to the subroutine. The numbers beginning with
12175 # 0x were internal to Ken's program that generated this file.
12177 # If this line contains the property name, extract it.
# The name is whatever is inside the parentheses.
12178 if (/^Property dump for: [^(]*\((.*)\)/) {
12181 # Convert white space to underscores.
12184 # Convert the few properties that don't have the same name as
12185 # their modern counterparts
12186 s/Identifier_Part/ID_Continue/
12187 or s/Not_a_Character/NChar/;
12189 # If the name matches an existing property, use it.
12190 if (defined property_ref($_)) {
12191 trace "new property=", $_ if main::DEBUG && $to_trace;
12192 $current_property = $_;
12194 else { # Otherwise discard it
12195 trace "rejected property=", $_ if main::DEBUG && $to_trace;
12196 undef $current_property;
12198 $_ = ""; # The property is saved for the next lines of the
12199 # file, but this defining line is of no further use,
12200 # so clear it so that the caller won't process it
12203 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
12205 # Here, the input line isn't a header defining a property for the
12206 # following section, and either we aren't in such a section, or
12207 # the line doesn't look like one that defines the code points in
12208 # such a section. Ignore this line.
12213 # Here, we have a line defining the code points for the current
12214 # stashed property. Anything starting with the first blank is
12215 # extraneous. Otherwise, it should look like a normal range to
12216 # the caller. Append the property name so that it looks just like
12217 # a modern PropList entry.
12220 $_ .= "; $current_property";
12222 trace $_ if main::DEBUG && $to_trace;
12225 } # End closure for old style proplist
12227 sub filter_old_style_normalization_lines {
12228 # For early releases of Unicode, the lines were like:
12229 # 74..2A76 ; NFKD_NO
12230 # For later releases this became:
12231 # 74..2A76 ; NFKD_QC; N
12232 # Filter $_ to look like those in later releases.
12233 # Similarly for MAYBEs
# The input and the output are both $_. At most one of the two substitutions
# below is made: the MAYBE one is tried only if the NO one didn't match.
12235 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
12237 # Also, the property FC_NFKC was abbreviated to FNC
12242 sub setup_script_extensions {
12243 # The Script_Extensions property starts out with a clone of the Script
# property: it is initialized from $script, and given the same default map,
# match tables, and table aliases as $script has.
12246 my $scx = property_ref("Script_Extensions");
12247 $scx = Property->new("scx", Full_Name => "Script_Extensions")
12249 $scx->_set_format($STRING_WHITE_SPACE_LIST);
12250 $scx->initialize($script);
12251 $scx->set_default_map($script->default_map);
12252 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
12253 $scx->add_comment(join_lines( <<END
12254 The values for code points that appear in one script are just the same as for
12255 the 'Script' property. Likewise the values for those that appear in many
12256 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
12257 values of code points that appear in a few scripts are a space separated list
12262 # Initialize scx's tables and the aliases for them to be the same as sc's
12263 foreach my $table ($script->tables) {
12264 my $scx_table = $scx->add_match_table($table->name,
12265 Full_Name => $table->full_name);
12266 foreach my $alias ($table->aliases) {
12267 $scx_table->add_alias($alias->name);
12272 sub filter_script_extensions_line {
12273 # The Scripts file comes with the full name for the scripts; the
12274 # ScriptExtensions, with the short name. The final mapping file is a
12275 # combination of these, and without adjustment, would have inconsistent
12276 # entries. This filters the latter file to convert to full names.
12277 # Entries look like this:
12278 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
# The input and the output are both $_.
# NOTE(review): the split below takes everything after the semi-colon as
# script names, so this presumably relies on the trailing '#' comment having
# been removed before this is called -- confirm.
12280 my @fields = split /\s*;\s*/;
12282 # This script was erroneously omitted in this Unicode version.
12283 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
# Look each short name up in the Script property's tables to get the
# corresponding full name. There is no check here that the lookup succeeds.
12286 foreach my $short_name (split " ", $fields[1]) {
12287 push @full_names, $script->table($short_name)->full_name;
12289 $fields[1] = join " ", @full_names;
12290 $_ = join "; ", @fields;
# What is generated is passed, as lines in the usual "range; value" property
# file syntax, to the insert_lines() method of $file.
12297 # Populates the Hangul Syllable Type property from first principles
12300 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12302 # These few ranges are hard-coded in.
12303 $file->insert_lines(split /\n/, <<'END'
12311 # The Hangul syllables in version 1 are completely different than what came
12312 # after, so just ignore them there.
12313 if ($v_version lt v2.0.0) {
12314 my $property = property_ref($file->property);
12315 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12316 push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12320 # The algorithmically derived syllables are almost all LVT ones, so
12321 # initialize the whole range with that.
12322 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12323 $SBase, $SBase + $SCount -1);
12325 # Those ones that aren't LVT are LV, and they occur at intervals of
12326 # $TCount code points, starting with the first code point, at $SBase.
12327 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12328 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
# The values are derived from the General Category ranges in $gc, a few
# hard-coded code points, and the Hangul_Syllable_Type property. They are
# passed, as lines in the usual property file syntax, to the insert_lines()
# method of $file.
12336 # Populates the Grapheme Cluster Break property from first principles
12339 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12341 # All these definitions are from
12342 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12343 # from http://www.unicode.org/reports/tr29/tr29-4.html
12345 foreach my $range ($gc->ranges) {
12347 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
# and gc=Cf (see the second pattern below).
12349 if ($range->value =~ / ^ M [en] $ /x) {
12350 $file->insert_lines(sprintf "%04X..%04X; Extend",
12351 $range->start, $range->end);
12353 elsif ($range->value =~ / ^ C [cf] $ /x) {
12354 $file->insert_lines(sprintf "%04X..%04X; Control",
12355 $range->start, $range->end);
12358 $file->insert_lines("2028; Control"); # Line Separator
12359 $file->insert_lines("2029; Control"); # Paragraph Separator
12361 $file->insert_lines("000D; CR");
12362 $file->insert_lines("000A; LF");
12364 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12365 foreach my $code_point ( qw{
12367 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12368 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12371 my $category = $gc->value_of(hex $code_point);
12372 next if ! defined $category || $category eq 'Cn'; # But not if
12373 # unassigned in this
12375 $file->insert_lines("$code_point; Extend");
# The Hangul_Syllable_Type ranges are copied over, each keeping the name of
# its value in that property.
12378 my $hst = property_ref('Hangul_Syllable_Type');
12379 if ($hst->count > 0) {
12380 foreach my $range ($hst->ranges) {
12381 $file->insert_lines(sprintf "%04X..%04X; %s",
12382 $range->start, $range->end, $range->value);
# NOTE(review): presumably this call is the alternative taken when that
# property is empty; the line introducing that branch is not visible here.
12386 generate_hst($file);
12392 sub setup_early_name_alias {
12394 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12396 # This has the effect of pretending that the Name_Alias property was
12397 # available in all Unicode releases. Strictly speaking, this property
12398 # should not be available in early releases, but doing this allows
12399 # charnames.pm to work on older releases without change. Prior to v5.16
12400 # it had these names hard-coded inside it. Unicode 6.1 came along and
12401 # created these names, and so they were removed from charnames.
# Create the property if it doesn't already exist, and then feed the list
# from get_old_name_aliases() to $file as if it were input lines.
12403 my $aliases = property_ref('Name_Alias');
12404 if (! defined $aliases) {
12405 $aliases = Property->new('Name_Alias', Default_Map => "");
12408 $file->insert_lines(get_old_name_aliases());
12413 sub get_old_name_aliases () {
# Takes no arguments. Builds, in @return, a list of lines of the form
# "code point; name; type". The visible caller passes the result of this sub
# to insert_lines() for the Name_Alias property.
12415 # The Unicode_1_Name field, contains most of these names. One would
12416 # expect, given the field's name, that its values would be fixed across
12417 # versions, giving the true Unicode version 1 name for the character.
12418 # Sadly, this is not the case. Actually Version 1.1.5 had no names for
12419 # any of the controls; Version 2.0 introduced names for the C0 controls,
12420 # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
12421 # changed some names: it
12422 # changed to parenthesized versions like "NEXT LINE" to
12423 # "NEXT LINE (NEL)";
12424 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12425 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;
12426 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12427 # This list contains all the names that were defined so that
12428 # charnames::vianame(), etc. understand them all EVEN if this version of
12429 # Unicode didn't specify them (this could be construed as a bug).
12430 # mktables elsewhere gives preference to the Unicode_1_Name field over
12431 # these names, so that viacode() will return the correct value for that
12432 # version of Unicode, except when that version doesn't define a name,
12433 # viacode() will return one anyway (this also could be construed as a
12434 # bug). But these potential "bugs" allow for the smooth working of code
12435 # on earlier Unicode releases.
12437 my @return = split /\n/, <<'END';
12439 0000;NUL;abbreviation
12440 0001;START OF HEADING;control
12441 0001;SOH;abbreviation
12442 0002;START OF TEXT;control
12443 0002;STX;abbreviation
12444 0003;END OF TEXT;control
12445 0003;ETX;abbreviation
12446 0004;END OF TRANSMISSION;control
12447 0004;EOT;abbreviation
12448 0005;ENQUIRY;control
12449 0005;ENQ;abbreviation
12450 0006;ACKNOWLEDGE;control
12451 0006;ACK;abbreviation
12453 0007;BEL;abbreviation
12454 0008;BACKSPACE;control
12455 0008;BS;abbreviation
12456 0009;CHARACTER TABULATION;control
12457 0009;HORIZONTAL TABULATION;control
12458 0009;HT;abbreviation
12459 0009;TAB;abbreviation
12460 000A;LINE FEED;control
12461 000A;LINE FEED (LF);control
12462 000A;NEW LINE;control
12463 000A;END OF LINE;control
12464 000A;LF;abbreviation
12465 000A;NL;abbreviation
12466 000A;EOL;abbreviation
12467 000B;LINE TABULATION;control
12468 000B;VERTICAL TABULATION;control
12469 000B;VT;abbreviation
12470 000C;FORM FEED;control
12471 000C;FORM FEED (FF);control
12472 000C;FF;abbreviation
12473 000D;CARRIAGE RETURN;control
12474 000D;CARRIAGE RETURN (CR);control
12475 000D;CR;abbreviation
12476 000E;SHIFT OUT;control
12477 000E;LOCKING-SHIFT ONE;control
12478 000E;SO;abbreviation
12479 000F;SHIFT IN;control
12480 000F;LOCKING-SHIFT ZERO;control
12481 000F;SI;abbreviation
12482 0010;DATA LINK ESCAPE;control
12483 0010;DLE;abbreviation
12484 0011;DEVICE CONTROL ONE;control
12485 0011;DC1;abbreviation
12486 0012;DEVICE CONTROL TWO;control
12487 0012;DC2;abbreviation
12488 0013;DEVICE CONTROL THREE;control
12489 0013;DC3;abbreviation
12490 0014;DEVICE CONTROL FOUR;control
12491 0014;DC4;abbreviation
12492 0015;NEGATIVE ACKNOWLEDGE;control
12493 0015;NAK;abbreviation
12494 0016;SYNCHRONOUS IDLE;control
12495 0016;SYN;abbreviation
12496 0017;END OF TRANSMISSION BLOCK;control
12497 0017;ETB;abbreviation
12498 0018;CANCEL;control
12499 0018;CAN;abbreviation
12500 0019;END OF MEDIUM;control
12501 0019;EOM;abbreviation
12502 001A;SUBSTITUTE;control
12503 001A;SUB;abbreviation
12504 001B;ESCAPE;control
12505 001B;ESC;abbreviation
12506 001C;INFORMATION SEPARATOR FOUR;control
12507 001C;FILE SEPARATOR;control
12508 001C;FS;abbreviation
12509 001D;INFORMATION SEPARATOR THREE;control
12510 001D;GROUP SEPARATOR;control
12511 001D;GS;abbreviation
12512 001E;INFORMATION SEPARATOR TWO;control
12513 001E;RECORD SEPARATOR;control
12514 001E;RS;abbreviation
12515 001F;INFORMATION SEPARATOR ONE;control
12516 001F;UNIT SEPARATOR;control
12517 001F;US;abbreviation
12518 0020;SP;abbreviation
12519 007F;DELETE;control
12520 007F;DEL;abbreviation
12521 0080;PADDING CHARACTER;figment
12522 0080;PAD;abbreviation
12523 0081;HIGH OCTET PRESET;figment
12524 0081;HOP;abbreviation
12525 0082;BREAK PERMITTED HERE;control
12526 0082;BPH;abbreviation
12527 0083;NO BREAK HERE;control
12528 0083;NBH;abbreviation
12530 0084;IND;abbreviation
12531 0085;NEXT LINE;control
12532 0085;NEXT LINE (NEL);control
12533 0085;NEL;abbreviation
12534 0086;START OF SELECTED AREA;control
12535 0086;SSA;abbreviation
12536 0087;END OF SELECTED AREA;control
12537 0087;ESA;abbreviation
12538 0088;CHARACTER TABULATION SET;control
12539 0088;HORIZONTAL TABULATION SET;control
12540 0088;HTS;abbreviation
12541 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12542 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12543 0089;HTJ;abbreviation
12544 008A;LINE TABULATION SET;control
12545 008A;VERTICAL TABULATION SET;control
12546 008A;VTS;abbreviation
12547 008B;PARTIAL LINE FORWARD;control
12548 008B;PARTIAL LINE DOWN;control
12549 008B;PLD;abbreviation
12550 008C;PARTIAL LINE BACKWARD;control
12551 008C;PARTIAL LINE UP;control
12552 008C;PLU;abbreviation
12553 008D;REVERSE LINE FEED;control
12554 008D;REVERSE INDEX;control
12555 008D;RI;abbreviation
12556 008E;SINGLE SHIFT TWO;control
12557 008E;SINGLE-SHIFT-2;control
12558 008E;SS2;abbreviation
12559 008F;SINGLE SHIFT THREE;control
12560 008F;SINGLE-SHIFT-3;control
12561 008F;SS3;abbreviation
12562 0090;DEVICE CONTROL STRING;control
12563 0090;DCS;abbreviation
12564 0091;PRIVATE USE ONE;control
12565 0091;PRIVATE USE-1;control
12566 0091;PU1;abbreviation
12567 0092;PRIVATE USE TWO;control
12568 0092;PRIVATE USE-2;control
12569 0092;PU2;abbreviation
12570 0093;SET TRANSMIT STATE;control
12571 0093;STS;abbreviation
12572 0094;CANCEL CHARACTER;control
12573 0094;CCH;abbreviation
12574 0095;MESSAGE WAITING;control
12575 0095;MW;abbreviation
12576 0096;START OF GUARDED AREA;control
12577 0096;START OF PROTECTED AREA;control
12578 0096;SPA;abbreviation
12579 0097;END OF GUARDED AREA;control
12580 0097;END OF PROTECTED AREA;control
12581 0097;EPA;abbreviation
12582 0098;START OF STRING;control
12583 0098;SOS;abbreviation
12584 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12585 0099;SGC;abbreviation
12586 009A;SINGLE CHARACTER INTRODUCER;control
12587 009A;SCI;abbreviation
12588 009B;CONTROL SEQUENCE INTRODUCER;control
12589 009B;CSI;abbreviation
12590 009C;STRING TERMINATOR;control
12591 009C;ST;abbreviation
12592 009D;OPERATING SYSTEM COMMAND;control
12593 009D;OSC;abbreviation
12594 009E;PRIVACY MESSAGE;control
12595 009E;PM;abbreviation
12596 009F;APPLICATION PROGRAM COMMAND;control
12597 009F;APC;abbreviation
12598 00A0;NBSP;abbreviation
12599 00AD;SHY;abbreviation
12600 200B;ZWSP;abbreviation
12601 200C;ZWNJ;abbreviation
12602 200D;ZWJ;abbreviation
12603 200E;LRM;abbreviation
12604 200F;RLM;abbreviation
12605 202A;LRE;abbreviation
12606 202B;RLE;abbreviation
12607 202C;PDF;abbreviation
12608 202D;LRO;abbreviation
12609 202E;RLO;abbreviation
12610 FEFF;BYTE ORDER MARK;alternate
12611 FEFF;BOM;abbreviation
12612 FEFF;ZWNBSP;abbreviation
12615 if ($v_version ge v3.0.0) {
12616 push @return, split /\n/, <<'END';
12617 180B; FVS1; abbreviation
12618 180C; FVS2; abbreviation
12619 180D; FVS3; abbreviation
12620 180E; MVS; abbreviation
12621 202F; NNBSP; abbreviation
12625 if ($v_version ge v3.2.0) {
12626 push @return, split /\n/, <<'END';
12627 034F; CGJ; abbreviation
12628 205F; MMSP; abbreviation
12629 2060; WJ; abbreviation
12632 my $cp = 0xFE00 - 1;
12633 for my $i (1..16) {
12634 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12637 if ($v_version ge v4.0.0) { # Add in VS17..VS256
12638 my $cp = 0xE0100 - 17;
12639 for my $i (17..256) {
12640 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12644 # ALERT did not come along until 6.0, at which point it became preferred
12645 # over BELL, and was never in the Unicode_1_Name field. For the same
12646 # reasons, that the other names are made known to all releases by this
12647 # function, we make ALERT known too. By inserting it
12648 # last in early releases, BELL is preferred over it; and vice-versa in 6.0
12649 my $alert = '0007; ALERT; control';
12650 if ($v_version lt v6.0.0) {
12651 push @return, $alert;
12654 unshift @return, $alert;
12660 sub filter_later_version_name_alias_line {
# The input is $_, which is split into its fields and then re-assembled into
# the form the rest of this program expects.
12662 # This file has an extra entry per line for the alias type. This is
12663 # handled by creating a compound entry: "$alias: $type"; First, split
12664 # the line into components.
12665 my ($range, $alias, $type, @remainder)
12666 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12668 # This file contains multiple entries for some components, so tell the
12669 # downstream code to allow this in our internal tables; the
12670 # $MULTIPLE_AFTER preserves the input ordering.
12671 $_ = join ";", $range, $CMD_DELIM
12681 sub filter_early_version_name_alias_line {
12683 # Early versions did not have the trailing alias type field; implicitly it
12684 # was 'correction'. But our synthetic lines we add in this program do
12685 # have it, so test for the type field.
# A line that doesn't contain at least two semi-colons has no type field.
12686 $_ .= "; correction" if $_ !~ /;.*;/;
# From here on, the line is handled exactly like one from a later version;
# the sub called works on the $_ just adjusted.
12688 filter_later_version_name_alias_line;
12692 sub finish_Unicode() {
12693 # This routine should be called after all the Unicode files have been read
12695 # 1) Creates properties that are missing from the version of Unicode being
12696 # compiled, and which, for whatever reason, are needed for the Perl
12697 # core to function properly. These are minimally populated as
12699 # 2) Adds the mappings for code points missing from the files which have
12700 # defaults specified for them.
12701 # 3) At this point all mappings are known, so it computes the type of
12702 # each property whose type hasn't been determined yet.
12703 # 4) Calculates all the regular expression match tables based on the
12705 # 5) Calculates and adds the tables which are defined by Unicode, but
12706 # which aren't derived by them, and certain derived tables that Perl
12709 # Folding information was introduced later into Unicode data. To get
12710 # Perl's case ignore (/i) to work at all in releases that don't have
12711 # folding, use the best available alternative, which is lower casing.
12712 my $fold = property_ref('Case_Folding');
12713 if ($fold->is_empty) {
12714 $fold->initialize(property_ref('Lowercase_Mapping'));
12715 $fold->add_note(join_lines(<<END
12716 WARNING: This table uses lower case as a substitute for missing fold
12722 # Multiple-character mapping was introduced later into Unicode data, so it
12723 # is by default the simple version. If to output the simple versions and
12724 # not present, just use the regular (which in these Unicode versions is
12725 # the simple as well).
12726 foreach my $map (qw { Uppercase_Mapping
12732 my $comment = <<END;
12734 Note that although the Perl core uses this file, it has the standard values
12735 for code points from U+0000 to U+00FF compiled in, so changing this table will
12736 not change the core's behavior with respect to these code points. Use
12737 Unicode::Casing to override this table.
12739 if ($map eq 'Case_Folding') {
12741 (/i regex matching is not overridable except by using a custom regex engine)
12744 property_ref($map)->add_comment(join_lines($comment));
12745 my $simple = property_ref("Simple_$map");
12746 next if ! $simple->is_empty;
12747 if ($simple->to_output_map) {
12748 $simple->initialize(property_ref($map));
12751 property_ref($map)->set_proxy_for($simple->name);
12755 # For each property, fill in any missing mappings, and calculate the re
12756 # match tables. If a property has more than one missing mapping, the
12757 # default is a reference to a data structure, and requires data from other
12758 # properties to resolve. The sort is used to cause these to be processed
12759 # last, after all the other properties have been calculated.
12760 # (Fortunately, the missing properties so far don't depend on each other.)
12761 foreach my $property
12762 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12765 # $perl has been defined, but isn't one of the Unicode properties that
12766 # need to be finished up.
12767 next if $property == $perl;
12769 # Nor do we need to do anything with properties that aren't going to
12771 next if $property->fate == $SUPPRESSED;
12773 # Handle the properties that have more than one possible default
12774 if (ref $property->default_map) {
12775 my $default_map = $property->default_map;
12777 # These properties have stored in the default_map:
12779 # 1) A default map which applies to all code points in a
12781 # 2) an expression which will evaluate to the list of code
12782 # points in that class
12784 # 3) the default map which applies to every other missing code
12787 # Go through each list.
12788 while (my ($default, $eval) = $default_map->get_next_defaults) {
12790 # Get the class list, and intersect it with all the so-far
12791 # unspecified code points yielding all the code points
12792 # in the class that haven't been specified.
12793 my $list = eval $eval;
12795 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12799 # Narrow down the list to just those code points we don't have
12801 $list = $list & $property->inverse_list;
12803 # Add mappings to the property for each code point in the list
12804 foreach my $range ($list->ranges) {
12805 $property->add_map($range->start, $range->end, $default,
12806 Replace => $CROAK);
12810 # All remaining code points have the other mapping. Set that up
12811 # so the normal single-default mapping code will work on them
12812 $property->set_default_map($default_map->other_default);
12814 # And fall through to do that
12817 # We should have enough data now to compute the type of the property.
12818 my $property_name = $property->name;
12819 $property->compute_type;
12820 my $property_type = $property->type;
12822 next if ! $property->to_create_match_tables;
12824 # Here want to create match tables for this property
12826 # The Unicode db always (so far, and they claim into the future) has
12827 # the default for missing entries in binary properties be 'N' (unless
12828 # there is a '@missing' line that specifies otherwise)
12829 if (! defined $property->default_map) {
12830 if ($property_type == $BINARY) {
12831 $property->set_default_map('N');
12833 elsif ($property_type == $ENUM) {
12834 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one");
12835 $property->set_default_map('XXX This makes sure there is a default map');
12839 # Add any remaining code points to the mapping, using the default for
12840 # missing code points.
12842 if (defined (my $default_map = $property->default_map)) {
12844 # Make sure there is a match table for the default
12845 if (! defined ($default_table = $property->table($default_map))) {
12846 $default_table = $property->add_match_table($default_map);
12849 # And, if the property is binary, the default table will just
12850 # be the complement of the other table.
12851 if ($property_type == $BINARY) {
12852 my $non_default_table;
12854 # Find the non-default table.
12855 for my $table ($property->tables) {
12856 next if $table == $default_table;
12857 $non_default_table = $table;
12859 $default_table->set_complement($non_default_table);
12863 # This fills in any missing values with the default. It's not
12864 # necessary to do this with binary properties, as the default
12865 # is defined completely in terms of the Y table.
12866 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
12867 $default_map, Replace => $NO);
12871 # Have all we need to populate the match tables.
12872 my $maps_should_be_defined = $property->pre_declared_maps;
12873 foreach my $range ($property->ranges) {
12874 my $map = $range->value;
12875 my $table = $property->table($map);
12876 if (! defined $table) {
12878 # Integral and rational property values are not necessarily
12879 # defined in PropValueAliases, but whether all the other ones
12880 # should be depends on the property.
12881 if ($maps_should_be_defined
12882 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12884 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
12886 $table = $property->add_match_table($map);
12889 next if $table->complement != 0; # Don't need to populate these
12890 $table->add_range($range->start, $range->end);
12893 # A forced binary property has additional true/false tables which
12894 # should have been set up when it was forced into binary. The false
12895 # table matches exactly the same set as the property's default table.
12896 # The true table matches the complement of that. The false table is
12897 # not the same as an additional set of aliases on top of the default
12898 # table, so use 'set_equivalent_to'. If it were implemented as
12899 # additional aliases, various things would have to be adjusted, but
12900 # especially, if the user wants to get a list of names for the table
12901 # using Unicode::UCD::prop_value_aliases(), s/he should get a
12902 # different set depending on whether they want the default table or
12904 if ($property_type == $FORCED_BINARY) {
12905 $property->table('N')->set_equivalent_to($default_table,
12907 $property->table('Y')->set_complement($default_table);
12910 # For Perl 5.6 compatibility, all properties matchable in regexes can
12911 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
12912 # But warn if this creates a conflict with a (new) Unicode property
12913 # name, although it appears that Unicode has made a decision never to
12914 # begin a property name with 'Is_', so this shouldn't happen.
12915 foreach my $alias ($property->aliases) {
12916 my $Is_name = 'Is_' . $alias->name;
12917 if (defined (my $pre_existing = property_ref($Is_name))) {
12918 Carp::my_carp(<<END
12919 There is already an alias named $Is_name (from " . $pre_existing . "), so
12920 creating one for $property won't work. This is bad news. If it is not too
12921 late, get Unicode to back off. Otherwise go back to the old scheme (findable
12922 from the git blame log for this area of the code that suppressed individual
12923 aliases that conflict with the new Unicode names. Proceeding anyway.
12927 } # End of loop through aliases for this property
12928 } # End of loop through all Unicode properties.
12930 # Fill in the mappings that Unicode doesn't completely furnish. First the
12931 # single letter major general categories. If Unicode were to start
12932 # delivering the values, this would be redundant, but better that than to
12933 # try to figure out if should skip and not get it right. This could happen
12934 # if a new major category were to be introduced, and the hard-coded test
12935 # wouldn't know about it.
12936 # This routine depends on the standard names for the general categories
12937 # being what it thinks they are, like 'Cn'. The major categories are the
12938 # union of all the general category tables which have the same first
12939 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12940 foreach my $minor_table ($gc->tables) {
12941 my $minor_name = $minor_table->name;
12942 next if length $minor_name == 1;
12943 if (length $minor_name != 2) {
12944 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
12948 my $major_name = uc(substr($minor_name, 0, 1));
12949 my $major_table = $gc->table($major_name);
12950 $major_table += $minor_table;
12953 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
12954 # defines it as LC)
12955 my $LC = $gc->table('LC');
12956 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
12957 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
12960 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12961 # deliver the correct values in it
12962 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12964 # Lt not in release 1.
12965 if (defined $gc->table('Lt')) {
12966 $LC += $gc->table('Lt');
12967 $gc->table('Lt')->set_caseless_equivalent($LC);
12970 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12972 $gc->table('Ll')->set_caseless_equivalent($LC);
12973 $gc->table('Lu')->set_caseless_equivalent($LC);
12975 my $Cs = $gc->table('Cs');
12977 # Create digit and case fold tables with the original file names for
12978 # backwards compatibility with applications that read them directly.
12979 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12981 File => 'Digit', # Trad. location
12982 Directory => $map_directory,
12984 Replacement_Property => "Perl_Decimal_Digit",
12985 Initialize => property_ref('Perl_Decimal_Digit'),
12987 $Digit->add_comment(join_lines(<<END
12988 This file gives the mapping of all code points which represent a single
12989 decimal digit [0-9] to their respective digits. For example, the code point
12990 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
12991 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12996 Property->new('Legacy_Case_Folding',
12998 Directory => $map_directory,
12999 Default_Map => $CODE_POINT,
13001 Replacement_Property => "Case_Folding",
13002 Format => $HEX_FORMAT,
13003 Initialize => property_ref('cf'),
13006 # The Script_Extensions property started out as a clone of the Script
13007 # property. But processing its data file caused some elements to be
13008 # replaced with different data. (These elements were for the Common and
13009 # Inherited properties.) This data is a qw() list of all the scripts that
13010 # the code points in the given range are in. An example line is:
13011 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
13013 # The code above has created a new match table named "Arab Syrc Thaa"
13014 # which contains 060C. (The cloned table started out with this code point
13015 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
13016 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
13017 # match table. This is repeated for all these tables and ranges. The map
13018 # data is retained in the map table for reference, but the spurious match
13019 # tables are deleted.
13021 my $scx = property_ref("Script_Extensions");
13022 if (defined $scx) {
13023 foreach my $table ($scx->tables) {
13024 next unless $table->name =~ /\s/; # All the new and only the new
13025 # tables have a space in their
13027 my @scripts = split /\s+/, $table->name;
13028 foreach my $script (@scripts) {
13029 my $script_table = $scx->table($script);
13030 $script_table += $table;
13032 $scx->delete_match_table($table);
13039 sub pre_3_dot_1_Nl () {
13041 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13042 # is when Unicode's own data for it became fully usable. These code points
13043 # were determined by inspection and experimentation. gc=nl is important for
13044 # certain Perl-extension properties that should be available in all
# Unicode versions.
#
# Takes no arguments (empty prototype). Reads $gc and $v_version, which are
# not declared here and so come from the enclosing scope.
# Start from an empty list and build it up below.
13047 my $Nl = Range_List->new();
# $official is the gc=Nl match table, when this Unicode version defines one.
13048 if (defined (my $official = $gc->table('Nl'))) {
# Hard-coded code point ranges (per the comment above, found by inspection
# and experimentation).
13052 $Nl->add_range(0x2160, 0x2182);
13053 $Nl->add_range(0x3007, 0x3007);
13054 $Nl->add_range(0x3021, 0x3029);
13056 $Nl->add_range(0xFE20, 0xFE23);
13057 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # Only from 3.0 on; presumably 3.0 was when these were assigned
13062 sub compile_perl() {
13063 # Create perl-defined tables. Almost all are part of the pseudo-property
13064 # named 'perl' internally to this program. Many of these are recommended
13065 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
13066 # on those found there.
13067 # Almost all of these are equivalent to some Unicode property.
13068 # A number of these properties have equivalents restricted to the ASCII
13069 # range, with their names prefaced by 'Posix', to signify that these match
13070 # what the Posix standard says they should match. A couple are
13071 # effectively this, but the name doesn't have 'Posix' in it because there
13072 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
13073 # to the full Unicode range, by our guesses as to what is appropriate.
13075 # 'Any' is all code points. As an error check, instead of just setting it
13076 # to be that, construct it to be the union of all the major categories
13077 $Any = $perl->add_match_table('Any',
13078 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
13081 foreach my $major_table ($gc->tables) {
13083 # Major categories are the ones with single letter names.
13084 next if length($major_table->name) != 1;
13086 $Any += $major_table;
13089 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
13090 Carp::my_carp_bug("Generated highest code point ("
13091 . sprintf("%X", $Any->max)
13092 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
13094 if ($Any->range_count != 1 || $Any->min != 0) {
13095 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
13098 $Any->add_alias('All');
13100 # Assigned is the opposite of gc=unassigned
13101 my $Assigned = $perl->add_match_table('Assigned',
13102 Description => "All assigned code points",
13103 Initialize => ~ $gc->table('Unassigned'),
13106 # Our internal-only property should be treated as more than just a
13107 # synonym; grandfather it in to the pod.
13108 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
13109 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
13110 ->set_equivalent_to(property_ref('ccc')->table('Above'),
13113 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
13114 if (defined $block) { # This is equivalent to the block if have it.
13115 my $Unicode_ASCII = $block->table('Basic_Latin');
13116 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
13117 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
13121 # Very early releases didn't have blocks, so initialize ASCII ourselves if
13123 if ($ASCII->is_empty) {
13124 if (! NON_ASCII_PLATFORM) {
13125 $ASCII->add_range(0, 127);
13128 for my $i (0 .. 127) {
13129 $ASCII->add_range(utf8::unicode_to_native($i),
13130 utf8::unicode_to_native($i));
13135 # Get the best available case definitions. Early Unicode versions didn't
13136 # have Uppercase and Lowercase defined, so use the general category
13137 # instead for them, modified by hard-coding in the code points each is
13139 my $Lower = $perl->add_match_table('Lower');
13140 my $Unicode_Lower = property_ref('Lowercase');
13141 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
13142 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
13146 $Lower += $gc->table('Lowercase_Letter');
13148 # There are quite a few code points in Lower, that aren't in gc=lc,
13149 # and not all are in all releases.
13150 foreach my $code_point ( utf8::unicode_to_native(0xAA),
13151 utf8::unicode_to_native(0xBA),
13169 # Don't include the code point unless it is assigned in this
13171 my $category = $gc->value_of(hex $code_point);
13172 next if ! defined $category || $category eq 'Cn';
13174 $Lower += $code_point;
13177 $Lower->add_alias('XPosixLower');
13178 my $Posix_Lower = $perl->add_match_table("PosixLower",
13179 Description => "[a-z]",
13180 Initialize => $Lower & $ASCII,
13183 my $Upper = $perl->add_match_table('Upper');
13184 my $Unicode_Upper = property_ref('Uppercase');
13185 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
13186 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
13190 # Unlike Lower, there are only two ranges in Upper that aren't in
13191 # gc=Lu, and all code points were assigned in all releases.
13192 $Upper += $gc->table('Uppercase_Letter');
13193 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
13194 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
13196 $Upper->add_alias('XPosixUpper');
13197 my $Posix_Upper = $perl->add_match_table("PosixUpper",
13198 Description => "[A-Z]",
13199 Initialize => $Upper & $ASCII,
13202 # Earliest releases didn't have title case. Initialize it to empty if not
13203 # otherwise present
13204 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
13205 Description => '(= \p{Gc=Lt})');
13206 my $lt = $gc->table('Lt');
13208 # Earlier versions of mktables had this related to $lt since they have
13209 # identical code points, but their caseless equivalents are not the same,
13210 # one being 'Cased' and the other being 'LC', and so now must be kept as
13211 # separate entities.
13216 push @tables_that_may_be_empty, $Title->complete_name;
13219 my $Unicode_Cased = property_ref('Cased');
13220 if (defined $Unicode_Cased) {
13221 my $yes = $Unicode_Cased->table('Y');
13222 my $no = $Unicode_Cased->table('N');
13223 $Title->set_caseless_equivalent($yes);
13224 if (defined $Unicode_Upper) {
13225 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
13226 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
13228 $Upper->set_caseless_equivalent($yes);
13229 if (defined $Unicode_Lower) {
13230 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
13231 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
13233 $Lower->set_caseless_equivalent($yes);
13236 # If this Unicode version doesn't have Cased, set up the Perl
13237 # extension from first principles. From Unicode 5.1: Definition D120:
13238 # A character C is defined to be cased if and only if C has the
13239 # Lowercase or Uppercase property or has a General_Category value of
13240 # Titlecase_Letter.
13241 my $cased = $perl->add_match_table('Cased',
13242 Initialize => $Lower + $Upper + $Title,
13243 Description => 'Uppercase or Lowercase or Titlecase',
13245 # $notcased is purely for the caseless equivalents below
13246 my $notcased = $perl->add_match_table('_Not_Cased',
13247 Initialize => ~ $cased,
13248 Fate => $INTERNAL_ONLY,
13249 Description => 'All not-cased code points');
13250 $Title->set_caseless_equivalent($cased);
13251 if (defined $Unicode_Upper) {
13252 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
13253 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
13255 $Upper->set_caseless_equivalent($cased);
13256 if (defined $Unicode_Lower) {
13257 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
13258 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
13260 $Lower->set_caseless_equivalent($cased);
13263 # Similarly, set up our own Case_Ignorable property if this Unicode
13264 # version doesn't have it. From Unicode 5.1: Definition D121: A character
13265 # C is defined to be case-ignorable if C has the value MidLetter or the
13266 # value MidNumLet for the Word_Break property or its General_Category is
13267 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13268 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13270 # Perl has long had an internal-only alias for this property; grandfather
13271 # it in to the pod, but discourage its use.
13272 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13274 Fate => $INTERNAL_ONLY,
13275 Status => $DISCOURAGED);
13276 my $case_ignorable = property_ref('Case_Ignorable');
13277 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13278 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13283 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13285 # The following three properties are not in early releases
13286 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13287 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13288 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13290 # For versions 4.1 - 5.0, there is no MidNumLet property, and
13291 # correspondingly the case-ignorable definition lacks that one. For
13292 # 4.0, it appears that it was meant to be the same definition, but was
13293 # inadvertently omitted from the standard's text, so add it if the
13294 # property actually is there
13295 my $wb = property_ref('Word_Break');
13297 my $midlet = $wb->table('MidLetter');
13298 $perl_case_ignorable += $midlet if defined $midlet;
13299 my $midnumlet = $wb->table('MidNumLet');
13300 $perl_case_ignorable += $midnumlet if defined $midnumlet;
13304 # In earlier versions of the standard, instead of the above two
13305 # properties, just the following characters were used:
13306 $perl_case_ignorable +=
13308 + utf8::unicode_to_native(0xAD) # SOFT HYPHEN (SHY)
13309 + 0x2019; # RIGHT SINGLE QUOTATION MARK
13313 # The remaining perl defined tables are mostly based on Unicode TR 18,
13314 # "Annex C: Compatibility Properties". All of these have two versions,
13315 # one whose name generally begins with Posix that is posix-compliant, and
13316 # one that matches Unicode characters beyond the Posix, ASCII range
13318 my $Alpha = $perl->add_match_table('Alpha');
13320 # Alphabetic was not present in early releases
13321 my $Alphabetic = property_ref('Alphabetic');
13322 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13323 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13327 # The Alphabetic property doesn't exist for early releases, so
13328 # generate it. The actual definition, in 5.2 terms is:
13330 # gc=L + gc=Nl + Other_Alphabetic
13332 # Other_Alphabetic is also not defined in these early releases, but it
13333 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13334 # those last two as well, then subtract the relatively few of them that
13335 # shouldn't have been added. (The gc=So range is the circled capital
13336 # Latin characters. Early releases mistakenly didn't also include the
13337 # lower-case versions of these characters, and so we don't either, to
13338 # maintain consistency with those releases that first had this
13340 $Alpha->initialize($gc->table('Letter')
13345 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
13346 foreach my $range ( [ 0x0300, 0x0344 ],
13347 [ 0x0346, 0x034E ],
13348 [ 0x0360, 0x0362 ],
13349 [ 0x0483, 0x0486 ],
13350 [ 0x0591, 0x05AF ],
13351 [ 0x06DF, 0x06E0 ],
13352 [ 0x06EA, 0x06EC ],
13353 [ 0x0740, 0x074A ],
13356 [ 0x0951, 0x0954 ],
13370 [ 0x0E47, 0x0E4C ],
13372 [ 0x0EC8, 0x0ECC ],
13373 [ 0x0F18, 0x0F19 ],
13377 [ 0x0F3E, 0x0F3F ],
13378 [ 0x0F82, 0x0F84 ],
13379 [ 0x0F86, 0x0F87 ],
13383 [ 0x17C9, 0x17D3 ],
13384 [ 0x20D0, 0x20DC ],
13386 [ 0x302A, 0x302F ],
13387 [ 0x3099, 0x309A ],
13388 [ 0xFE20, 0xFE23 ],
13389 [ 0x1D165, 0x1D169 ],
13390 [ 0x1D16D, 0x1D172 ],
13391 [ 0x1D17B, 0x1D182 ],
13392 [ 0x1D185, 0x1D18B ],
13393 [ 0x1D1AA, 0x1D1AD ],
13396 $Alpha->delete_range($range->[0], $range->[1]);
13399 $Alpha->delete_range($range, $range);
13402 $Alpha->add_description('Alphabetic');
13403 $Alpha->add_alias('Alphabetic');
13405 $Alpha->add_alias('XPosixAlpha');
13406 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13407 Description => "[A-Za-z]",
13408 Initialize => $Alpha & $ASCII,
13410 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13411 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13413 my $Alnum = $perl->add_match_table('Alnum',
13414 Description => 'Alphabetic and (decimal) Numeric',
13415 Initialize => $Alpha + $gc->table('Decimal_Number'),
13417 $Alnum->add_alias('XPosixAlnum');
13418 $perl->add_match_table("PosixAlnum",
13419 Description => "[A-Za-z0-9]",
13420 Initialize => $Alnum & $ASCII,
13423 my $Word = $perl->add_match_table('Word',
13424 Description => '\w, including beyond ASCII;'
13425 . ' = \p{Alnum} + \pM + \p{Pc}',
13426 Initialize => $Alnum + $gc->table('Mark'),
13428 $Word->add_alias('XPosixWord');
13429 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13434 $Word += ord('_'); # Make sure this is a $Word
13436 my $JC = property_ref('Join_Control'); # Wasn't in release 1
13438 $Word += $JC->table('Y');
13441 $Word += 0x200C + 0x200D;
13444 # This is a Perl extension, so the name doesn't begin with Posix.
13445 my $PerlWord = $perl->add_match_table('PerlWord',
13446 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13447 Initialize => $Word & $ASCII,
13449 $PerlWord->add_alias('PosixWord');
13451 my $Blank = $perl->add_match_table('Blank',
13452 Description => '\h, Horizontal white space',
13454 # 200B is Zero Width Space which is for line
13455 # break control, and was listed as
13456 # Space_Separator in early releases
13457 Initialize => $gc->table('Space_Separator')
13461 $Blank->add_alias('HorizSpace'); # Another name for it.
13462 $Blank->add_alias('XPosixBlank');
13463 $perl->add_match_table("PosixBlank",
13464 Description => "\\t and ' '",
13465 Initialize => $Blank & $ASCII,
13468 my $VertSpace = $perl->add_match_table('VertSpace',
13469 Description => '\v',
13471 $gc->table('Line_Separator')
13472 + $gc->table('Paragraph_Separator')
13473 + utf8::unicode_to_native(0x0A) # LINE FEED
13474 + utf8::unicode_to_native(0x0B) # VERTICAL TAB
13476 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
13477 + utf8::unicode_to_native(0x85) # NEL
13479 # No Posix equivalent for vertical space
13481 my $Space = $perl->add_match_table('Space',
13482 Description => '\s including beyond ASCII and vertical tab',
13483 Initialize => $Blank + $VertSpace,
13485 $Space->add_alias('XPosixSpace');
13486 my $posix_space = $perl->add_match_table("PosixSpace",
13487 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
13488 Initialize => $Space & $ASCII,
13491 # Perl's traditional space doesn't include Vertical Tab prior to v5.18
13492 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13493 Description => '\s, including beyond ASCII',
13494 Initialize => $Space,
13495 #Initialize => $Space
13496 # - utf8::unicode_to_native(0x0B)
13498 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
13499 my $PerlSpace = $perl->add_match_table('PerlSpace',
13500 Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13501 Initialize => $XPerlSpace & $ASCII,
13505 my $Cntrl = $perl->add_match_table('Cntrl',
13506 Description => 'Control characters');
13507 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13508 $Cntrl->add_alias('XPosixCntrl');
13509 $perl->add_match_table("PosixCntrl",
13510 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13511 Initialize => $Cntrl & $ASCII,
13514 # $controls is a temporary used to construct Graph.
13515 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13516 + $gc->table('Control'));
13517 # Cs not in release 1
13518 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13520 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
13521 my $Graph = $perl->add_match_table('Graph',
13522 Description => 'Characters that are graphical',
13523 Initialize => ~ ($Space + $controls),
13525 $Graph->add_alias('XPosixGraph');
13526 $perl->add_match_table("PosixGraph",
13528 '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
13529 Initialize => $Graph & $ASCII,
13532 $print = $perl->add_match_table('Print',
13533 Description => 'Characters that are graphical plus space characters (but no controls)',
13534 Initialize => $Blank + $Graph - $gc->table('Control'),
13536 $print->add_alias('XPosixPrint');
13537 $perl->add_match_table("PosixPrint",
13539 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13540 Initialize => $print & $ASCII,
13543 my $Punct = $perl->add_match_table('Punct');
13544 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13546 # \p{punct} doesn't include the symbols, which posix does
13547 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13548 Description => '\p{Punct} + ASCII-range \p{Symbol}',
13549 Initialize => $gc->table('Punctuation')
13550 + ($ASCII & $gc->table('Symbol')),
13551 Perl_Extension => 1
13553 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13554 Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13555 Initialize => $ASCII & $XPosixPunct,
13558 my $Digit = $perl->add_match_table('Digit',
13559 Description => '[0-9] + all other decimal digits');
13560 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13561 $Digit->add_alias('XPosixDigit');
13562 my $PosixDigit = $perl->add_match_table("PosixDigit",
13563 Description => '[0-9]',
13564 Initialize => $Digit & $ASCII,
13567 # Hex_Digit was not present in first release
13568 my $Xdigit = $perl->add_match_table('XDigit');
13569 $Xdigit->add_alias('XPosixXDigit');
13570 my $Hex = property_ref('Hex_Digit');
13571 if (defined $Hex && ! $Hex->is_empty) {
13572 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13575 $Xdigit->initialize([ ord('0') .. ord('9'),
13576 ord('A') .. ord('F'),
13577 ord('a') .. ord('f'),
13578 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13579 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13582 # AHex was not present in early releases
13583 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13584 my $AHex = property_ref('ASCII_Hex_Digit');
13585 if (defined $AHex && ! $AHex->is_empty) {
13586 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13589 $PosixXDigit->initialize($Xdigit & $ASCII);
13590 $PosixXDigit->add_alias('AHex');
13591 $PosixXDigit->add_alias('Ascii_Hex_Digit');
13593 $PosixXDigit->add_description('[0-9A-Fa-f]');
13595 my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
13596 Description => "Code points that particpate in some fold",
13599 foreach my $range (property_ref('Case_Folding')->ranges) {
13600 $any_folds->add_range($range->start, $range->end);
13601 foreach my $hex_code_point (split " ", $range->value) {
13602 my $code_point = hex $hex_code_point;
13603 $any_folds->add_range($code_point, $code_point);
13607 my $dt = property_ref('Decomposition_Type');
13608 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13609 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13610 Perl_Extension => 1,
13611 Note => 'Union of all non-canonical decompositions',
13614 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13615 # than SD appeared, construct it ourselves, based on the first release SD
13616 # was in. A pod entry is grandfathered in for it
13617 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13618 Perl_Extension => 1,
13619 Fate => $INTERNAL_ONLY,
13620 Status => $DISCOURAGED);
13621 my $soft_dotted = property_ref('Soft_Dotted');
13622 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13623 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13627 # This list came from 3.2 Soft_Dotted; all of these code points are in
13629 $CanonDCIJ->initialize([ ord('i'),
13638 $CanonDCIJ = $CanonDCIJ & $Assigned;
13641 # For backward compatibility, Perl has its own definition for IDStart.
13642 # It is regular XID_Start plus the underscore, but all characters must be
13643 # Word characters as well
13644 my $XID_Start = property_ref('XID_Start');
13645 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13646 Perl_Extension => 1,
13647 Fate => $INTERNAL_ONLY,
13648 Initialize => ord('_')
13650 if (defined $XID_Start
13651 || defined ($XID_Start = property_ref('ID_Start')))
13653 $perl_xids += $XID_Start->table('Y');
13656 # For Unicode versions that don't have the property, construct our own
13657 # from first principles. The actual definition is:
13659 # + letter numbers (Nl)
13661 # - Pattern_White_Space
13662 # + stability extensions
13663 # - NFKC modifications
13665 # What we do in the code below is to include the identical code points
13666 # that are in the first release that had Unicode's version of this
13667 # property, essentially extrapolating backwards. There were no
13668 # stability extensions until v4.1, so none are included; likewise in
13669 # no Unicode version so far do subtracting PatSyn and PatWS make any
13670 # difference, so those also are ignored.
13671 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
13673 # We do subtract the NFKC modifications that are in the first version
13674 # that had this property. We don't bother to test if they are in the
13675 # version in question, because if they aren't, the operation is a
13676 # no-op. The NFKC modifications are discussed in
13677 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
13678 foreach my $range ( 0x037A,
13681 [ 0xFC5E, 0xFC63 ],
13682 [ 0xFDFA, 0xFE70 ],
13683 [ 0xFE72, 0xFE76 ],
13688 [ 0xFF9E, 0xFF9F ],
13691 $perl_xids->delete_range($range->[0], $range->[1]);
13694 $perl_xids->delete_range($range, $range);
13699 $perl_xids &= $Word;
13701 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
13702 Perl_Extension => 1,
13703 Fate => $INTERNAL_ONLY);
13704 my $XIDC = property_ref('XID_Continue');
13706 || defined ($XIDC = property_ref('ID_Continue')))
13708 $perl_xidc += $XIDC->table('Y');
13711 # Similarly, we construct our own XIDC if necessary for early Unicode
13712 # versions. The definition is:
13713 # everything in XIDS
13719 # - Pattern_White_Space
13720 # + stability extensions
13721 # - NFKC modifications
13723 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
13724 # and stability extensions. There is a somewhat different set of NFKC
13725 # mods to remove (and add in this case). The ones below make this
13726 # have identical code points as in the first release that defined it.
13727 $perl_xidc += $perl_xids
13732 + utf8::unicode_to_native(0xB7)
13734 if (defined (my $pc = $gc->table('Pc'))) {
13737 else { # 1.1.5 didn't have Pc, but these should have been in it
13738 $perl_xidc += 0xFF3F;
13739 $perl_xidc->add_range(0x203F, 0x2040);
13740 $perl_xidc->add_range(0xFE33, 0xFE34);
13741 $perl_xidc->add_range(0xFE4D, 0xFE4F);
13744 # Subtract the NFKC mods
13745 foreach my $range ( 0x037A,
13746 [ 0xFC5E, 0xFC63 ],
13747 [ 0xFDFA, 0xFE1F ],
13749 [ 0xFE72, 0xFE76 ],
13756 $perl_xidc->delete_range($range->[0], $range->[1]);
13759 $perl_xidc->delete_range($range, $range);
13764 $perl_xidc &= $Word;
13766 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
13767 Perl_Extension => 1,
13768 Fate => $INTERNAL_ONLY,
13769 Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
13772 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
13773 Perl_Extension => 1,
13774 Fate => $INTERNAL_ONLY,
13775 Initialize => $perl_xidc
13780 + utf8::unicode_to_native(0xA0) # NBSP
13783 # These two tables are for matching \X, which is based on the 'extended'
13784 # grapheme cluster, which came in 5.1; create empty ones if not already
13785 # present. The straight 'grapheme cluster' (non-extended) is used prior
13786 # to 5.1, and differs from the extended (see
13787 # http://www.unicode.org/reports/tr29/) only by these two tables, so we
13788 # get the older definition automatically when they are empty.
13789 my $gcb = property_ref('Grapheme_Cluster_Break');
13790 my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
13791 Perl_Extension => 1,
13792 Fate => $INTERNAL_ONLY);
13793 if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
13794 $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
13797 push @tables_that_may_be_empty, $perl_prepend->complete_name;
13800 # All the tables with _X_ in their names are used in defining \X handling,
13801 # and are based on the Unicode GCB property. Basically, \X matches:
13803 # | Prepend* Begin Extend*
13805 # Begin is: ( Special_Begin | ! Control )
13806 # Begin is also: ( Regular_Begin | Special_Begin )
13807 # where Regular_Begin is defined as ( ! Control - Special_Begin )
13808 # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
13809 # Extend is: ( Grapheme_Extend | Spacing_Mark )
13810 # Control is: [ GCB_Control | CR | LF ]
13811 # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
13813 foreach my $gcb_name (qw{ L V T LV LVT }) {
13815 # The perl internal extension's name is the gcb table name prepended
13817 my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
13818 Perl_Extension => 1,
13819 Fate => $INTERNAL_ONLY,
13820 Initialize => $gcb->table($gcb_name),
13822 # Version 1 had mostly different Hangul syllables that were removed
13823 # from later versions, so some of the tables may not apply.
13824 if ($v_version lt v2.0) {
13825 push @tables_that_may_be_empty, $perl_table->complete_name;
13829 # More GCB. Populate a combined hangul syllables table
13830 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
13831 Perl_Extension => 1,
13832 Fate => $INTERNAL_ONLY);
13833 $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
13834 $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
13836 my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
13837 Fate => $INTERNAL_ONLY);
13838 if ($v_version ge v6.2) {
13839 $ri += $gcb->table('RI');
13842 push @tables_that_may_be_empty, $ri->full_name;
13845 my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
13846 Perl_Extension => 1,
13847 Fate => $INTERNAL_ONLY,
13848 Initialize => $lv_lvt_v
13853 $specials_begin->add_comment(join_lines( <<END
13854 For use in \\X; matches first (perhaps only) character of potential
13855 multi-character sequences that can begin an extended grapheme cluster. They
13856 need special handling because of their complicated nature.
13859 my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
13860 Perl_Extension => 1,
13861 Fate => $INTERNAL_ONLY,
13862 Initialize => ~ $gcb->table('Control')
13864 - $gcb->table('CR')
13865 - $gcb->table('LF')
13867 $regular_begin->add_comment(join_lines( <<END
13868 For use in \\X; matches first character of anything that can begin an extended
13869 grapheme cluster, except those that require special handling.
13873 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
13874 Fate => $INTERNAL_ONLY,
13875 Initialize => $gcb->table('Extend')
13877 if (defined (my $sm = $gcb->table('SpacingMark'))) {
13880 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
13882 # End of GCB \X processing
13884 my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
13886 if (@named_sequences) {
13887 push @composition, 'Named_Sequence';
13888 foreach my $sequence (@named_sequences) {
13889 $perl_charname->add_anomalous_entry($sequence);
13893 my $alias_sentence = "";
13895 my $alias = property_ref('Name_Alias');
13896 $perl_charname->set_proxy_for('Name_Alias');
13898 # Add each entry in Name_Alias to Perl_Charnames. Where these go with
13899 # respect to any existing entry depends on the entry type. Corrections go
13900 # before said entry, as they should be returned in preference over the
13901 # existing entry. (A correction to a correction should be later in the
13902 # Name_Alias table, so it will correctly precede the erroneous correction
13903 # in Perl_Charnames.)
13905 # Abbreviations go after everything else, so they are saved temporarily in
13906 # a hash for later.
13908 # Everything else is added afterwards, which preserves the input
13911 foreach my $range ($alias->ranges) {
13912 next if $range->value eq "";
13913 my $code_point = $range->start;
13914 if ($code_point != $range->end) {
13915 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
13917 my ($value, $type) = split ': ', $range->value;
13919 if ($type eq 'correction') {
13920 $replace_type = $MULTIPLE_BEFORE;
13922 elsif ($type eq 'abbreviation') {
13925 $abbreviations{$value} = $code_point;
13929 $replace_type = $MULTIPLE_AFTER;
13932 # Actually add; before or after current entry(ies) as determined
13935 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
13937 $alias_sentence = <<END;
13938 The Name_Alias property adds duplicate code point entries that are
13939 alternatives to the original name. If an addition is a corrected
13940 name, it will be physically first in the table. The original (less correct,
13941 but still valid) name will be next; then any alternatives, in no particular
13942 order; and finally any abbreviations, again in no particular order.
13945 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
13946 # precedence before 6.1, so should be first in the file; the other names
13947 # have precedence starting in 6.1,
13948 my $before_or_after = ($v_version lt v6.1.0)
13952 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
13953 my $code_point = $range->start;
13954 my $unicode_1_value = $range->value;
13955 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
13957 if ($code_point != $range->end) {
13958 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
13961 # To handle EBCDIC, we don't hard code in the code points of the
13962 # controls; instead realizing that all of them are below 256.
13963 last if $code_point > 255;
13965 # We only add in the controls.
13966 next if $gc->value_of($code_point) ne 'Cc';
13968 # We reject this Unicode1 name for later Perls, as it is used for
13969 # another code point
13970 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
13972 # This won't add an exact duplicate.
13973 $perl_charname->add_duplicate($code_point, $unicode_1_value,
13974 Replace => $before_or_after);
13977 # But in this version only, the ALERT has precedence over BELL, the
13978 # Unicode_1_Name that would otherwise have precedence.
13979 if ($v_version eq v6.0.0) {
13980 $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
13983 # Now that everything has been added, add in abbreviations after
13984 # everything else. Sort so results don't change between runs of this
13986 foreach my $value (sort keys %abbreviations) {
13987 $perl_charname->add_duplicate($abbreviations{$value}, $value,
13988 Replace => $MULTIPLE_AFTER);
13992 if (@composition <= 2) { # Always at least 2
13993 $comment = join " and ", @composition;
13996 $comment = join ", ", @composition[0 .. scalar @composition - 2];
13997 $comment .= ", and $composition[-1]";
14000 $perl_charname->add_comment(join_lines( <<END
14001 This file is for charnames.pm. It is the union of the $comment properties.
14002 Unicode_1_Name entries are used only for nameless code points in the Name
14005 This file doesn't include the algorithmically determinable names. For those,
14006 use 'unicore/Name.pm'
14009 property_ref('Name')->add_comment(join_lines( <<END
14010 This file doesn't include the algorithmically determinable names. For those,
14011 use 'unicore/Name.pm'
14015 # Construct the Present_In property from the Age property.
14016 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
14017 my $default_map = $age->default_map;
14018 my $in = Property->new('In',
14019 Default_Map => $default_map,
14020 Full_Name => "Present_In",
14021 Perl_Extension => 1,
14023 Initialize => $age,
14025 $in->add_comment(join_lines(<<END
14026 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
14027 same as for $age, and not for what $in really means. This is because anything
14028 defined in a given release should have multiple values: that release and all
14029 higher ones. But only one value per code point can be represented in a table
14034 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
14035 # lowest numbered (earliest) come first, with the non-numeric one
14037 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
14039 : ($b->name !~ /^[\d.]*$/)
14041 : $a->name <=> $b->name
14044 # The Present_In property is the cumulative age properties. The first
14045 # one hence is identical to the first age one.
14046 my $previous_in = $in->add_match_table($first_age->name);
14047 $previous_in->set_equivalent_to($first_age, Related => 1);
14049 my $description_start = "Code point's usage introduced in version ";
14050 $first_age->add_description($description_start . $first_age->name);
14052 # To construct the accumulated values, for each of the age tables
14053 # starting with the 2nd earliest, merge the earliest with it, to get
14054 # all those code points existing in the 2nd earliest. Repeat merging
14055 # the new 2nd earliest with the 3rd earliest to get all those existing
14056 # in the 3rd earliest, and so on.
14057 foreach my $current_age (@rest_ages) {
14058 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
14060 my $current_in = $in->add_match_table(
14061 $current_age->name,
14062 Initialize => $current_age + $previous_in,
14063 Description => $description_start
14064 . $current_age->name
14067 $previous_in = $current_in;
14069 # Add clarifying material for the corresponding age file. This is
14070 # in part because of the confusing and contradictory information
14071 # given in the Standard's documentation itself, as of 5.2.
14072 $current_age->add_description(
14073 "Code point's usage was introduced in version "
14074 . $current_age->name);
14075 $current_age->add_note("See also $in");
14079 # And finally the code points whose usages have yet to be decided are
14080 # the same in both properties. Note that permanently unassigned code
14081 # points actually have their usage assigned (as being permanently
14082 # unassigned), so that these tables are not the same as gc=cn.
14083 my $unassigned = $in->add_match_table($default_map);
14084 my $age_default = $age->table($default_map);
14085 $age_default->add_description(<<END
14086 Code point's usage has not been assigned in any Unicode release thus far.
14089 $unassigned->set_equivalent_to($age_default, Related => 1);
14092 # See L<perlfunc/quotemeta>
14093 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
14094 Perl_Extension => 1,
14095 Fate => $INTERNAL_ONLY,
14097 # Initialize to what's common in
14098 # all Unicode releases.
14101 + $gc->table('Control')
14104 # In early releases without the proper Unicode properties, just set to \W.
14105 if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
14106 || ! defined (my $patws = property_ref('Pattern_White_Space'))
14107 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
14109 $quotemeta += ~ $Word;
14112 $quotemeta += $patsyn->table('Y')
14113 + $patws->table('Y')
14115 + ((~ $Word) & $ASCII);
14118 # Finished creating all the perl properties. All non-internal non-string
14119 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
14120 # an underscore.) These do not get a separate entry in the pod file
14121 foreach my $table ($perl->tables) {
14122 foreach my $alias ($table->aliases) {
14123 next if $alias->name =~ /^_/;
14124 $table->add_alias('Is_' . $alias->name,
14127 Status => $alias->status,
14128 OK_as_Filename => 0);
14132 # Here done with all the basic stuff. Ready to populate the information
14133 # about each character if annotating them.
14136 # See comments at its declaration
14137 $annotate_ranges = Range_Map->new;
14139 # This separates out the non-characters from the other unassigneds, so
14140 # can give different annotations for each.
14141 $unassigned_sans_noncharacters = Range_List->new(
14142 Initialize => $gc->table('Unassigned'));
14143 if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
14144 $unassigned_sans_noncharacters &= $nonchars->table('N');
14147 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
14148 $i = populate_char_info($i); # Note sets $i so may cause skips
14155 sub add_perl_synonyms() {
14156 # A number of Unicode tables have Perl synonyms that are expressed in
14157 # the single-form, \p{name}. These are:
14158 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
14159 # \p{Is_Name} as synonyms
14160 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
14161 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
14162 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
14163 # conflict, \p{Value} and \p{Is_Value} as well
14165 # This routine generates these synonyms, warning of any unexpected
# The routine takes no arguments. It works on the file-level objects it
# references ($perl, $gc, $script, $block) and records collisions through
# add_conflicting() and the $has_In_conflicts / $has_Is_conflicts counters.
14168 # Construct the list of tables to get synonyms for. Start with all the
14169 # binary and the General_Category ones.
14170 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
14172 push @tables, $gc->tables;
14174 # If the version of Unicode includes the Script property, add its tables
14175 push @tables, $script->tables if defined $script;
14177 # The Block tables are kept separate because they are treated differently.
14178 # And the earliest versions of Unicode didn't include them, so add only if
14181 push @blocks, $block->tables if defined $block;
14183 # Here, have the lists of tables constructed. Process blocks last so that
14184 # if there are name collisions with them, blocks have lowest priority.
14185 # Should there ever be other collisions, manual intervention would be
14186 # required. See the comments at the beginning of the program for a
14187 # possible way to handle those semi-automatically.
14188 foreach my $table (@tables, @blocks) {
14190 # For non-binary properties, the synonym is just the name of the
14191 # table, like Greek, but for binary properties the synonym is the name
14192 # of the property, and means the code points in its 'Y' table.
14193 my $nominal = $table;
14194 my $nominal_property = $nominal->property;
14196 if (! $nominal->isa('Property')) {
14201 # Here is a binary property. Use the 'Y' table. Verify that is
14203 my $yes = $nominal->table('Y');
14204 unless (defined $yes) { # Must be defined, but is permissible to
14206 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
# NOTE(review): $actual, used throughout the rest of this loop, is assigned
# on lines not visible here; presumably it is the table whose code points the
# synonym matches (the 'Y' table for a binary property, else $nominal itself)
# -- confirm against the full source.
14212 foreach my $alias ($nominal->aliases) {
14214 # Attempt to create a table in the perl directory for the
14215 # candidate table, using whatever aliases in it that don't
14216 # conflict. Also add non-conflicting aliases for all these
14217 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
14219 foreach my $prefix ("", 'Is_', 'In_') {
14221 # Only Block properties can have added 'In_' aliases.
14222 next if $prefix eq 'In_' and $nominal_property != $block;
14224 my $proposed_name = $prefix . $alias->name;
14226 # No Is_Is, In_In, nor combinations thereof
14227 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
14228 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
14230 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
14232 # Get a reference to any existing table in the perl
14233 # directory with the desired name.
14234 my $pre_existing = $perl->table($proposed_name);
14236 if (! defined $pre_existing) {
14238 # No name collision, so ok to add the perl synonym.
14240 my $make_re_pod_entry;
14241 my $ok_as_filename;
14242 my $status = $alias->status;
# In each branch below, '$status || X' keeps whatever true status the alias
# already carries and falls back to X only when that status is false.
14243 if ($nominal_property == $block) {
14245 # For block properties, the 'In' form is preferred for
14246 # external use; the pod file contains wild cards for
14247 # this and the 'Is' form so no entries for those; and
14248 # we don't want people using the name without the
14249 # 'In', so discourage that.
14250 if ($prefix eq "") {
14251 $make_re_pod_entry = 1;
14252 $status = $status || $DISCOURAGED;
14253 $ok_as_filename = 0;
14255 elsif ($prefix eq 'In_') {
14256 $make_re_pod_entry = 0;
14257 $status = $status || $NORMAL;
14258 $ok_as_filename = 1;
14261 $make_re_pod_entry = 0;
14262 $status = $status || $DISCOURAGED;
14263 $ok_as_filename = 0;
14266 elsif ($prefix ne "") {
14268 # The 'Is' prefix is handled in the pod by a wild
14269 # card, and we won't use it for an external name
14270 $make_re_pod_entry = 0;
14271 $status = $status || $NORMAL;
14272 $ok_as_filename = 0;
14276 # Here, is an empty prefix, non block. This gets its
14277 # own pod entry and can be used for an external name.
14278 $make_re_pod_entry = 1;
14279 $status = $status || $NORMAL;
14280 $ok_as_filename = 1;
14283 # Here, there isn't a perl pre-existing table with the
14284 # name. Look through the list of equivalents of this
14285 # table to see if one is a perl table.
14286 foreach my $equivalent ($actual->leader->equivalents) {
14287 next if $equivalent->property != $perl;
14289 # Here, have found a table for $perl. Add this alias
14290 # to it, and are done with this prefix.
14291 $equivalent->add_alias($proposed_name,
14292 Re_Pod_Entry => $make_re_pod_entry,
14294 # Currently don't output these in the
14295 # ucd pod, as are strongly discouraged
14300 OK_as_Filename => $ok_as_filename);
14301 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14305 # Here, $perl doesn't already have a table that is a
14306 # synonym for this property, add one.
14307 my $added_table = $perl->add_match_table($proposed_name,
14308 Re_Pod_Entry => $make_re_pod_entry,
14310 # See UCD comment just above
14314 OK_as_Filename => $ok_as_filename);
14315 # And it will be related to the actual table, since it is
14317 $added_table->set_equivalent_to($actual, Related => 1);
14318 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14320 } # End of no pre-existing.
14322 # Here, there is a pre-existing table that has the proposed
14323 # name. We could be in trouble, but not if this is just a
14324 # synonym for another table that we have already made a child
14325 # of the pre-existing one.
14326 if ($pre_existing->is_set_equivalent_to($actual)) {
14327 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14328 $pre_existing->add_alias($proposed_name);
14332 # Here, there is a name collision, but it still could be ok if
14333 # the tables match the identical set of code points, in which
14334 # case, we can combine the names. Compare each table's code
14335 # point list to see if they are identical.
14336 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14337 if ($pre_existing->matches_identically_to($actual)) {
14339 # Here, they do match identically. Not a real conflict.
14340 # Make the perl version a child of the Unicode one, except
14341 # in the non-obvious case of where the perl name is
14342 # already a synonym of another Unicode property. (This is
14343 # excluded by the test for it being its own parent.) The
14344 # reason for this exclusion is that then the two Unicode
14345 # properties become related; and we don't really know if
14346 # they are or not. We generate documentation based on
14347 # relatedness, and this would be misleading. Code
14348 # later executed in the process will cause the tables to
14349 # be represented by a single file anyway, without making
14350 # it look in the pod like they are necessarily related.
14351 if ($pre_existing->parent == $pre_existing
14352 && ($pre_existing->property == $perl
14353 || $actual->property == $perl))
14355 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14356 $pre_existing->set_equivalent_to($actual, Related => 1);
14358 elsif (main::DEBUG && $to_trace) {
14359 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14360 trace $pre_existing->parent;
14365 # Here they didn't match identically, there is a real conflict
14366 # between our new name and a pre-existing property.
14367 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14368 $pre_existing->add_conflicting($nominal->full_name,
14372 # Don't output a warning for aliases for the block
14373 # properties (unless they start with 'In_') as it is
14374 # expected that there will be conflicts and the block
14376 if ($verbosity >= $NORMAL_VERBOSITY
14377 && ($actual->property != $block || $prefix eq 'In_'))
14379 print simple_fold(join_lines(<<END
14380 There is already an alias named $proposed_name (from $pre_existing),
14381 so not creating this alias for $actual
14386 # Keep track for documentation purposes.
14387 $has_In_conflicts++ if $prefix eq 'In_';
14388 $has_Is_conflicts++ if $prefix eq 'Is_';
14393 # There are some properties which have No and Yes (and N and Y) as
14394 # property values, but aren't binary, and could possibly be confused with
14395 # binary ones. So create caveats for them. There are tables that are
14396 # named 'No', and tables that are named 'N', but confusion is not likely
14397 # unless they are the same table. For example, N meaning Number or
14398 # Neutral is not likely to cause confusion, so don't add caveats to things
14400 foreach my $property (grep { $_->type != $BINARY
14401 && $_->type != $FORCED_BINARY }
14404 my $yes = $property->table('Yes');
14405 if (defined $yes) {
14406 my $y = $property->table('Y');
14407 if (defined $y && $yes == $y) {
14408 foreach my $alias ($property->aliases) {
# NOTE(review): this call passes only the alias name, whereas the parallel
# 'No' call further down also passes 'P' -- confirm the difference is
# intended.
14409 $yes->add_conflicting($alias->name);
14413 my $no = $property->table('No');
14415 my $n = $property->table('N');
14416 if (defined $n && $no == $n) {
14417 foreach my $alias ($property->aliases) {
14418 $no->add_conflicting($alias->name, 'P');
14427 sub register_file_for_name($$$) {
14428 # Given info about a table and a datafile that it should be associated
14429 # with, register that association
# The association is recorded in several file-level hashes written below
# (%loose_property_to_file_of, %file_to_swash_name, %loose_to_file_of,
# %stricter_to_file_of, %loose_to_standard_value, %utf8::why_deprecated,
# %caseless_equivalent_to and %loose_defaults) and on the tables themselves
# via set_file_path().
14432 my $directory_ref = shift; # Array of the directory path for the file
14433 my $file = shift; # The file name in the final directory.
14434 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14436 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14438 if ($table->isa('Property')) {
14439 $table->set_file_path(@$directory_ref, $file);
14440 push @map_properties, $table;
14442 # No swash means don't do the rest of this.
14443 return if $table->fate != $ORDINARY;
14445 # Get the path to the file
14446 my @path = $table->file_path;
14448 # Use just the file name if no subdirectory.
14449 shift @path if $path[0] eq File::Spec->curdir();
14451 my $file = join '/', @path;
14453 # Create a hash entry for utf8_heavy to get the file that stores this
14454 # property's map table
14455 foreach my $alias ($table->aliases) {
14456 my $name = $alias->name;
14457 $loose_property_to_file_of{standardize($name)} = $file;
14460 # And a way for utf8_heavy to find the proper key in the SwashInfo
14461 # hash for this property.
14462 $file_to_swash_name{$file} = "To" . $table->swash_name;
14466 # Do all of the work for all equivalent tables when called with the leader
14467 # table, so skip if isn't the leader.
14468 return if $table->leader != $table;
14470 # If this is a complement of another file, use that other file instead,
14471 # with a ! prepended to it.
14473 if (($complement = $table->complement) != 0) {
14474 my @directories = $complement->file_path;
14476 # This assumes that element 0 is something like 'lib', element 1 is
14477 # the property name (in its own directory), like
14478 # 'AHex', and element 2 is the file, like 'Y', which will have a .pl
14479 # appended to it later.
# The '!' is prepended to element 1, the property-directory component.
14480 $directories[1] =~ s/^/!/;
14481 $file = pop @directories;
14482 $directory_ref =\@directories;
14485 # Join all the file path components together, using slashes.
14486 my $full_filename = join('/', @$directory_ref, $file);
14488 # All go in the same subdirectory of unicore
14489 if ($directory_ref->[0] ne $matches_directory) {
14490 Carp::my_carp("Unexpected directory in "
14491 . join('/', @{$directory_ref}, $file));
14494 # For this table and all its equivalents ...
14495 foreach my $table ($table, $table->equivalents) {
14497 # Associate it with its file internally. Don't include the
14498 # $matches_directory first component
14499 $table->set_file_path(@$directory_ref, $file);
14501 # No swash means don't do the rest of this.
14502 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
# NOTE(review): '->[1, -1]' is a single-element access whose subscript is a
# comma expression, so only element -1 (the last directory component) is
# used. If a slice from element 1 through the last was intended, this would
# need slice syntax -- confirm.
14504 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14506 my $property = $table->property;
14507 my $property_name = ($property == $perl)
14508 ? "" # 'perl' is never explicitly stated
14509 : standardize($property->name) . '=';
14511 my $is_default = 0; # Is this table the default one for the property?
14513 # To calculate $is_default, we find if this table is the same as the
14514 # default one for the property. But this is complicated by the
14515 # possibility that there is a master table for this one, and the
14516 # information is stored there instead of here.
14517 my $parent = $table->parent;
14518 my $leader_prop = $parent->property;
14519 my $default_map = $leader_prop->default_map;
14520 if (defined $default_map) {
14521 my $default_table = $leader_prop->table($default_map);
14522 $is_default = 1 if defined $default_table && $parent == $default_table;
14525 # Calculate the loose name for this table. Mostly it's just its name,
14526 # standardized. But in the case of Perl tables that are single-form
14527 # equivalents to Unicode properties, it is the latter's name.
14528 my $loose_table_name =
14529 ($property != $perl || $leader_prop == $perl)
14530 ? standardize($table->name)
14531 : standardize($parent->name);
14533 my $deprecated = ($table->status eq $DEPRECATED)
14534 ? $table->status_info
14536 my $caseless_equivalent = $table->caseless_equivalent;
14538 # And for each of the table's aliases... This inner loop eventually
14539 # goes through all aliases in the UCD that we generate regex match
14541 foreach my $alias ($table->aliases) {
14542 my $standard = utf8_heavy_name($table, $alias);
14544 # Generate an entry in either the loose or strict hashes, which
14545 # will translate the property and alias names combination into the
14546 # file where the table for them is stored.
14547 if ($alias->loose_match) {
14548 if (exists $loose_to_file_of{$standard}) {
14549 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14552 $loose_to_file_of{$standard} = $sub_filename;
14556 if (exists $stricter_to_file_of{$standard}) {
14557 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14560 $stricter_to_file_of{$standard} = $sub_filename;
14562 # Tightly coupled with how utf8_heavy.pl works, for a
14563 # floating point number that is a whole number, get rid of
14564 # the trailing decimal point and 0's, so that utf8_heavy
14565 # will work. Also note that this assumes that such a
14566 # number is matched strictly; so if that were to change,
14567 # this would be wrong.
14568 if ((my $integer_name = $alias->name)
14569 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14571 $stricter_to_file_of{$property_name . $integer_name}
14577 # For Unicode::UCD, create a mapping of the prop=value to the
14578 # canonical =value for that property.
14579 if ($standard =~ /=/) {
14581 # This could happen if a strict name mapped into an existing
14582 # loose name. In that event, the strict names would have to
14583 # be moved to a new hash.
14584 if (exists($loose_to_standard_value{$standard})) {
14585 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
14587 $loose_to_standard_value{$standard} = $loose_table_name;
14590 # Keep a list of the deprecated properties and their filenames
14591 if ($deprecated && $complement == 0) {
14592 $utf8::why_deprecated{$sub_filename} = $deprecated;
14595 # And a substitute table, if any, for case-insensitive matching
14596 if ($caseless_equivalent != 0) {
14597 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14600 # Add to defaults list if the table this alias belongs to is the
14602 $loose_defaults{$standard} = 1 if $is_default;
14610 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
14612 my %full_dir_name_of; # Full length names of directories used.
14614 sub construct_filename($$$) {
14615 # Return a file name for a table, based on the table name, but perhaps
14616 # changed to get rid of non-portable characters in it, and to make
14617 # sure that it is unique on a file system that allows the names before
14618 # any period to be at most 8 characters (DOS). While we're at it,
14619 # check and complain if there are any directory conflicts.
14621 my $name = shift; # The name to start with
14622 my $mutable = shift; # Boolean: can it be changed? If no, but
14623 # yet it must be to work properly, a warning
14625 my $directories_ref = shift; # A reference to an array containing the
14626 # path to the file, with each element one path
14627 # component. This is used because the same
14628 # name can be used in different directories.
14629 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14631 my $warn = ! defined wantarray; # If true, then if the name is
14632 # changed, a warning is issued as well.
# (wantarray is undefined only when the caller is in void context, so $warn
# is set exactly when the return value is being discarded.)
14634 if (! defined $name) {
14635 Carp::my_carp("Undefined name in directory "
14636 . File::Spec->join(@$directories_ref)
14641 # Make sure that no directory names conflict with each other. Look at
14642 # each directory in the input file's path. If it is already in use,
14643 # assume it is correct, and is merely being re-used, but if we
14644 # truncate it to 8 characters, and find that there are two directories
14645 # that are the same for the first 8 characters, but differ after that,
14646 # then that is a problem.
14647 foreach my $directory (@$directories_ref) {
14648 my $short_dir = substr($directory, 0, 8);
14649 if (defined $full_dir_name_of{$short_dir}) {
14650 next if $full_dir_name_of{$short_dir} eq $directory;
14651 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
14654 $full_dir_name_of{$short_dir} = $directory;
# $path is the directory components joined by slashes, with a trailing slash
# when non-empty. It is the first-level key into %base_names below and the
# prefix shown in the warning messages.
14658 my $path = join '/', @$directories_ref;
14659 $path .= '/' if $path;
14661 # Remove interior underscores.
14662 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14664 # Change any non-word character into an underscore, and truncate to 8.
14665 $filename =~ s/\W+/_/g; # e.g., "L&" -> "L_"
14666 substr($filename, 8) = "" if length($filename) > 8;
14668 # Make sure the basename doesn't conflict with something we
14669 # might have already written. If we have, say,
# The lookup key is the lower-cased candidate, so names differing only in
# case count as colliding. The post-increment yields 0 (false) the first
# time a candidate is seen in this $path, which ends the loop.
14676 while (my $num = $base_names{$path}{lc $filename}++) {
14677 $num++; # so basenames with numbers start with '2', which
14678 # just looks more natural.
14680 # Want to append $num, but if it'll make the basename longer
14681 # than 8 characters, pre-truncate $filename so that the result
14683 my $delta = length($filename) + length($num) - 8;
14685 substr($filename, -$delta) = $num;
14690 if ($warn && ! $warned) {
14692 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
14696 return $filename if $mutable;
14698 # If not changeable, must return the input name, but warn if needed to
14699 # change it beyond shortening it.
14700 if ($name ne $filename
14701 && substr($name, 0, length($filename)) ne $filename) {
14702 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
14708 # The pod file contains a very large table. Many of the lines in that table
14709 # would exceed a typical output window's size, and so need to be wrapped with
14710 # a hanging indent to make them look good. The pod language is really
14711 # insufficient here. There is no general construct to do that in pod, so it
14712 # is done here by beginning each such line with a space to cause the result to
14713 # be output without formatting, and doing all the formatting here. This leads
14714 # to the result that if the eventual display window is too narrow it won't
14715 # look good, and if the window is too wide, no advantage is taken of that
14716 # extra width. A further complication is that the output may be indented by
14717 # the formatter so that there is less space than expected. What I (khw) have
14718 # done is to assume that that indent is a particular number of spaces based on
14719 # what it is in my Linux system; people can always resize their windows if
14720 # necessary. This is obviously less than desirable, but it is the best that
# can be done.
14722 my $automatic_pod_indent = 8;
14724 # Try to format so that the fewest lines are used, but few long left column
14725 # entries slide into the right column. An experiment on 5.1 data yielded the
14726 # following percentages that didn't cut into the other side along with the
14727 # associated first-column widths
14729 # 80% not too bad except for a few blocks
14730 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
14732 my $indent_info_column = 27; # 75% of lines didn't have overlap
14734 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
14735 # The 3 is because of:
14736 # 1 for the leading space to tell the pod formatter to
# output the line as-is
14739 # 1 for the space between the flag and the main data
14741 sub format_pod_line ($$$;$$) {
14742 # Take a pod line and return it, formatted properly: a single string
# ending in a newline, built by the sprintf below.
14744 my $first_column_width = shift; # Width allotted to the left column,
# including the $FILLER boiler-plate
14745 my $entry = shift; # Contents of left column
14746 my $info = shift; # Contents of right column
14748 my $status = shift || ""; # Any flag; optional, defaults to none
14750 my $loose_match = shift; # Boolean; optional.
14751 $loose_match = 1 unless defined $loose_match; # Defaults to true
14753 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# The flags are $STRICTER (only when loose matching is off), followed by any
# status flag passed in.
14756 $flags .= $STRICTER if ! $loose_match;
14758 $flags .= $status if $status;
14760 # There is a blank in the left column to cause the pod formatter to
14761 # output the line as-is.
14762 return sprintf " %-*s%-*s %s\n",
14763 # The first * in the format is replaced by this, the -1 is
14764 # to account for the leading blank. There isn't a
14765 # hard-coded blank after this to separate the flags from
14766 # the rest of the line, so that in the unlikely event that
14767 # multiple flags are shown on the same line, they both
14768 # will get displayed at the expense of that separation,
14769 # but since they are left justified, a blank will be
14770 # inserted in the normal case.
14774 # The other * in the format is replaced by this number to
14775 # cause the first main column to right fill with blanks.
14776 # The -1 is for the guaranteed blank following it.
14777 $first_column_width - $FILLER - 1,
14782 my @zero_match_tables; # List of tables that have no matches in this release
14784 sub make_re_pod_entries($) {
14785 # This generates the entries for the pod file for a given table.
14786 # Also done at this time are any children tables. The output looks like:
14787 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
# Each formatted line is pushed onto @match_properties. Ordinary-fate tables
# whose count is zero are also pushed onto @zero_match_tables.
14789 my $input_table = shift; # Table the entry is for
14790 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14792 # Generate parent and all its children at the same time, so there is
# nothing to do when called on a table that is not its own parent.
14793 return if $input_table->parent != $input_table;
14795 my $property = $input_table->property;
14796 my $type = $property->type;
14797 my $full_name = $property->full_name;
14799 my $count = $input_table->count;
14800 my $string_count = clarify_number($count);
14801 my $status = $input_table->status;
14802 my $status_info = $input_table->status_info;
14803 my $caseless_equivalent = $input_table->caseless_equivalent;
14805 # Don't mention a placeholder equivalent (one whose fate is beyond
# $ORDINARY), as it isn't to be listed
14807 $caseless_equivalent = 0 if $caseless_equivalent != 0
14808 && $caseless_equivalent->fate > $ORDINARY;
14810 my $entry_for_first_table; # The entry for the first table output.
14811 # Almost certainly, it is the parent.
14813 # For each related table (including itself), we will generate a pod entry
14814 # for each name each table goes by
14815 foreach my $table ($input_table, $input_table->children) {
14817 # utf8_heavy.pl cannot deal with null string property values, so skip
14818 # any tables that have no non-null names.
14819 next if ! grep { $_->name ne "" } $table->aliases;
14821 # First, gather all the info that applies to this table as a whole.
# ($count was taken from $input_table above, not from $table.)
14823 push @zero_match_tables, $table if $count == 0
14824 # Don't mention special tables
14825 # as being zero length
14826 && $table->fate == $ORDINARY;
14828 my $table_property = $table->property;
14830 # The short name has all the underscores removed, while the full name
14831 # retains them. Later, when we decide whether to output a short synonym
14832 # for the full one, we need to compare apples to apples, so we use the
14833 # short name's length including underscores.
14834 my $table_property_short_name_length;
14835 my $table_property_short_name
14836 = $table_property->short_name(\$table_property_short_name_length);
14837 my $table_property_full_name = $table_property->full_name;
14839 # Get how much savings there is in the short name over the full one
14840 # (delta will always be <= 0)
14841 my $table_property_short_delta = $table_property_short_name_length
14842 - length($table_property_full_name);
14843 my @table_description = $table->description;
14844 my @table_note = $table->note;
14846 # Generate an entry for each alias in this table.
14847 my $entry_for_first_alias; # saves the first one encountered.
14848 foreach my $alias ($table->aliases) {
14850 # Skip if not to go in pod.
14851 next unless $alias->make_re_pod_entry;
14853 # Start gathering all the components for the entry
14854 my $name = $alias->name;
14856 # Skip if name is empty, as can't be accessed by regexes.
14857 next if $name eq "";
14859 my $entry; # Holds the left column, may include extras
14860 my $entry_ref; # To refer to the left column's contents from
14861 # another entry; has no extras
14863 # First the left column of the pod entry. Tables for the $perl
14864 # property always use the single form.
14865 if ($table_property == $perl) {
14866 $entry = "\\p{$name}";
14867 $entry_ref = "\\p{$name}";
14869 else { # Compound form.
14871 # Only generate one entry for all the aliases that mean true
14872 # or false in binary properties. Append a '*' to indicate
14873 # some are missing. (The heading comment notes this.)
14875 if ($type == $BINARY) {
14876 next if $name ne 'N' && $name ne 'Y';
14879 elsif ($type != $FORCED_BINARY) {
14884 # Forced binary properties require special handling. It
14885 # has two sets of tables, one set is true/false; and the
14886 # other set is everything else. Entries are generated for
14887 # each set. Use the Bidi_Mirrored property (which appears
14888 # in all Unicode versions) to get a list of the aliases
14889 # for the true/false tables. Of these, only output the N
14890 # and Y ones, the same as a regular binary property. And
14891 # output all the rest, same as a non-binary property.
14892 my $bm = property_ref("Bidi_Mirrored");
14893 if ($name eq 'N' || $name eq 'Y') {
14895 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
14896 $bm->table("N")->aliases)
14905 # Colon-space is used to give a little more space to be easier
14908 . $table_property_full_name
14911 # But for the reference to this entry, which will go in the
14912 # right column, where space is at a premium, use equals
14914 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
14917 # Then the right (info) column. This is stored as components of
14918 # an array for the moment, then joined into a string later. For
14919 # non-internal only properties, begin the info with the entry for
14920 # the first table we encountered (if any), as things are ordered
14921 # so that that one is the most descriptive. This leads to the
14922 # info column of an entry being a more descriptive version of the
14925 if ($name =~ /^_/) {
14927 '(For internal use by Perl, not necessarily stable)';
14929 elsif ($entry_for_first_alias) {
14930 push @info, $entry_for_first_alias;
14933 # If this entry is equivalent to another, add that to the info,
14934 # using the first such table we encountered
14935 if ($entry_for_first_table) {
14937 push @info, "(= $entry_for_first_table)";
14940 push @info, $entry_for_first_table;
14944 # If the name is a large integer, add an equivalent with an
14945 # exponent for better readability
14946 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
14947 push @info, sprintf "(= %.1e)", $name
14950 my $parenthesized = "";
14951 if (! $entry_for_first_alias) {
14953 # This is the first alias for the current table. The alias
14954 # array is ordered so that this is the fullest, most
14955 # descriptive alias, so it gets the fullest info. The other
14956 # aliases are mostly merely pointers to this one, using the
14957 # information already added above.
14959 # Display any status message, but only on the parent table
14960 if ($status && ! $entry_for_first_table) {
14961 push @info, $status_info;
14964 # Put out any descriptive info
14965 if (@table_description || @table_note) {
14966 push @info, join "; ", @table_description, @table_note;
14969 # Look to see if there is a shorter name we can point people
14971 my $standard_name = standardize($name);
14973 my $proposed_short = $table->short_name;
14974 if (defined $proposed_short) {
14975 my $standard_short = standardize($proposed_short);
14977 # If the short name is shorter than the standard one, or
14978 # even if it's not, but the combination of it and its
14979 # short property name (as in \p{prop=short} ($perl doesn't
14980 # have this form)) saves at least two characters, then,
14981 # cause it to be listed as a shorter synonym.
14982 if (length $standard_short < length $standard_name
14983 || ($table_property != $perl
14984 && (length($standard_short)
14985 - length($standard_name)
14986 + $table_property_short_delta) # (<= 0)
14989 $short_name = $proposed_short;
14990 if ($table_property != $perl) {
14991 $short_name = $table_property_short_name
14994 $short_name = "\\p{$short_name}";
14998 # And if this is a compound form name, see if there is a
14999 # single form equivalent
15001 if ($table_property != $perl) {
15003 # Special case the binary N tables, so that will print
15004 # \P{single}, but use the Y table values to populate
15005 # 'single', as we haven't likewise populated the N table.
15006 # For forced binary tables, we can't just look at the N
15007 # table, but must see if this table is equivalent to the N
15008 # one, as there are two equivalent beasts in these
15012 if ( ($type == $BINARY
15013 && $input_table == $property->table('No'))
15014 || ($type == $FORCED_BINARY
15015 && $property->table('No')->
15016 is_set_equivalent_to($input_table)))
15018 $test_table = $property->table('Yes');
15022 $test_table = $input_table;
15026 # Look for a single form amongst all the children.
15027 foreach my $table ($test_table->children) {
15028 next if $table->property != $perl;
15029 my $proposed_name = $table->short_name;
15030 next if ! defined $proposed_name;
15032 # Don't mention internal-only properties as a possible
15033 # single form synonym
15034 next if substr($proposed_name, 0, 1) eq '_';
15036 $proposed_name = "\\$p\{$proposed_name}";
15037 if (! defined $single_form
15038 || length($proposed_name) < length $single_form)
15040 $single_form = $proposed_name;
15042 # The goal here is to find a single form; not the
15043 # shortest possible one. We've already found a
15044 # short name. So, stop at the first single form
15045 # found, which is likely to be closer to the
15052 # Output both short and single in the same parenthesized
15053 # expression, but with only one of 'Single', 'Short' if there
15055 if ($short_name || $single_form || $table->conflicting) {
15056 $parenthesized .= "Short: $short_name" if $short_name;
15057 if ($short_name && $single_form) {
15058 $parenthesized .= ', ';
15060 elsif ($single_form) {
15061 $parenthesized .= 'Single: ';
15063 $parenthesized .= $single_form if $single_form;
15067 if ($caseless_equivalent != 0) {
15068 $parenthesized .= '; ' if $parenthesized ne "";
15069 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
15073 # Warn if this property isn't the same as one that a
15074 # semi-casual user might expect. The other components of this
15075 # parenthesized structure are calculated only for the first entry
15076 # for this table, but the conflicting is deemed important enough
15077 # to go on every entry.
15078 my $conflicting = join " NOR ", $table->conflicting;
15079 if ($conflicting) {
15080 $parenthesized .= '; ' if $parenthesized ne "";
15081 $parenthesized .= "NOT $conflicting";
15084 push @info, "($parenthesized)" if $parenthesized;
15086 if ($name =~ /_$/ && $alias->loose_match) {
15087 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
15090 if ($table_property != $perl && $table->perl_extension) {
15091 push @info, '(Perl extension)';
15093 push @info, "($string_count)";
15095 # Now, we have both the entry and info so add them to the
15096 # list of all the properties.
15097 push @match_properties,
15098 format_pod_line($indent_info_column,
15102 $alias->loose_match);
# Remember the first alias's reference form; later aliases of this table
# start their info column with it.
15104 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
15105 } # End of looping through the aliases for this table.
# Likewise remember the first table's entry, so that entries for the
# remaining related tables can refer back to it.
15107 if (! $entry_for_first_table) {
15108 $entry_for_first_table = $entry_for_first_alias;
15110 } # End of looping through all the related tables
15114 sub make_ucd_table_pod_entries {
15117 # Generate the entries for the UCD section of the pod for $table. This
15118 # also calculates if names are ambiguous, so has to be called even if the
15119 # pod is not being output. Results are stored in %ucd_pod, keyed by the
# standardized name; ambiguous names are recorded in %ambiguous_names.
15121 my $short_name = $table->name;
15122 my $standard_short_name = standardize($short_name);
15123 my $full_name = $table->full_name;
15124 my $standard_full_name = standardize($full_name);
15126 my $full_info = ""; # Text of info column for full-name entries
15127 my $other_info = ""; # Text of info column for other entries
15128 my $short_info = ""; # Text of info column for short-name entries
15129 my $meaning = ""; # Synonym of this table
15131 my $property = ($table->isa('Property'))
15133 : $table->parent->property;
15135 my $perl_extension = $table->perl_extension;
15137 # Get the more official name for perl extensions that aren't
15138 # stand-alone properties
15139 if ($perl_extension && $property != $table) {
15140 if ($property == $perl ||$property->type == $BINARY) {
15141 $meaning = $table->complete_name;
15144 $meaning = $property->full_name . "=$full_name";
15148 # There are three types of info column. One for the short name, one for
15149 # the full name, and one for everything else. They mostly are the same,
15150 # so initialize in the same loop.
15151 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
15152 if ($perl_extension && $property != $table) {
15154 # Add the synonymous name for the non-full name entries; and to
15155 # the full-name entry if it adds extra information
15156 if ($info_ref == \$other_info
15157 || ($info_ref == \$short_info
15158 && $standard_short_name ne $standard_full_name)
15159 || standardize($meaning) ne $standard_full_name
15161 $$info_ref .= "$meaning.";
15164 elsif ($info_ref != \$full_info) {
15166 # Otherwise, the non-full name columns include the full name
15167 $$info_ref .= $full_name;
15170 # And the full-name entry includes the short name, if different
15171 if ($info_ref == \$full_info
15172 && $standard_short_name ne $standard_full_name)
15174 $full_info =~ s/\.\Z//;
15175 $full_info .= " " if $full_info;
15176 $full_info .= "(Short: $short_name)";
15179 if ($table->perl_extension) {
15180 $$info_ref =~ s/\.\Z//;
15181 $$info_ref .= ". " if $$info_ref;
15182 $$info_ref .= "(Perl extension)";
15186 # Add any extra annotations to the full name entry
15187 foreach my $more_info ($table->description,
15189 $table->status_info)
15191 next unless $more_info;
15192 $full_info =~ s/\.\Z//;
15193 $full_info .= ". " if $full_info;
15194 $full_info .= $more_info;
15197 # These keep track of whether the full and short name pod entries have
# already been created
15200 my $done_short = 0;
15202 # Every possible name is kept track of, even those that aren't going to be
15203 # output. This way we can be sure to find the ambiguities.
15204 foreach my $alias ($table->aliases) {
15205 my $name = $alias->name;
15206 my $standard = standardize($name);
15208 my $output_this = $alias->ucd;
15210 # If the full and short names are the same, we want to output the full
15211 # one's entry, so it has priority.
15212 if ($standard eq $standard_full_name) {
15213 next if $done_full;
15215 $info = $full_info;
15217 elsif ($standard eq $standard_short_name) {
15218 next if $done_short;
15220 next if $standard_short_name eq $standard_full_name;
15221 $info = $short_info;
15224 $info = $other_info;
15227 # Here, we have set up the two columns for this entry. But if an
15228 # entry already exists for this name, we have to decide which one
15229 # we're going to later output.
15230 if (exists $ucd_pod{$standard}) {
15232 # If the two entries refer to the same property, it's not going to
15233 # be ambiguous. (Likely it's because the names when standardized
15234 # are the same.) But that means if they are different properties,
15235 # there is ambiguity.
15236 if ($ucd_pod{$standard}->{'property'} != $property) {
15238 # Here, we have an ambiguity. This code assumes that one is
15239 # scheduled to be output and one not and that one is a perl
15240 # extension (which is not to be output) and the other isn't.
15241 # If those assumptions are wrong, things have to be rethought.
15242 if ($ucd_pod{$standard}{'output_this'} == $output_this
15243 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
15244 || $output_this == $perl_extension)
15246 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
15249 # We modify the info column of the one being output to
15250 # indicate the ambiguity. Set $which to point to that one's info.
15253 if ($ucd_pod{$standard}{'output_this'}) {
15254 $which = \$ucd_pod{$standard}->{'info'};
15258 $meaning = $ucd_pod{$standard}{'meaning'};
15262 $$which =~ s/\.\Z//;
15263 $$which .= "; NOT '$standard' meaning '$meaning'";
15265 $ambiguous_names{$standard} = 1;
15268 # Use the non-perl-extension variant: keep the existing entry unless
# it is the one that is a perl extension
15269 next unless $ucd_pod{$standard}{'perl_extension'};
15272 # Store enough information about this entry that we can later look for
15273 # ambiguities, and output it properly.
15274 $ucd_pod{$standard} = { 'name' => $name,
15276 'meaning' => $meaning,
15277 'output_this' => $output_this,
15278 'perl_extension' => $perl_extension,
15279 'property' => $property,
15280 'status' => $alias->status,
15282 } # End of looping through all this table's aliases
15287 sub pod_alphanumeric_sort {
15288 # Sort pod entries alphanumerically. This is a sort comparison routine:
# it reads the two operands from $a and $b and returns a negative, zero, or
# positive value. Internal-only names (leading underscore) sort last, and
# numeric property values under the same prefix sort numerically.
15290 # The first few character columns are filler, plus the '\p{'; and get rid
15291 # of all the trailing stuff, starting with the trailing '}', so as to sort
15292 # on just 'Name=Value'. The lexical copies are lowercased, so the
# comparison is case-insensitive.
15293 (my $a = lc $a) =~ s/^ .*? { //x;
15295 (my $b = lc $b) =~ s/^ .*? { //x;
15298 # Determine if the two operands are both internal only or both not.
15299 # Everything up through the first '{' has been removed above, so an
15300 # internal-only name now has its leading underscore at character 0.
15301 my $a_is_internal = (substr($a, 0, 1) eq '_');
15302 my $b_is_internal = (substr($b, 0, 1) eq '_');
15304 # Sort so the internals come last in the table instead of first (which the
15305 # leading underscore would otherwise indicate).
15306 if ($a_is_internal != $b_is_internal) {
15307 return 1 if $a_is_internal;
15311 # Determine if the two operands are numeric property values or not.
15312 # A numeric property will look like xyz: 3. But the number
15313 # can begin with an optional minus sign, and may have a
15314 # fraction or rational component, like xyz: 3/2. If either
15315 # isn't numeric, use alphabetic sort.
15316 my ($a_initial, $a_number) =
15317 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15318 return $a cmp $b unless defined $a_number;
15319 my ($b_initial, $b_number) =
15320 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15321 return $a cmp $b unless defined $b_number;
15323 # Here they are both numeric, but use alphabetic sort if the
15324 # initial parts don't match
15325 return $a cmp $b if $a_initial ne $b_initial;
15327 # Convert rationals to floating for the comparison. The captures above
# can only hold an optional minus sign, digits, and a single '.' or '/', so
# the string eval just performs the division.
15328 $a_number = eval $a_number if $a_number =~ qr{/};
15329 $b_number = eval $b_number if $b_number =~ qr{/};
15331 return $a_number <=> $b_number;
15335 # Create the .pod file. This generates the various subsections and then
15336 # combines them in one big HERE document.
15338 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15340 return unless defined $pod_directory;
15341 print "Making pod file\n" if $verbosity >= $PROGRESS;
15343 my $exception_message =
15344 '(Any exceptions are individually noted beginning with the word NOT.)';
15346 if (-e 'Blocks.txt') {
15348 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
15349 # if the global $has_In_conflicts indicates we have them.
15350 push @match_properties, format_pod_line($indent_info_column,
15353 . (($has_In_conflicts)
15354 ? " $exception_message"
15356 @block_warning = << "END";
15358 Matches in the Block property have shortcuts that begin with "In_". For
15359 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
15360 backward compatibility, if there is no conflict with another shortcut, these
15361 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
15362 are numerous such conflicting shortcuts. Use of these forms for Block is
15363 discouraged, and are flagged as such, not only because of the potential
15364 confusion as to what is meant, but also because a later release of Unicode may
15365 preempt the shortcut, and your program would no longer be correct. Use the
15366 "In_" form instead to avoid this, or even more clearly, use the compound form,
15367 e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
15371 my $text = $Is_flags_text;
15372 $text = "$exception_message $text" if $has_Is_conflicts;
15374 # And the 'Is_' line
15375 push @match_properties, format_pod_line($indent_info_column,
15379 # Sort the properties array for output. It is sorted alphabetically
15380 # except numerically for numeric properties, and only output unique lines.
15381 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15383 my $formatted_properties = simple_fold(\@match_properties,
15385 # indent succeeding lines by two extra
15386 # which looks better
15387 $indent_info_column + 2,
15389 # shorten the line length by how much
15390 # the formatter indents, so the folded
15391 # line will fit in the space
15392 # presumably available
15393 $automatic_pod_indent);
15394 # Add column headings, indented to be a little more centered, but not
15396 $formatted_properties = format_pod_line($indent_info_column,
15400 . $formatted_properties;
15402 # Generate pod documentation lines for the tables that match nothing
15403 my $zero_matches = "";
15404 if (@zero_match_tables) {
15405 @zero_match_tables = uniques(@zero_match_tables);
15406 $zero_matches = join "\n\n",
15407 map { $_ = '=item \p{' . $_->complete_name . "}" }
15408 sort { $a->complete_name cmp $b->complete_name }
15409 @zero_match_tables;
15411 $zero_matches = <<END;
15413 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15415 Unicode has some property-value pairs that currently don't match anything.
15416 This happens generally either because they are obsolete, or they exist for
15417 symmetry with other forms, but no language has yet been encoded that uses
15418 them. In this version of Unicode, the following match zero code points:
15429 # Generate list of properties that we don't accept, grouped by the reasons
15430 # why. This is so only put out the 'why' once, and then list all the
15431 # properties that have that reason under it.
15433 my %why_list; # The keys are the reasons; the values are lists of
15434 # properties that have the key as their reason
15436 # For each property, add it to the list that are suppressed for its reason
15437 # The sort will cause the alphabetically first properties to be added to
15438 # each list first, so each list will be sorted.
15439 foreach my $property (sort keys %why_suppressed) {
15440 push @{$why_list{$why_suppressed{$property}}}, $property;
15443 # For each reason (sorted by the first property that has that reason)...
15444 my @bad_re_properties;
15445 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15448 # Add to the output, all the properties that have that reason.
15449 my $has_item = 0; # Flag if actually output anything.
15450 foreach my $name (@{$why_list{$why}}) {
15452 # Split compound names into $property and $table components
15453 my $property = $name;
15455 if ($property =~ / (.*) = (.*) /x) {
15460 # This release of Unicode may not have a property that is
15461 # suppressed, so don't reference a non-existent one.
15462 $property = property_ref($property);
15463 next if ! defined $property;
15465 # And since this list is only for match tables, don't list the
15466 # ones that don't have match tables.
15467 next if ! $property->to_create_match_tables;
15469 # Find any abbreviation, and turn it into a compound name if this
15470 # is a property=value pair.
15471 my $short_name = $property->name;
15472 $short_name .= '=' . $property->table($table)->name if $table;
15474 # Start with an empty line.
15475 push @bad_re_properties, "\n\n" unless $has_item;
15477 # And add the property as an item for the reason.
15478 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15482 # And add the reason under the list of properties, if such a list
15483 # actually got generated. Note that the header got added
15484 # unconditionally before. But pod ignores extra blank lines, so no
15486 push @bad_re_properties, "\n$why\n" if $has_item;
15488 } # End of looping through each reason.
15490 if (! @bad_re_properties) {
15491 push @bad_re_properties,
15492 "*** This installation accepts ALL non-Unihan properties ***";
15495 # Add =over only if non-empty to avoid an empty =over/=back section,
15496 # which is considered bad form.
15497 unshift @bad_re_properties, "\n=over 4\n";
15498 push @bad_re_properties, "\n=back\n";
15501 # Similarly, generate a list of files that we don't use, grouped by the
15502 # reasons why. First, create a hash whose keys are the reasons, and whose
15503 # values are anonymous arrays of all the files that share that reason.
15504 my %grouped_by_reason;
15505 foreach my $file (keys %ignored_files) {
15506 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15508 foreach my $file (keys %skipped_files) {
15509 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15512 # Then, sort each group.
15513 foreach my $group (keys %grouped_by_reason) {
15514 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15515 @{$grouped_by_reason{$group}} ;
15518 # Finally, create the output text. For each reason (sorted by the
15519 # alphabetically first file that has that reason)...
15521 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15522 cmp lc $grouped_by_reason{$b}->[0]
15524 keys %grouped_by_reason)
15526 # Add all the files that have that reason to the output. Start
15527 # with an empty line.
15528 push @unused_files, "\n\n";
15529 push @unused_files, map { "\n=item F<$_> \n" }
15530 @{$grouped_by_reason{$reason}};
15531 # And add the reason under the list of files
15532 push @unused_files, "\n$reason\n";
15535 # Similarly, create the output text for the UCD section of the pod
15537 foreach my $key (keys %ucd_pod) {
15538 next unless $ucd_pod{$key}->{'output_this'};
15539 push @ucd_pod, format_pod_line($indent_info_column,
15540 $ucd_pod{$key}->{'name'},
15541 $ucd_pod{$key}->{'info'},
15542 $ucd_pod{$key}->{'status'},
15546 # Sort alphabetically, and fold for output
15547 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15548 my $ucd_pod = simple_fold(\@ucd_pod,
15550 $indent_info_column,
15551 $automatic_pod_indent);
15552 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
15557 # Everything is ready to assemble.
15558 my @OUT = << "END";
15563 To change this file, edit $0 instead.
15569 $pod_file - Index of Unicode Version $string_version character properties in Perl
15573 This document provides information about the portion of the Unicode database
15574 that deals with character properties, that is the portion that is defined on
15575 single code points. (L</Other information in the Unicode data base>
15576 below briefly mentions other data that Unicode provides.)
15578 Perl can provide access to all non-provisional Unicode character properties,
15579 though not all are enabled by default. The omitted ones are the Unihan
15580 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15581 deprecated or Unicode-internal properties. (An installation may choose to
15582 recompile Perl's tables to change this. See L<Unicode character
15583 properties that are NOT accepted by Perl>.)
15585 For most purposes, access to Unicode properties from the Perl core is through
15586 regular expression matches, as described in the next section.
15587 For some special purposes, and to access the properties that are not suitable
15588 for regular expression matching, all the Unicode character properties that
15589 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15590 described in the section L</Properties accessible through Unicode::UCD>.
15592 Perl also provides some additional extensions and short-cut synonyms
15593 for Unicode properties.
15595 This document merely lists all available properties and does not attempt to
15596 explain what each property really means. There is a brief description of each
15597 Perl extension; see L<perlunicode/Other Properties> for more information on
15598 these. There is some detail about Blocks, Scripts, General_Category,
15599 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15600 official Unicode properties, refer to the Unicode standard. A good starting
15601 place is L<$unicode_reference_url>.
15603 Note that you can define your own properties; see
15604 L<perlunicode/"User-Defined Character Properties">.
15606 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15608 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15609 most of the Unicode character properties. The table below shows all these
15610 constructs, both single and compound forms.
15612 B<Compound forms> consist of two components, separated by an equals sign or a
15613 colon. The first component is the property name, and the second component is
15614 the particular value of the property to match against, for example,
15615 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15616 whose Script property value is Greek.
15618 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15619 their equivalent compound forms. The table shows these equivalences. (In our
15620 example, C<\\p{Greek}> is just a shortcut for C<\\p{Script=Greek}>.)
15621 There are also a few Perl-defined single forms that are not shortcuts for a
15622 compound form. One such is C<\\p{Word}>. These are also listed in the table.
15624 In parsing these constructs, Perl always ignores Upper/lower case differences
15625 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
15626 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
15627 the left brace completely changes the meaning of the construct, from "match"
15628 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
15629 for improved legibility.
15631 Also, white space, hyphens, and underscores are normally ignored
15632 everywhere between the {braces}, and hence can be freely added or removed
15633 even if the C</x> modifier hasn't been specified on the regular expression.
15634 But in the table below $a_bold_stricter at the beginning of an entry
15635 means that tighter (stricter) rules are used for that entry:
15641 =item Single form (C<\\p{name}>) tighter rules:
15643 White space, hyphens, and underscores ARE significant
15648 =item * white space adjacent to a non-word character
15650 =item * underscores separating digits in numbers
15654 That means, for example, that you can freely add or remove white space
15655 adjacent to (but within) the braces without affecting the meaning.
15657 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
15659 The tighter rules given above for the single form apply to everything to the
15660 right of the colon or equals; the looser rules still apply to everything to
15663 That means, for example, that you can freely add or remove white space
15664 adjacent to (but within) the braces and the colon or equal sign.
15670 Some properties are considered obsolete by Unicode, but still available.
15671 There are several varieties of obsolescence:
15679 A property may be stabilized. Such a determination does not indicate
15680 that the property should or should not be used; instead it is a declaration
15681 that the property will not be maintained nor extended for newly encoded
15682 characters. Such properties are marked with $a_bold_stabilized in the
15687 A property may be deprecated, perhaps because its original intent
15688 has been replaced by another property, or because its specification was
15689 somehow defective. This means that its use is strongly
15690 discouraged, so much so that a warning will be issued if used, unless the
15691 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
15692 statement. $A_bold_deprecated flags each such entry in the table, and
15693 the entry there for the longest, most descriptive version of the property will
15694 give the reason it is deprecated, and perhaps advice. Perl may issue such a
15695 warning, even for properties that aren't officially deprecated by Unicode,
15696 when there used to be characters or code points that were matched by them, but
15697 no longer. This is to warn you that your program may not work like it did on
15698 earlier Unicode releases.
15700 A deprecated property may be made unavailable in a future Perl version, so it
15701 is best to move away from using any such property.
15703 A deprecated property may also be stabilized, but this fact is not shown.
15707 Properties marked with $a_bold_obsolete in the table are considered (plain)
15708 obsolete. Generally this designation is given to properties that Unicode once
15709 used for internal purposes (but not any longer).
15713 Some Perl extensions are present for backwards compatibility and are
15714 discouraged from being used, but are not obsolete. $A_bold_discouraged
15715 flags each such entry in the table. Future Unicode versions may force
15716 some of these extensions to be removed without warning, replaced by another
15717 property with the same name that means something different. Use the
15718 equivalent shown instead.
15724 The table below has two columns. The left column contains the C<\\p{}>
15725 constructs to look up, possibly preceded by the flags mentioned above; and
15726 the right column contains information about them, like a description, or
15727 synonyms. The table shows both the single and compound forms for each
15728 property that has them. If the left column is a short name for a property,
15729 the right column will give its longer, more descriptive name; and if the left
15730 column is the longest name, the right column will show any equivalent shortest
15731 name, in both single and compound forms if applicable.
15733 The right column will also caution you if a property means something different
15734 than what might normally be expected.
15736 All single forms are Perl extensions; a few compound forms are as well, and
15739 Numbers in (parentheses) indicate the total number of Unicode code points
15740 matched by the property. For emphasis, those properties that match no code
15741 points at all are listed as well in a separate section following the table.
15743 Most properties match the same code points regardless of whether C<"/i">
15744 case-insensitive matching is specified or not. But a few properties are
15745 affected. These are shown with the notation S<C<(/i= I<other_property>)>>
15746 in the second column. Under case-insensitive matching they match the
15747 same code points as the property I<other_property>.
15749 There is no description given for most non-Perl defined properties (See
15750 L<$unicode_reference_url> for that).
15752 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
15753 combinations. For example, entries like:
15755 \\p{Gc: *} \\p{General_Category: *}
15757 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
15758 for the latter is also valid for the former. Similarly,
15762 means that if and only if, for example, C<\\p{Foo}> exists, then
15763 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
15764 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
15765 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
15768 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
15769 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
15770 'N*' to indicate this, and doesn't have separate entries for the other
15771 possibilities. Note that not all properties which have values 'Yes' and 'No'
15772 are binary, and they have all their values spelled out without using this wild
15773 card, and a C<NOT> clause in their description that highlights their not being
15774 binary. These also require the compound form to match them, whereas true
15775 binary properties have both single and compound forms available.
15777 Note that all non-essential underscores are removed in the display of the
15786 B<*> is a wild-card
15790 B<(\\d+)> in the info column gives the number of Unicode code points matched
15795 B<$DEPRECATED> means this is deprecated.
15799 B<$OBSOLETE> means this is obsolete.
15803 B<$STABILIZED> means this is stabilized.
15807 B<$STRICTER> means tighter (stricter) name matching applies.
15811 B<$DISCOURAGED> means use of this form is discouraged, and may not be
15816 $formatted_properties
15820 =head1 Properties accessible through Unicode::UCD
15822 All the Unicode character properties mentioned above (except for those marked
15823 as for internal use by Perl) are also accessible by
15824 L<Unicode::UCD/prop_invlist()>.
15826 Due to their nature, not all Unicode character properties are suitable for
15827 regular expression matches, nor C<prop_invlist()>. The remaining
15828 non-provisional, non-internal ones are accessible via
15829 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
15830 hasn't included; see L<below for which those are|/Unicode character properties
15831 that are NOT accepted by Perl>).
15833 For compatibility with other parts of Perl, all the single forms given in the
15834 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
15835 are recognized. BUT, there are some ambiguities between some Perl extensions
15836 and the Unicode properties, all of which are silently resolved in favor of the
15837 official Unicode property. To avoid surprises, you should only use
15838 C<prop_invmap()> for forms listed in the table below, which omits the
15839 non-recommended ones. The affected forms are the Perl single form equivalents
15840 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
15841 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
15842 whose short name is C<sc>. The table indicates the current ambiguities in the
15843 INFO column, beginning with the word C<"NOT">.
15845 The standard Unicode properties listed below are documented in
15846 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
15847 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
15848 L<perlunicode/Other Properties>.
15850 The first column in the table is a name for the property; the second column is
15851 an alternative name, if any, plus possibly some annotations. The alternative
15852 name is the property's full name, unless that would simply repeat the first
15853 column, in which case the second column indicates the property's short name
15854 (if different). The annotations are given only in the entry for the full
15855 name. If a property is obsolete, etc, the entry will be flagged with the same
15856 characters used in the table in the L<section above|/Properties accessible
15857 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
15861 =head1 Properties accessible through other means
15863 Certain properties are accessible also via core function calls. These are:
15865 Lowercase_Mapping lc() and lcfirst()
15866 Titlecase_Mapping ucfirst()
15867 Uppercase_Mapping uc()
15869 Also, Case_Folding is accessible through the C</i> modifier in regular
15870 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
15873 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
15874 interpolation in double-quoted strings and regular expressions; and functions
15875 C<charnames::viacode()>, C<charnames::vianame()>, and
15876 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
15879 Finally, most properties related to decomposition are accessible via
15880 L<Unicode::Normalize>.
15882 =head1 Unicode character properties that are NOT accepted by Perl
15884 Perl will generate an error for a few character properties in Unicode when
15885 used in a regular expression. The non-Unihan ones are listed below, with the
15886 reasons they are not accepted, perhaps with work-arounds. The short names for
15887 the properties are listed enclosed in (parentheses).
15888 As described after the list, an installation can change the defaults and choose
15889 to accept any of these. The list is machine generated based on the
15890 choices made for the installation that generated this document.
15894 An installation can choose to allow any of these to be matched by downloading
15895 the Unicode database from L<http://www.unicode.org/Public/> to
15896 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
15897 controlling lists contained in the program
15898 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
15899 (C<\%Config> is available from the Config module).
15901 =head1 Other information in the Unicode data base
15903 The Unicode data base is delivered in two different formats. The XML version
15904 is valid for more modern Unicode releases. The other version is a collection
15905 of files. The two are intended to give equivalent information. Perl uses the
15906 older form; this allows you to recompile Perl to use early Unicode releases.
15908 The only non-character property that Perl currently supports is Named
15909 Sequences, in which a sequence of code points
15910 is given a name and generally treated as a single entity. (Perl supports
15911 these via the C<\\N{...}> double-quotish construct,
15912 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.)
15914 Below is a list of the files in the Unicode data base that Perl doesn't
15915 currently use, along with very brief descriptions of their purposes.
15916 Some of the names of the files have been shortened from those that Unicode
15917 uses, in order to allow them to be distinguishable from similarly named files
15918 on file systems for which only the first 8 characters of a name are
15929 L<$unicode_reference_url>
15937 # And write it. The 0 means no utf8.
15938 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
15942 sub make_Heavy () {
# Takes no arguments.  Serializes several lookup hashes that are declared
# outside this sub into text with simple_dumper(), and passes \@heavy to
# main::write() under the file name "Heavy.pl".
15943 # Create and write Heavy.pl, which passes info about the tables to
15946 # Stringify structures for output
# Every dump below is made with a four-character indent string (' ' x 4) and
# then chomp'ed, so that the text can sit directly between the parentheses of
# the corresponding '%utf8::... = (' assignment in the generated file.
15947 my $loose_property_name_of
15948 = simple_dumper(\%loose_property_name_of, ' ' x 4);
15949 chomp $loose_property_name_of;
15951 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
15952 chomp $stricter_to_file_of;
15954 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
15955 chomp $loose_to_file_of;
15957 my $nv_floating_to_rational
15958 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
15959 chomp $nv_floating_to_rational;
# Unlike the other inputs, this hash is read from package utf8 by its fully
# qualified name.
15961 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
15962 chomp $why_deprecated;
15964 # We set the key to the file when we associated files with tables, but we
15965 # couldn't do the same for the value then, as we might not have the file
15966 # for the alternate table figured out at that time.
# The loop replaces each value of %caseless_equivalent_to in place: the object
# stored there is asked for its file_path, and elements 1 and -1 of that list
# (the second and the last) are joined with '/' to form the new string value.
# After this loop the hash holds strings, not objects.
15967 foreach my $cased (keys %caseless_equivalent_to) {
15968 my @path = $caseless_equivalent_to{$cased}->file_path;
15969 my $path = join '/', @path[1, -1];
15970 $caseless_equivalent_to{$cased} = $path;
15972 my $caseless_equivalent_to
15973 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
15974 chomp $caseless_equivalent_to;
15976 my $loose_property_to_file_of
15977 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
15978 chomp $loose_property_to_file_of;
15980 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
15981 chomp $file_to_swash_name;
15985 $INTERNAL_ONLY_HEADER
15987 # This file is for the use of utf8_heavy.pl and Unicode::UCD
15989 # Maps Unicode (not Perl single-form extensions) property names in loose
15990 # standard form to their corresponding standard names
15991 \%utf8::loose_property_name_of = (
15992 $loose_property_name_of
15995 # Maps property, table to file for those using stricter matching
15996 \%utf8::stricter_to_file_of = (
15997 $stricter_to_file_of
16000 # Maps property, table to file for those using loose matching
16001 \%utf8::loose_to_file_of = (
16005 # Maps floating point to fractional form
16006 \%utf8::nv_floating_to_rational = (
16007 $nv_floating_to_rational
16010 # If a floating point number doesn't have enough digits in it to get this
16011 # close to a fraction, it isn't considered to be that fraction even if all the
16012 # digits it does have match.
16013 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
16015 # Deprecated tables to generate a warning for. The key is the file containing
16016 # the table, so as to avoid duplication, as many property names can map to the
16017 # file, but we only need one entry for all of them.
16018 \%utf8::why_deprecated = (
16022 # A few properties have different behavior under /i matching. This maps
16023 # those to substitute files to use under /i.
16024 \%utf8::caseless_equivalent = (
16025 $caseless_equivalent_to
16028 # Property names to mapping files
16029 \%utf8::loose_property_to_file_of = (
16030 $loose_property_to_file_of
16033 # Files to the swash names within them.
16034 \%utf8::file_to_swash_name = (
16035 $file_to_swash_name
16041 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
16045 sub make_Name_pm () {
# Takes no arguments.  The text of the generated module is accumulated in
# @name (see the 'push @name' statements below) and handed to main::write()
# under the file name "Name.pm" as the last step.
16046 # Create and write Name.pm, which contains subroutines and data to use in
16047 # conjunction with Name.pl
16049 # Maybe there's nothing to do.
# That is, return without writing anything when $has_hangul_syllables is false
# and @code_points_ending_in_code_point is empty.
16050 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
16054 $INTERNAL_ONLY_HEADER
16057 # Convert these structures to output format.
16058 my $code_points_ending_in_code_point =
16059 main::simple_dumper(\@code_points_ending_in_code_point,
16061 my $names = main::simple_dumper(\%names_ending_in_code_point,
16063 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
16066 # Do the same with the Hangul names,
16072 if ($has_hangul_syllables) {
16074 # Construct a regular expression of all the possible
16075 # combinations of the Hangul syllables.
16076 my @L_re; # Leading consonants
16077 for my $i ($LBase .. $LBase + $LCount - 1) {
16078 push @L_re, $Jamo{$i}
16080 my @V_re; # Middle vowels
16081 for my $i ($VBase .. $VBase + $VCount - 1) {
16082 push @V_re, $Jamo{$i}
16084 my @T_re; # Trailing consonants
# This loop starts at $TBase + 1, whereas the L and V loops above start at
# their base value.
# NOTE(review): presumably index $TBase itself stands for "no trailing
# consonant" -- the generated code further down appends $Jamo{$T} only when
# $T != $TBase.  Confirm against the code that reads Jamo.txt.
16085 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
16086 push @T_re, $Jamo{$i}
16089 # The whole re is made up of the L V T combination.
16091 . join ('|', sort @L_re)
16093 . join ('|', sort @V_re)
16095 . join ('|', sort @T_re)
16098 # These hashes needed by the algorithm were generated
16099 # during reading of the Jamo.txt file
16100 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
16101 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
16102 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
16103 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
16110 # This module contains machine-generated tables and code for the
16111 # algorithmically-determinable Unicode character names. The following
16112 # routines can be used to translate between name and code point and vice versa
16116 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
16117 # two must be 10; if there are 5, the first must not be a 0. Written this
16118 # way to decrease backtracking. The first regex allows the code point to
16119 # be at the end of a word, but to work properly, the word shouldn't end
16120 # with a valid hex character. The second one won't match a code point at
16121 # the end of a word, and doesn't have the run-on issue
16122 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
16123 my \$code_point_re = qr/$code_point_re/;
16125 # In the following hash, the keys are the bases of names which include
16126 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
16127 # of each key is another hash which is used to get the low and high ends
16128 # for each range of code points that apply to the name.
16129 my %names_ending_in_code_point = (
16133 # The following hash is a copy of the previous one, except is for loose
16134 # matching, so each name has blanks and dashes squeezed out
16135 my %loose_names_ending_in_code_point = (
16139 # And the following array gives the inverse mapping from code points to
16140 # names. Lowest code points are first
16141 my \@code_points_ending_in_code_point = (
16142 $code_points_ending_in_code_point
16145 # Earlier releases didn't have Jamos. No sense outputting
16146 # them unless will be used.
16147 if ($has_hangul_syllables) {
16150 # Convert from code point to Jamo short name for use in composing Hangul
16156 # Leading consonant (can be null)
16166 # Optional trailing consonant
16171 # Computed re that splits up a Hangul name into LVT or LV syllables
16172 my \$syllable_re = qr/$jamo_re/;
16174 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
16175 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
16177 # These constants names and values were taken from the Unicode standard,
16178 # version 5.1, section 3.12. They are used in conjunction with Hangul
16180 my \$SBase = $SBase_string;
16181 my \$LBase = $LBase_string;
16182 my \$VBase = $VBase_string;
16183 my \$TBase = $TBase_string;
16184 my \$SCount = $SCount;
16185 my \$LCount = $LCount;
16186 my \$VCount = $VCount;
16187 my \$TCount = $TCount;
16188 my \$NCount = \$VCount * \$TCount;
16190 } # End of has Jamos
16192 push @name, << 'END';
16194 sub name_to_code_point_special {
16195 my ($name, $loose) = @_;
16197 # Returns undef if not one of the specially handled names; otherwise
16198 # returns the code point equivalent to the input name
16199 # $loose is non-zero if to use loose matching, 'name' in that case
16200 # must be input as upper case with all blanks and dashes squeezed out.
16202 if ($has_hangul_syllables) {
16203 push @name, << 'END';
16205 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
16206 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
16208 return if $name !~ qr/^$syllable_re$/;
16209 my $L = $Jamo_L{$1};
16210 my $V = $Jamo_V{$2};
16211 my $T = (defined $3) ? $Jamo_T{$3} : 0;
16212 return ($L * $VCount + $V) * $TCount + $T + $SBase;
16216 push @name, << 'END';
16218 # Name must end in 'code_point' for this to handle.
16219 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
16220 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
16223 my $code_point = CORE::hex $2;
16227 $names_ref = \%loose_names_ending_in_code_point;
16230 return if $base !~ s/-$//;
16231 $names_ref = \%names_ending_in_code_point;
16234 # Name must be one of the ones which has the code point in it.
16235 return if ! $names_ref->{$base};
16237 # Look through the list of ranges that apply to this name to see if
16238 # the code point is in one of them.
16239 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
16240 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
16241 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
16243 # Here, the code point is in the range.
16244 return $code_point;
16247 # Here, looked like the name had a code point number in it, but
16248 # did not match one of the valid ones.
16252 sub code_point_to_name_special {
16253 my $code_point = shift;
16255 # Returns the name of a code point if algorithmically determinable;
16258 if ($has_hangul_syllables) {
16259 push @name, << 'END';
16261 # If in the Hangul range, calculate the name based on Unicode's
16263 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
16265 my $SIndex = $code_point - $SBase;
16266 my $L = $LBase + $SIndex / $NCount;
16267 my $V = $VBase + ($SIndex % $NCount) / $TCount;
16268 my $T = $TBase + $SIndex % $TCount;
16269 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
16270 $name .= $Jamo{$T} if $T != $TBase;
16275 push @name, << 'END';
16277 # Look through list of these code points for one in range.
16278 foreach my $hash (@code_points_ending_in_code_point) {
16279 return if $code_point < $hash->{'low'};
16280 if ($code_point <= $hash->{'high'}) {
16281 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
16284 return; # None found
16291 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
16296 # Create and write UCD.pl, which passes info about the tables to
16299 # Create a mapping from each alias of Perl single-form extensions to all
16300 # its equivalent aliases, for quick look-up.
16301 my %perlprop_to_aliases;
16302 foreach my $table ($perl->tables) {
16304 # First create the list of the aliases of each extension
16305 my @aliases_list; # List of legal aliases for this extension
16307 my $table_name = $table->name;
16308 my $standard_table_name = standardize($table_name);
16309 my $table_full_name = $table->full_name;
16310 my $standard_table_full_name = standardize($table_full_name);
16312 # Make sure that the list has both the short and full names
16313 push @aliases_list, $table_name, $table_full_name;
16315 my $found_ucd = 0; # ? Did we actually get an alias that should be
16316 # output for this table
16318 # Go through all the aliases (including the two just added), and add
16319 # any new unique ones to the list
16320 foreach my $alias ($table->aliases) {
16322 # Skip non-legal names
16323 next unless $alias->ok_as_filename;
16324 next unless $alias->ucd;
16326 $found_ucd = 1; # have at least one legal name
16328 my $name = $alias->name;
16329 my $standard = standardize($name);
16331 # Don't repeat a name that is equivalent to one already on the
16333 next if $standard eq $standard_table_name;
16334 next if $standard eq $standard_table_full_name;
16336 push @aliases_list, $name;
16339 # If there were no legal names, don't output anything.
16340 next unless $found_ucd;
16342 # To conserve memory in the program reading these in, omit full names
16343 # that are identical to the short name, when those are the only two
16344 # aliases for the property.
16345 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16349 # Here, @aliases_list is the list of all the aliases that this
16350 # extension legally has. Now can create a map to it from each legal
16351 # standardized alias
16352 foreach my $alias ($table->aliases) {
16353 next unless $alias->ucd;
16354 next unless $alias->ok_as_filename;
16355 push @{$perlprop_to_aliases{standardize($alias->name)}},
16360 # Make a list of all combinations of properties/values that are suppressed.
16362 if (! $debug_skip) { # This tends to fail in this debug mode
16363 foreach my $property_name (keys %why_suppressed) {
16366 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16368 # The hash may contain properties not in this release of Unicode
16369 next unless defined (my $property = property_ref($property_name));
16371 # Find all combinations
16372 foreach my $prop_alias ($property->aliases) {
16373 my $prop_alias_name = standardize($prop_alias->name);
16375 # If no =value, there's just one combination possible for this
16376 if (! $value_name) {
16378 # The property may be suppressed, but there may be a proxy
16379 # for it, so it shouldn't be listed as suppressed
16380 next if $prop_alias->ucd;
16381 push @suppressed, $prop_alias_name;
16384 foreach my $value_alias
16385 ($property->table($value_name)->aliases)
16387 next if $value_alias->ucd;
16389 push @suppressed, "$prop_alias_name="
16390 . standardize($value_alias->name);
16396 @suppressed = sort @suppressed; # So doesn't change between runs of this
16399 # Convert the structure below (designed for Name.pm) to a form that UCD
16400 # wants, so it doesn't have to modify it at all; i.e. so that it includes
16401 # an element for the Hangul syllables in the appropriate place, and
16402 # otherwise changes the name to include the "-<code point>" suffix.
16403 my @algorithm_names;
16404 my $done_hangul = 0;
16406 # Copy it linearly.
16407 for my $i (0 .. @code_points_ending_in_code_point - 1) {
16409 # Insert the hanguls in the correct place.
16411 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16414 push @algorithm_names, { low => $SBase,
16415 high => $SBase + $SCount - 1,
16416 name => '<hangul syllable>',
16420 # Copy the current entry, modified.
16421 push @algorithm_names, {
16422 low => $code_points_ending_in_code_point[$i]->{'low'},
16423 high => $code_points_ending_in_code_point[$i]->{'high'},
16425 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16429 # Serialize these structures for output.
16430 my $loose_to_standard_value
16431 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16432 chomp $loose_to_standard_value;
16434 my $string_property_loose_to_name
16435 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16436 chomp $string_property_loose_to_name;
16438 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16439 chomp $perlprop_to_aliases;
16441 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16442 chomp $prop_aliases;
16444 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16445 chomp $prop_value_aliases;
16447 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16450 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16451 chomp $algorithm_names;
16453 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16454 chomp $ambiguous_names;
16456 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16457 chomp $loose_defaults;
16461 $INTERNAL_ONLY_HEADER
16463 # This file is for the use of Unicode::UCD
16465 # Highest legal Unicode code point
16466 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16469 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16470 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16472 # Keys are all the possible "prop=value" combinations, in loose form; values
16473 # are the standard loose name for the 'value' part of the key
16474 \%Unicode::UCD::loose_to_standard_value = (
16475 $loose_to_standard_value
16478 # String property loose names to standard loose name
16479 \%Unicode::UCD::string_property_loose_to_name = (
16480 $string_property_loose_to_name
16483 # Keys are Perl extensions in loose form; values are each one's list of
16485 \%Unicode::UCD::loose_perlprop_to_name = (
16486 $perlprop_to_aliases
16489 # Keys are standard property name; values are each one's aliases
16490 \%Unicode::UCD::prop_aliases = (
16494 # Keys of top level are standard property name; values are keys to another
16495 # hash. Each one is one of the property's values, in standard form. The
16496 # values are that prop-val's aliases. If only one specified, the short and
16497 # long alias are identical.
16498 \%Unicode::UCD::prop_value_aliases = (
16499 $prop_value_aliases
16502 # Ordered (by code point ordinal) list of the ranges of code points whose
16503 # names are algorithmically determined. Each range entry is an anonymous hash
16504 # of the start and end points and a template for the names within it.
16505 \@Unicode::UCD::algorithmic_named_code_points = (
16509 # The properties that as-is have two meanings, and which must be disambiguated
16510 \%Unicode::UCD::ambiguous_names = (
16514 # Keys are the prop-val combinations which are the default values for the
16515 # given property, expressed in standard loose form
16516 \%Unicode::UCD::loose_defaults = (
16520 # All combinations of names that are suppressed.
16521 # This is actually for UCD.t, so it knows which properties shouldn't have
16522 # entries. If it got any bigger, would probably want to put it in its own
16523 # file to use memory only when it was needed, in testing.
16524 \@Unicode::UCD::suppressed_properties = (
16531 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
16535 sub write_all_tables() {
16536 # Write out all the tables generated by this program to files, as well as
16537 # the supporting data structures, pod file, and .t file.
16539 my @writables; # List of tables that actually get written
16540 my %match_tables_to_write; # Used to collapse identical match tables
16541 # into one file. Each key is a hash function
16542 # result to partition tables into buckets.
16543 # Each value is an array of the tables that
16544 # fit in the bucket.
16546 # For each property ...
16547 # (sort so that if there is an immutable file name, it has precedence, so
16548 # some other property can't come in and take over its file name. (We
16549 # don't care if both defined, as they had better be different anyway.)
16550 # The property named 'Perl' needs to be first (it doesn't have any
16551 # immutable file name) because empty properties are defined in terms of
16552 # its table named 'Any'.) We also sort by the property's name. This is
16553 # just for repeatability of the outputs between runs of this program, but
16554 # does not affect correctness.
16556 foreach my $property ($perl,
16557 sort { return -1 if defined $a->file;
16558 return 1 if defined $b->file;
16559 return $a->name cmp $b->name;
16560 } grep { $_ != $perl } property_ref('*'))
16562 my $type = $property->type;
16564 # And for each table for that property, starting with the mapping
16567 foreach my $table($property,
16569 # and all the match tables for it (if any), sorted so
16570 # the ones with the shortest associated file name come
16571 # first. The length sorting prevents problems of a
16572 # longer file taking a name that might have to be used
16573 # by a shorter one. The alphabetic sorting prevents
16574 # differences between releases
16575 sort { my $ext_a = $a->external_name;
16576 return 1 if ! defined $ext_a;
16577 my $ext_b = $b->external_name;
16578 return -1 if ! defined $ext_b;
16580 # But return the non-complement table before
16581 # the complement one, as the latter is defined
16582 # in terms of the former, and needs to have
16583 # the information for the former available.
16584 return 1 if $a->complement != 0;
16585 return -1 if $b->complement != 0;
16587 # Similarly, return a subservient table after
16589 return 1 if $a->leader != $a;
16590 return -1 if $b->leader != $b;
16592 my $cmp = length $ext_a <=> length $ext_b;
16594 # Return result if lengths not equal
16595 return $cmp if $cmp;
16597 # Alphabetic if lengths equal
16598 return $ext_a cmp $ext_b
16599 } $property->tables
16603 # Here we have a table associated with a property. It could be
16604 # the map table (done first for each property), or one of the
16605 # other tables. Determine which type.
16606 my $is_property = $table->isa('Property');
16608 my $name = $table->name;
16609 my $complete_name = $table->complete_name;
16611 # See if should suppress the table if is empty, but warn if it
16612 # contains something.
16613 my $suppress_if_empty_warn_if_not
16614 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
16616 # Calculate if this table should have any code points associated
16618 my $expected_empty =
16620 # $perl should be empty, as well as properties that we just
16621 # don't do anything with
16623 && ($table == $perl
16624 || grep { $complete_name eq $_ }
16625 @unimplemented_properties
16629 # Match tables in properties we skipped populating should be
16631 || (! $is_property && ! $property->to_create_match_tables)
16633 # Tables and properties that are expected to have no code
16634 # points should be empty
16635 || $suppress_if_empty_warn_if_not
16638 # Set a boolean if this table is the complement of an empty binary
16640 my $is_complement_of_empty_binary =
16641 $type == $BINARY &&
16642 (($table == $property->table('Y')
16643 && $property->table('N')->is_empty)
16644 || ($table == $property->table('N')
16645 && $property->table('Y')->is_empty));
16647 if ($table->is_empty) {
16649 if ($suppress_if_empty_warn_if_not) {
16650 $table->set_fate($SUPPRESSED,
16651 $suppress_if_empty_warn_if_not);
16654 # Suppress (by skipping them) expected empty tables.
16655 next TABLE if $expected_empty;
16657 # And setup to later output a warning for those that aren't
16658 # known to be allowed to be empty. Don't do the warning if
16659 # this table is a child of another one to avoid duplicating
16660 # the warning that should come from the parent one.
16661 if (($table == $property || $table->parent == $table)
16662 && $table->fate != $SUPPRESSED
16663 && $table->fate != $MAP_PROXIED
16664 && ! grep { $complete_name =~ /^$_$/ }
16665 @tables_that_may_be_empty)
16667 push @unhandled_properties, "$table";
16670 # An empty table is just the complement of everything.
16671 $table->set_complement($Any) if $table != $property;
16673 elsif ($expected_empty) {
16675 if ($suppress_if_empty_warn_if_not) {
16676 $because = " because $suppress_if_empty_warn_if_not";
16679 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
16682 # Some tables should match everything
16683 my $expected_full =
16684 ($table->fate == $SUPPRESSED)
16687 ? # All these types of map tables will be full because
16688 # they will have been populated with defaults
16689 ($type == $ENUM || $type == $FORCED_BINARY)
16691 : # A match table should match everything if its method
16693 ($table->matches_all
16695 # The complement of an empty binary table will match
16697 || $is_complement_of_empty_binary
16701 my $count = $table->count;
16702 if ($expected_full) {
16703 if ($count != $MAX_UNICODE_CODEPOINTS) {
16704 Carp::my_carp("$table matches only "
16705 . clarify_number($count)
16706 . " Unicode code points but should match "
16707 . clarify_number($MAX_UNICODE_CODEPOINTS)
16709 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
16710 . "). Proceeding anyway.");
16713 # Here is expected to be full. If it is because it is the
16714 # complement of an (empty) binary table that is to be
16715 # suppressed, then suppress this one as well.
16716 if ($is_complement_of_empty_binary) {
16717 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
16718 my $opposing = $property->table($opposing_name);
16719 my $opposing_status = $opposing->status;
16720 if ($opposing_status) {
16721 $table->set_status($opposing_status,
16722 $opposing->status_info);
16726 elsif ($count == $MAX_UNICODE_CODEPOINTS
16727 && ($table == $property || $table->leader == $table)
16728 && $table->property->status ne $NORMAL)
16730 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
16733 if ($table->fate >= $SUPPRESSED) {
16734 if (! $is_property) {
16735 my @children = $table->children;
16736 foreach my $child (@children) {
16737 if ($child->fate < $SUPPRESSED) {
16738 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
16746 if (! $is_property) {
16748 make_ucd_table_pod_entries($table) if $table->property == $perl;
16750 # Several things need to be done just once for each related
16751 # group of match tables. Do them on the parent.
16752 if ($table->parent == $table) {
16754 # Add an entry in the pod file for the table; it also does
16756 make_re_pod_entries($table) if defined $pod_directory;
16758 # See if the table matches identical code points with
16759 # something that has already been output. In that case,
16760 # no need to have two files with the same code points in
16761 # them. We use the table's hash() method to store these
16762 # in buckets, so that it is quite likely that if two
16763 # tables are in the same bucket they will be identical, so
16764 # don't have to compare tables frequently. The tables
16765 # have to have the same status to share a file, so add
16766 # this to the bucket hash. (The reason for this latter is
16767 # that Heavy.pl associates a status with a file.)
16768 # We don't check tables that are inverses of others, as it
16769 # would lead to some coding complications, and checking
16770 # all the regular ones should find everything.
16771 if ($table->complement == 0) {
16772 my $hash = $table->hash . ';' . $table->status;
16774 # Look at each table that is in the same bucket as
16775 # this one would be.
16776 foreach my $comparison
16777 (@{$match_tables_to_write{$hash}})
16779 if ($table->matches_identically_to($comparison)) {
16780 $table->set_equivalent_to($comparison,
16786 # Here, not equivalent, add this table to the bucket.
16787 push @{$match_tables_to_write{$hash}}, $table;
16793 # Here is the property itself.
16794 # Don't write out or make references to the $perl property
16795 next if $table == $perl;
16797 make_ucd_table_pod_entries($table);
16799 # There is a mapping stored of the various synonyms to the
16800 # standardized name of the property for utf8_heavy.pl.
16801 # Also, the pod file contains entries of the form:
16802 # \p{alias: *} \p{full: *}
16803 # rather than show every possible combination of things.
16805 my @property_aliases = $property->aliases;
16807 my $full_property_name = $property->full_name;
16808 my $property_name = $property->name;
16809 my $standard_property_name = standardize($property_name);
16810 my $standard_property_full_name
16811 = standardize($full_property_name);
16813 # We also create for Unicode::UCD a list of aliases for
16814 # the property. The list starts with the property name;
16815 # then its full name. Legacy properties are not listed in
16819 if ( $property->fate <= $MAP_PROXIED) {
16820 @property_list = ($property_name, $full_property_name);
16821 @standard_list = ($standard_property_name,
16822 $standard_property_full_name);
16825 # For each synonym ...
16826 for my $i (0 .. @property_aliases - 1) {
16827 my $alias = $property_aliases[$i];
16828 my $alias_name = $alias->name;
16829 my $alias_standard = standardize($alias_name);
16832 # Add other aliases to the list of property aliases
16833 if ($property->fate <= $MAP_PROXIED
16834 && ! grep { $alias_standard eq $_ } @standard_list)
16836 push @property_list, $alias_name;
16837 push @standard_list, $alias_standard;
16840 # For utf8_heavy, set the mapping of the alias to the
16842 if ($type == $STRING) {
16843 if ($property->fate <= $MAP_PROXIED) {
16844 $string_property_loose_to_name{$alias_standard}
16845 = $standard_property_name;
16849 if (exists ($loose_property_name_of{$alias_standard}))
16851 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
16854 $loose_property_name_of{$alias_standard}
16855 = $standard_property_name;
16858 # Now for the re pod entry for this alias. Skip if not
16859 # outputting a pod; skip the first one, which is the
16860 # full name so won't have an entry like: '\p{full: *}
16861 # \p{full: *}', and skip if don't want an entry for
16864 || ! defined $pod_directory
16865 || ! $alias->make_re_pod_entry;
16867 my $rhs = "\\p{$full_property_name: *}";
16868 if ($property != $perl && $table->perl_extension) {
16869 $rhs .= ' (Perl extension)';
16871 push @match_properties,
16872 format_pod_line($indent_info_column,
16873 '\p{' . $alias->name . ': *}',
16879 # The list of all possible names is attached to each alias, so
16881 if (@property_list) {
16882 push @{$prop_aliases{$standard_list[0]}}, @property_list;
16885 if ($property->fate <= $MAP_PROXIED) {
16887 # Similarly, we create for Unicode::UCD a list of
16888 # property-value aliases.
16890 my $property_full_name = $property->full_name;
16892 # Look at each table in the property...
16893 foreach my $table ($property->tables) {
16895 my $table_full_name = $table->full_name;
16896 my $standard_table_full_name
16897 = standardize($table_full_name);
16898 my $table_name = $table->name;
16899 my $standard_table_name = standardize($table_name);
16901 # The list starts with the table name and its full
16903 push @values_list, $table_name, $table_full_name;
16905 # We add to the table each unique alias that isn't
16906 # discouraged from use.
16907 foreach my $alias ($table->aliases) {
16908 next if $alias->status
16909 && $alias->status eq $DISCOURAGED;
16910 my $name = $alias->name;
16911 my $standard = standardize($name);
16912 next if $standard eq $standard_table_name;
16913 next if $standard eq $standard_table_full_name;
16914 push @values_list, $name;
16917 # Here @values_list is a list of all the aliases for
16918 # the table. That is, all the property-values given
16919 # by this table. By agreement with Unicode::UCD,
16920 # if the name and full name are identical, and there
16921 # are no other names, drop the duplicate entry to save
16923 if (@values_list == 2
16924 && $values_list[0] eq $values_list[1])
16929 # To save memory, unlike the similar list for property
16930 # aliases above, only the standard forms have the list.
16931 # This forces an extra step of converting from input
16932 # name to standard name, but the savings are
16933 # considerable. (There is only marginal savings if we
16934 # did this with the property aliases.)
16935 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
16939 # Don't write out a mapping file if not desired.
16940 next if ! $property->to_output_map;
16943 # Here, we know we want to write out the table, but don't do it
16944 # yet because there may be other tables that come along and will
16945 # want to share the file, and the file's comments will change to
16946 # mention them. So save for later.
16947 push @writables, $table;
16949 } # End of looping through the property and all its tables.
16950 } # End of looping through all properties.
16952 # Now have all the tables that will have files written for them. Do it.
16953 foreach my $table (@writables) {
16956 my $property = $table->property;
16957 my $is_property = ($table == $property);
16958 if (! $is_property) {
16960 # Match tables for the property go in lib/$subdirectory, which is
16961 # the property's name. Don't use the standard file name for this,
16962 # as may get an unfamiliar alias
16963 @directory = ($matches_directory, $property->external_name);
16967 @directory = $table->directory;
16968 $filename = $table->file;
16971 # Use specified filename if available, or default to property's
16972 # shortest name. We need an 8.3 safe filename (which means "an 8
16973 # safe" filename, since after the dot is only 'pl', which is < 3)
16974 # The 2nd parameter is if the filename shouldn't be changed, and
16975 # it shouldn't iff there is a hard-coded name for this table.
16976 $filename = construct_filename(
16977 $filename || $table->external_name,
16978 ! $filename, # mutable if no filename
16981 register_file_for_name($table, \@directory, $filename);
16983 # Only need to write one file when shared by more than one
16985 next if ! $is_property
16986 && ($table->leader != $table || $table->complement != 0);
16988 # Construct a nice comment to add to the file
16989 $table->set_final_comment;
16995 # Write out the pod file
16998 # And Heavy.pl, Name.pm, UCD.pl
17003 make_property_test_script() if $make_test_script;
17004 make_normalization_test_script() if $make_norm_test_script;
17008 my @white_space_separators = ( # This used only for making the test script.
sub generate_separator($) {
    # This used only for making the test script.  It generates the colon or
    # equal separator between the property and property value, with random
    # white space surrounding the separator.
    #
    # The single parameter is the property name (the left-hand side).
    # Returns the empty string when that name is empty; otherwise returns
    # <white space> . ('=' or ':') . <white space>, where each white space
    # string is picked at random from @white_space_separators.

    my $lhs = shift;

    return "" if $lhs eq "";  # No separator if there's only one (the rhs) side

    # Choose space before and after randomly
    my $spaces_before = $white_space_separators[rand(@white_space_separators)];
    my $spaces_after  = $white_space_separators[rand(@white_space_separators)];

    # And return the whole complex, half the time using a colon, half the
    # equals.  The conditional has to be parenthesized:  '.' binds more
    # tightly than '?:', so without the parentheses the concatenation
    # "$spaces_before . (rand() < 0.5)" becomes the condition, and the
    # leading white space is never part of the result.
    return $spaces_before
            . ((rand() < 0.5) ? '=' : ':')
            . $spaces_after;
}
17035 sub generate_tests($$$$$) {
17036 # This used only for making the test script. It generates test cases that
17037 # are expected to compile successfully in perl. Note that the lhs and
17038 # rhs are assumed to already be as randomized as the caller wants.
#
# Takes five positional arguments, unpacked below.  Each generated test case
# is a line of text of the form
#   Expect(<0 or 1>, <code point>, '<pattern>', <warning>);
# NOTE(review): the declaration of @output and the statement that returns it
# are not visible in this excerpt.  Callers push this sub's result onto their
# own list, so it presumably returns the accumulated lines -- confirm.
17040 my $lhs = shift; # The property: what's to the left of the colon
17041 # or equals separator
17042 my $rhs = shift; # The property value; what's to the right
17043 my $valid_code = shift; # A code point that's known to be in the
17044 # table given by lhs=rhs; undef if table is
17046 my $invalid_code = shift; # A code point known to not be in the table;
17047 # undef if the table is all code points
17048 my $warning = shift;
17050 # Get the colon or equal
17051 my $separator = generate_separator($lhs);
17053 # The whole 'property=value'
17054 my $name = "$lhs$separator$rhs";
17057 # Create a complete set of tests, with complements.
# For a code point in the table, \p{name} and \P{^name} are given 1, while
# \p{^name} and \P{name} are given 0.  For a code point not in the table the
# 1s and 0s are swapped.  Either group is omitted when its code point is
# undef.
17058 if (defined $valid_code) {
17059 push @output, <<"EOC"
17060 Expect(1, $valid_code, '\\p{$name}', $warning);
17061 Expect(0, $valid_code, '\\p{^$name}', $warning);
17062 Expect(0, $valid_code, '\\P{$name}', $warning);
17063 Expect(1, $valid_code, '\\P{^$name}', $warning);
17066 if (defined $invalid_code) {
17067 push @output, <<"EOC"
17068 Expect(0, $invalid_code, '\\p{$name}', $warning);
17069 Expect(1, $invalid_code, '\\p{^$name}', $warning);
17070 Expect(1, $invalid_code, '\\P{$name}', $warning);
17071 Expect(0, $invalid_code, '\\P{^$name}', $warning);
17077 sub generate_error($$$) {
17078 # This used only for making the test script. It generates test cases that
17079 # are expected to not only not match, but to be syntax or similar errors
#
# Takes three positional arguments, unpacked below, and produces one
# Error(...) line for \p{} and one for \P{} using the same
# property/separator/value string.
17081 my $lhs = shift; # The property: what's to the left of the
17082 # colon or equals separator
17083 my $rhs = shift; # The property value; what's to the right
17084 my $already_in_error = shift; # Boolean; if true it's known that the
17085 # unmodified lhs and rhs will cause an error.
17086 # This routine should not force another one
17087 # Get the colon or equal
17088 my $separator = generate_separator($lhs);
17090 # Since this is an error only, don't bother to randomly decide whether to
17091 # put the error on the left or right side; and assume that the rhs is
17092 # loosely matched, again for convenience rather than rigor.
# The second argument, 'ERROR', is simply a true value:  randomize_loose_name()
# reads it as its $want_error flag.
17093 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
17095 my $property = $lhs . $separator . $rhs;
17098 Error('\\p{$property}');
17099 Error('\\P{$property}');
17103 # These are used only for making the test script
17104 # XXX Maybe should also have a bad strict seps, which includes underscore.
17106 my @good_loose_seps = (
17113 my @bad_loose_seps = (
17118 sub randomize_stricter_name {
17119 # This used only for making the test script. Take the input name and
17120 # return a randomized, but valid version of it under the stricter matching
#
# Numeric-looking names may get a leading '+' and leading zeros; every name
# is then split into sections (keeping the delimiters), runs of digits may
# get underscores inserted, and other sections get their case changed.  The
# sections are joined back together for the return value.
# NOTE(review): the lines that unpack the argument into $name and declare
# $sign, $number and @sections are not visible in this excerpt.
17124 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17126 # If the name looks like a number (integer, floating, or rational), do
17128 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
# NOTE(review): in the pattern above, $3 is the whole optional group (the
# separator character plus the digits after it, e.g. ".5" or "/3"), while $4
# is the separator character by itself.  With $3 used here, the comparison
# with '.' further down can never be true, so the trailing-zeros branch looks
# unreachable -- confirm whether $4 was intended.
17131 my $separator = $3;
17133 # If there isn't a sign, part of the time add a plus
17134 # Note: Not testing having any denominator having a minus sign
17136 $sign = '+' if rand() <= .3;
17139 # And add 0 or more leading zeros.
17140 $name = $sign . ('0' x int rand(10)) . $number;
17142 if (defined $separator) {
17143 my $extra_zeros = '0' x int rand(10);
17145 if ($separator eq '.') {
17147 # Similarly, add 0 or more trailing zeros after a decimal
17149 $name .= $extra_zeros;
17153 # Or, leading zeros before the denominator
17154 $name =~ s,/,/$extra_zeros,;
17159 # For legibility of the test, only change the case of whole sections at a
17160 # time. To do this, first split into sections. The split returns the
17163 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
17164 trace $section if main::DEBUG && $to_trace;
# A section longer than one character that contains no non-digit
17166 if (length $section > 1 && $section !~ /\D/) {
17168 # If the section is a sequence of digits, about half the time
17169 # randomly add underscores between some of them.
17172 # Figure out how many underscores to add. max is 1 less than
17173 # the number of digits. (But add 1 at the end to make sure
17174 # result isn't 0, and compensate earlier by subtracting 2
17176 my $num_underscores = int rand(length($section) - 2) + 1;
17178 # And add them evenly throughout, for convenience, not rigor
17180 my $spacing = (length($section) - 1)/ $num_underscores;
17181 my $temp = $section;
# The 4-argument substr() removes the leading $spacing characters from $temp
# and returns them; an underscore is appended after each such piece.
17183 for my $i (1 .. $num_underscores) {
17184 $section .= substr($temp, 0, $spacing, "") . '_';
17188 push @sections, $section;
17192 # Here not a sequence of digits. Change the case of the section
# One of four equally likely outcomes:  all upper case, all lower case, first
# character upper-cased, or left unchanged.
17194 my $switch = int rand(4);
17195 if ($switch == 0) {
17196 push @sections, uc $section;
17198 elsif ($switch == 1) {
17199 push @sections, lc $section;
17201 elsif ($switch == 2) {
17202 push @sections, ucfirst $section;
17205 push @sections, $section;
17209 trace "returning", join "", @sections if main::DEBUG && $to_trace;
17210 return join "", @sections;
17213 sub randomize_loose_name($;$) {
17214 # This used only for making the test script
#
# Randomizes a name via randomize_stricter_name(), then re-joins its parts
# with separators drawn at random from @good_loose_seps.  When the optional
# second argument is true, separators from @bad_loose_seps are used as well,
# so that the result is meant to be in error.
# NOTE(review): the lines that unpack $name, declare @parts, guard the
# separator insertion inside the loop, guard the final bad-separator step,
# and return the result are not visible in this excerpt.
17217 my $want_error = shift; # if true, make an error
17218 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17220 $name = randomize_stricter_name($name);
17223 push @parts, $good_loose_seps[rand(@good_loose_seps)];
17225 # Preserve trailing ones for the sake of not stripping the underscore from
17227 for my $part (split /[-\s_]+ (?= . )/, $name) {
# When an error is wanted, a bad separator is chosen about 30% of the time
17229 if ($want_error and rand() < 0.3) {
17230 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
17234 push @parts, $good_loose_seps[rand(@good_loose_seps)];
17237 push @parts, $part;
17239 my $new = join("", @parts);
17240 trace "$name => $new" if main::DEBUG && $to_trace;
# Put a bad separator at the end or at the beginning, with equal probability
17243 if (rand() >= 0.5) {
17244 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
17247 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
17253 # Used to make sure don't generate duplicate test cases.
17254 my %test_generated;
17256 sub make_property_test_script() {
17257 # This used only for making the test script
17258 # this written directly -- it's huge.
17260 print "Making test script\n" if $verbosity >= $PROGRESS;
17262 # This uses randomness to test different possibilities without testing all
17263 # possibilities. To ensure repeatability, set the seed to 0. But if
17264 # tests are added, it will perturb all later ones in the .t file
17267 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
17269 # Keep going down an order of magnitude
17270 # until find that adding this quantity to
17271 # 1 remains 1; but put an upper limit on
17272 # this so in case this algorithm doesn't
17273 # work properly on some platform, that we
17274 # won't loop forever.
17276 my $min_floating_slop = 1;
17277 while (1+ $min_floating_slop != 1
17280 my $next = $min_floating_slop / 10;
17281 last if $next == 0; # If underflows,
17283 $min_floating_slop = $next;
17286 # It doesn't matter whether the elements of this array contain single lines
17287 # or multiple lines. main::write doesn't count the lines.
17290 # Sort these so get results in same order on different runs of this
17292 foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
17293 foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
17295 # Find code points that match, and don't match this table.
17296 my $valid = $table->get_valid_code_point;
17297 my $invalid = $table->get_invalid_code_point;
17298 my $warning = ($table->status eq $DEPRECATED)
17302 # Test each possible combination of the property's aliases with
17303 # the table's. If this gets to be too many, could do what is done
17304 # in the set_final_comment() for Tables
17305 my @table_aliases = $table->aliases;
17306 my @property_aliases = $table->property->aliases;
17308 # Every property can optionally be prefixed by 'Is_', so test
17309 # that those work, by creating such a new alias for each
17310 # pre-existing one.
17311 push @property_aliases, map { Alias->new("Is_" . $_->name,
17313 $_->make_re_pod_entry,
17314 $_->ok_as_filename,
17318 } @property_aliases;
17319 my $max = max(scalar @table_aliases, scalar @property_aliases);
17320 for my $j (0 .. $max - 1) {
17322 # The current alias for property is the next one on the list,
17323 # or if beyond the end, start over. Similarly for table
17325 = $property_aliases[$j % @property_aliases]->name;
17327 $property_name = "" if $table->property == $perl;
17328 my $table_alias = $table_aliases[$j % @table_aliases];
17329 my $table_name = $table_alias->name;
17330 my $loose_match = $table_alias->loose_match;
17332 # If the table doesn't have a file, any test for it is
17333 # already guaranteed to be in error
17334 my $already_error = ! $table->file_path;
17336 # Generate error cases for this alias.
17337 push @output, generate_error($property_name,
17341 # If the table is guaranteed to always generate an error,
17342 # quit now without generating success cases.
17343 next if $already_error;
17345 # Now for the success cases.
17347 if ($loose_match) {
17349 # For loose matching, create an extra test case for the
17351 my $standard = standardize($table_name);
17353 # $test_name should be a unique combination for each test
17354 # case; used just to avoid duplicate tests
17355 my $test_name = "$property_name=$standard";
17357 # Don't output duplicate test cases.
17358 if (! exists $test_generated{$test_name}) {
17359 $test_generated{$test_name} = 1;
17360 push @output, generate_tests($property_name,
17367 $random = randomize_loose_name($table_name)
17369 else { # Stricter match
17370 $random = randomize_stricter_name($table_name);
17373 # Now for the main test case for this alias.
17374 my $test_name = "$property_name=$random";
17375 if (! exists $test_generated{$test_name}) {
17376 $test_generated{$test_name} = 1;
17377 push @output, generate_tests($property_name,
17384 # If the name is a rational number, add tests for the
17385 # floating point equivalent.
17386 if ($table_name =~ qr{/}) {
17388 # Calculate the float, and find just the fraction.
17389 my $float = eval $table_name;
17390 my ($whole, $fraction)
17391 = $float =~ / (.*) \. (.*) /x;
17393 # Starting with one digit after the decimal point,
17394 # create a test for each possible precision (number of
17395 # digits past the decimal point) until well beyond the
17396 # native number found on this machine. (If we started
17397 # with 0 digits, it would be an integer, which could
17398 # well match an unrelated table)
17400 for my $i (1 .. $min_floating_slop + 3) {
17401 my $table_name = sprintf("%.*f", $i, $float);
17402 if ($i < $MIN_FRACTION_LENGTH) {
17404 # If the test case has fewer digits than the
17405 # minimum acceptable precision, it shouldn't
17406 # succeed, so we expect an error for it.
17407 # E.g., 2/3 = .7 at one decimal point, and we
17408 # shouldn't say it matches .7. We should make
17409 # it be .667 at least before agreeing that the
17410 # intent was to match 2/3. But at the
17411 # less-than-acceptable level of precision, it
17412 # might actually match an unrelated number.
17413 # So don't generate a test case if this
17414 # conflating is possible. In our example, we
17415 # don't want 2/3 matching 7/10, if there is
17416 # a 7/10 code point.
17418 (keys %nv_floating_to_rational)
17421 if abs($table_name - $existing)
17422 < $MAX_FLOATING_SLOP;
17424 push @output, generate_error($property_name,
17426 1 # 1 => already an error
17431 # Here the number of digits exceeds the
17432 # minimum we think is needed. So generate a
17433 # success test case for it.
17434 push @output, generate_tests($property_name,
17452 (map {"Test_X('$_');\n"} @backslash_X_tests),
17457 sub make_normalization_test_script() {
17458 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17460 my $n_path = 'TestNorm.pl';
17462 unshift @normalization_tests, <<'END';
17466 sub ord_string { # Convert packed ords to printable string
17468 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17469 unpack "U*", shift) . "'";
17470 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
17474 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17475 my $display_source = ord_string($source);
17476 my $display_nfc = ord_string($nfc);
17477 my $display_nfd = ord_string($nfd);
17478 my $display_nfkc = ord_string($nfkc);
17479 my $display_nfkd = ord_string($nfkd);
17481 use Unicode::Normalize;
17483 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
17484 # nfkc == toNFC(nfkc) == toNFC(nfkd)
17487 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
17488 # nfkd == toNFD(nfkc) == toNFD(nfkd)
17491 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17492 # toNFKC(nfkc) == toNFKC(nfkd)
17495 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17496 # toNFKD(nfkc) == toNFKD(nfkd)
17498 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17499 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17500 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17501 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17502 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17504 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17505 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17506 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17507 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17508 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17510 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17511 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17512 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17513 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17514 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17516 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17517 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17518 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17519 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17520 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17527 @normalization_tests,
17533 # This is a list of the input files and how to handle them. The files are
17534 # processed in their order in this list. Some reordering is possible if
17535 # desired, but the v0 files should be first, and the extracted before the
17536 # others except DAge.txt (as data in an extracted file can be over-ridden by
17537 # the non-extracted). Some other files depend on data derived from an earlier
17538 # file, like UnicodeData requires data from Jamo, and the case changing and
17539 # folding requires data from Unicode. Mostly, it is safest to order by first
17540 # version releases in (except the Jamo). DAge.txt is read before the
17541 # extracted ones because of the rarely used feature $compare_versions. In the
17542 # unlikely event that there were ever an extracted file that contained the Age
17543 # property information, it would have to go in front of DAge.
17545 # The version strings allow the program to know whether to expect a file or
17546 # not, but if a file exists in the directory, it will be processed, even if it
17547 # is in a version earlier than expected, so you can copy files from a later
17548 # release into an earlier release's directory.
17549 my @input_file_objects = (
17550 Input_file->new('PropertyAliases.txt', v0,
17551 Handler => \&process_PropertyAliases,
17553 Input_file->new(undef, v0, # No file associated with this
17554 Progress_Message => 'Finishing property setup',
17555 Handler => \&finish_property_setup,
17557 Input_file->new('PropValueAliases.txt', v0,
17558 Handler => \&process_PropValueAliases,
17559 Has_Missings_Defaults => $NOT_IGNORED,
17561 Input_file->new('DAge.txt', v3.2.0,
17562 Has_Missings_Defaults => $NOT_IGNORED,
17565 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17566 Property => 'General_Category',
17568 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17569 Property => 'Canonical_Combining_Class',
17570 Has_Missings_Defaults => $NOT_IGNORED,
17572 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17573 Property => 'Numeric_Type',
17574 Has_Missings_Defaults => $NOT_IGNORED,
17576 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17577 Property => 'East_Asian_Width',
17578 Has_Missings_Defaults => $NOT_IGNORED,
17580 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17581 Property => 'Line_Break',
17582 Has_Missings_Defaults => $NOT_IGNORED,
17584 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17585 Property => 'Bidi_Class',
17586 Has_Missings_Defaults => $NOT_IGNORED,
17588 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17589 Property => 'Decomposition_Type',
17590 Has_Missings_Defaults => $NOT_IGNORED,
17592 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17593 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17594 Property => 'Numeric_Value',
17595 Each_Line_Handler => \&filter_numeric_value_line,
17596 Has_Missings_Defaults => $NOT_IGNORED,
17598 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17599 Property => 'Joining_Group',
17600 Has_Missings_Defaults => $NOT_IGNORED,
17603 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17604 Property => 'Joining_Type',
17605 Has_Missings_Defaults => $NOT_IGNORED,
17607 Input_file->new('Jamo.txt', v2.0.0,
17608 Property => 'Jamo_Short_Name',
17609 Each_Line_Handler => \&filter_jamo_line,
17611 Input_file->new('UnicodeData.txt', v1.1.5,
17612 Pre_Handler => \&setup_UnicodeData,
17614 # We clean up this file for some early versions.
17615 Each_Line_Handler => [ (($v_version lt v2.0.0 )
17617 : ($v_version eq v2.1.5)
17618 ? \&filter_v2_1_5_ucd
17620 # And for 5.14 Perls with 6.0,
17621 # have to also make changes
17622 : ($v_version ge v6.0.0
17627 # Early versions did not have the
17628 # proper Unicode_1 names for the
17630 (($v_version lt v3.0.0)
17631 ? \&filter_early_U1_names
17634 # Early versions did not correctly
17635 # use the later method for giving
17636 # decimal digit values
17637 (($v_version le v3.2.0)
17638 ? \&filter_bad_Nd_ucd
17641 # And the main filter
17642 \&filter_UnicodeData_line,
17644 EOF_Handler => \&EOF_UnicodeData,
17646 Input_file->new('ArabicShaping.txt', v2.0.0,
17647 Each_Line_Handler =>
17648 ($v_version lt 4.1.0)
17649 ? \&filter_old_style_arabic_shaping
17651 # The first field after the range is a "schematic name"
17653 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
17654 Has_Missings_Defaults => $NOT_IGNORED,
17656 Input_file->new('Blocks.txt', v2.0.0,
17657 Property => 'Block',
17658 Has_Missings_Defaults => $NOT_IGNORED,
17659 Each_Line_Handler => \&filter_blocks_lines
17661 Input_file->new('PropList.txt', v2.0.0,
17662 Each_Line_Handler => (($v_version lt v3.1.0)
17663 ? \&filter_old_style_proplist
17666 Input_file->new('Unihan.txt', v2.0.0,
17667 Pre_Handler => \&setup_unihan,
17669 Each_Line_Handler => \&filter_unihan_line,
17671 Input_file->new('SpecialCasing.txt', v2.1.8,
17672 Each_Line_Handler => ($v_version eq 2.1.8)
17673 ? \&filter_2_1_8_special_casing_line
17674 : \&filter_special_casing_line,
17675 Pre_Handler => \&setup_special_casing,
17676 Has_Missings_Defaults => $IGNORED,
17679 'LineBreak.txt', v3.0.0,
17680 Has_Missings_Defaults => $NOT_IGNORED,
17681 Property => 'Line_Break',
17682 # Early versions had problematic syntax
17683 Each_Line_Handler => (($v_version lt v3.1.0)
17684 ? \&filter_early_ea_lb
17687 Input_file->new('EastAsianWidth.txt', v3.0.0,
17688 Property => 'East_Asian_Width',
17689 Has_Missings_Defaults => $NOT_IGNORED,
17690 # Early versions had problematic syntax
17691 Each_Line_Handler => (($v_version lt v3.1.0)
17692 ? \&filter_early_ea_lb
17695 Input_file->new('CompositionExclusions.txt', v3.0.0,
17696 Property => 'Composition_Exclusion',
17698 Input_file->new('BidiMirroring.txt', v3.0.1,
17699 Property => 'Bidi_Mirroring_Glyph',
17700 Has_Missings_Defaults => ($v_version lt v6.2.0)
17702 # Is <none> which doesn't mean
17703 # anything to us, we will use the
17708 Input_file->new("NormTest.txt", v3.0.0,
17709 Handler => \&process_NormalizationsTest,
17710 Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
17712 Input_file->new('CaseFolding.txt', v3.0.1,
17713 Pre_Handler => \&setup_case_folding,
17714 Each_Line_Handler =>
17715 [ ($v_version lt v3.1.0)
17716 ? \&filter_old_style_case_folding
17718 \&filter_case_folding_line
17720 Has_Missings_Defaults => $IGNORED,
17722 Input_file->new('DCoreProperties.txt', v3.1.0,
17723 # 5.2 changed this file
17724 Has_Missings_Defaults => (($v_version ge v5.2.0)
17728 Input_file->new('Scripts.txt', v3.1.0,
17729 Property => 'Script',
17730 Has_Missings_Defaults => $NOT_IGNORED,
17732 Input_file->new('DNormalizationProps.txt', v3.1.0,
17733 Has_Missings_Defaults => $NOT_IGNORED,
17734 Each_Line_Handler => (($v_version lt v4.0.1)
17735 ? \&filter_old_style_normalization_lines
17738 Input_file->new('HangulSyllableType.txt', v0,
17739 Has_Missings_Defaults => $NOT_IGNORED,
17740 Property => 'Hangul_Syllable_Type',
17741 Pre_Handler => ($v_version lt v4.0.0)
17745 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
17746 Property => 'Word_Break',
17747 Has_Missings_Defaults => $NOT_IGNORED,
17749 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
17750 Property => 'Grapheme_Cluster_Break',
17751 Has_Missings_Defaults => $NOT_IGNORED,
17752 Pre_Handler => ($v_version lt v4.1.0)
17756 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
17757 Handler => \&process_GCB_test,
17759 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
17760 Skip => 'Validation Tests',
17762 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
17763 Skip => 'Validation Tests',
17765 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
17766 Skip => 'Validation Tests',
17768 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
17769 Property => 'Sentence_Break',
17770 Has_Missings_Defaults => $NOT_IGNORED,
17772 Input_file->new('NamedSequences.txt', v4.1.0,
17773 Handler => \&process_NamedSequences
17775 Input_file->new('NameAliases.txt', v0,
17776 Property => 'Name_Alias',
17777 Pre_Handler => ($v_version le v6.0.0)
17778 ? \&setup_early_name_alias
17780 Each_Line_Handler => ($v_version le v6.0.0)
17781 ? \&filter_early_version_name_alias_line
17782 : \&filter_later_version_name_alias_line,
17784 Input_file->new("BidiTest.txt", v5.2.0,
17785 Skip => 'Validation Tests',
17787 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
17789 Each_Line_Handler => \&filter_unihan_line,
17791 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
17793 Each_Line_Handler => \&filter_unihan_line,
17795 Input_file->new('UnihanIRGSources.txt', v5.2.0,
17797 Pre_Handler => \&setup_unihan,
17798 Each_Line_Handler => \&filter_unihan_line,
17800 Input_file->new('UnihanNumericValues.txt', v5.2.0,
17802 Each_Line_Handler => \&filter_unihan_line,
17804 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
17806 Each_Line_Handler => \&filter_unihan_line,
17808 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
17810 Each_Line_Handler => \&filter_unihan_line,
17812 Input_file->new('UnihanReadings.txt', v5.2.0,
17814 Each_Line_Handler => \&filter_unihan_line,
17816 Input_file->new('UnihanVariants.txt', v5.2.0,
17818 Each_Line_Handler => \&filter_unihan_line,
17820 Input_file->new('ScriptExtensions.txt', v6.0.0,
17821 Property => 'Script_Extensions',
17822 Pre_Handler => \&setup_script_extensions,
17823 Each_Line_Handler => \&filter_script_extensions_line,
17824 Has_Missings_Defaults => (($v_version le v6.0.0)
17828 # The two Indic files are actually available starting in v6.0.0, but their
17829 # property values are missing from PropValueAliases.txt in that release,
17830 # so that further work would have to be done to get them to work properly
17831 # for that release.
17832 Input_file->new('IndicMatraCategory.txt', v6.1.0,
17833 Property => 'Indic_Matra_Category',
17834 Has_Missings_Defaults => $NOT_IGNORED,
17835 Skip => "Provisional; for the analysis and processing of Indic scripts",
17837 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
17838 Property => 'Indic_Syllabic_Category',
17839 Has_Missings_Defaults => $NOT_IGNORED,
17840 Skip => "Provisional; for the analysis and processing of Indic scripts",
17842 Input_file->new('BidiBrackets.txt', v6.3.0,
17843 Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
17844 Has_Missings_Defaults => $NO_DEFAULTS,
17846 Input_file->new("BidiCharacterTest.txt", v6.3.0,
17847 Skip => 'Validation Tests',
17851 # End of all the preliminaries.
17854 if ($compare_versions) {
17855 Carp::my_carp(<<END
17856 Warning. \$compare_versions is set. Output is not suitable for production
17861 # Put into %potential_files a list of all the files in the directory structure
17862 # that could be inputs to this program, excluding those that we should ignore.
17863 # Use absolute file names because it makes it easier across machine types.
17864 my @ignored_files_full_names = map { File::Spec->rel2abs(
17865 internal_file_to_platform($_))
17866 } keys %ignored_files;
17869 return unless /\.txt$/i; # Some platforms change the name's case
17870 my $full = lc(File::Spec->rel2abs($_));
17871 $potential_files{$full} = 1
17872 if ! grep { $full eq lc($_) } @ignored_files_full_names;
17875 }, File::Spec->curdir());
17877 my @mktables_list_output_files;
17878 my $old_start_time = 0;
17879 my $old_options = "";
17881 if (! -e $file_list) {
17882 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
17883 $write_unchanged_files = 1;
17884 } elsif ($write_unchanged_files) {
17885 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
17888 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
17890 if (! open $file_handle, "<", $file_list) {
17891 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
17897 # Read and parse mktables.lst, placing the results from the first part
17898 # into @input, and the second part into @mktables_list_output_files
17899 for my $list ( \@input, \@mktables_list_output_files ) {
17900 while (<$file_handle>) {
17901 s/^ \s+ | \s+ $//xg;
17902 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
17903 $old_start_time = $1;
17906 if (/^ \s* \# \s* From\ options\ (.+) /x) {
17910 next if /^ \s* (?: \# .* )? $/x;
17912 my ( $file ) = split /\t/;
17913 push @$list, $file;
17915 @$list = uniques(@$list);
17919 # Look through all the input files
17920 foreach my $input (@input) {
17921 next if $input eq 'version'; # Already have checked this.
17923 # Ignore if it doesn't exist. The checking about whether we care or
17924 # not is done via the Input_file object.
17925 next if ! file_exists($input);
17927 # The paths are stored with relative names, and with '/' as the
17928 # delimiter; convert to absolute on this machine
17929 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
17930 $potential_files{lc $full} = 1
17931 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
17935 close $file_handle;
17940 # Here, we want to process all .txt files in the directory structure.
17941 # Convert them to full path names. They are stored in the platform's
17944 foreach my $object (@input_file_objects) {
17945 my $file = $object->file;
17946 next unless defined $file;
17947 push @known_files, File::Spec->rel2abs($file);
17950 my @unknown_input_files;
17951 foreach my $file (keys %potential_files) { # The keys are stored in lc
17952 next if grep { $file eq lc($_) } @known_files;
17954 # Here, the file is unknown to us. Get relative path name
17955 $file = File::Spec->abs2rel($file);
17956 push @unknown_input_files, $file;
17958 # What will happen is we create a data structure for it, and add it to
17959 # the list of input files to process. First get the subdirectories
17961 my (undef, $directories, undef) = File::Spec->splitpath($file);
17962 $directories =~ s;/$;;; # Can have extraneous trailing '/'
17963 my @directories = File::Spec->splitdir($directories);
17965 # If the file isn't extracted (meaning none of the directories is the
17966 # extracted one), just add it to the end of the list of inputs.
17967 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
17968 push @input_file_objects, Input_file->new($file, v0);
17972 # Here, the file is extracted. It needs to go ahead of most other
17973 # processing. Search for the first input file that isn't a
17974 # special required property (that is, find one whose first_release
17975 # is non-0), and isn't extracted. Also, the Age property file is
17976 # processed before the extracted ones, just in case
17977 # $compare_versions is set.
17978 for (my $i = 0; $i < @input_file_objects; $i++) {
17979 if ($input_file_objects[$i]->first_released ne v0
17980 && lc($input_file_objects[$i]->file) ne 'dage.txt'
17981 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
17983 splice @input_file_objects, $i, 0,
17984 Input_file->new($file, v0);
17991 if (@unknown_input_files) {
17992 print STDERR simple_fold(join_lines(<<END
17994 The following files are unknown as to how to handle. Assuming they are
17995 typical property files. You'll know by later error messages if it worked or
17998 ) . " " . join(", ", @unknown_input_files) . "\n\n");
18000 } # End of looking through directory structure for more .txt files.
18002 # Create the list of input files from the objects we have defined, plus
18004 my @input_files = qw(version Makefile);
18005 foreach my $object (@input_file_objects) {
18006 my $file = $object->file;
18007 next if ! defined $file; # Not all objects have files
18008 next if $object->optional && ! -e $file;
18009 push @input_files, $file;
18012 if ( $verbosity >= $VERBOSE ) {
18013 print "Expecting ".scalar( @input_files )." input files. ",
18014 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
18017 # We set $most_recent to be the most recently changed input file, including
18018 # this program itself (done much earlier in this file)
18019 foreach my $in (@input_files) {
18020 next unless -e $in; # Keep going even if missing a file
18021 my $mod_time = (stat $in)[9];
18022 $most_recent = $mod_time if $mod_time > $most_recent;
18024 # See that the input files have distinct names, to warn someone if they
18025 # are adding a new one
18027 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
18028 $directories =~ s;/$;;; # Can have extraneous trailing '/'
18029 my @directories = File::Spec->splitdir($directories);
18030 my $base = $file =~ s/\.txt$//;
18031 construct_filename($file, 'mutable', \@directories);
18035 # We use 'Makefile' just to see if it has changed since the last time we
18036 # rebuilt. Now discard it.
18037 @input_files = grep { $_ ne 'Makefile' } @input_files;
18039 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
18040 || ! scalar @mktables_list_output_files # or if no outputs known
18041 || $old_start_time < $most_recent # or out-of-date
18042 || $old_options ne $command_line_arguments; # or with different
18045 # Now we check to see if any output files are older than the youngest input;
18046 # if they are, we need to continue on; otherwise we can presumably bail.
18048 foreach my $out (@mktables_list_output_files) {
18049 if ( ! file_exists($out)) {
18050 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
18054 #local $to_trace = 1 if main::DEBUG;
18055 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
18056 if ( (stat $out)[9] <= $most_recent ) {
18057 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
18058 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
18065 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
18068 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
18070 # Ready to do the major processing. First create the perl pseudo-property.
18071 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
18073 # Process each input file
18074 foreach my $file (@input_file_objects) {
18078 # Finish the table generation.
18080 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
18083 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
18086 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
18087 add_perl_synonyms();
18089 print "Writing tables\n" if $verbosity >= $PROGRESS;
18090 write_all_tables();
18092 # Write mktables.lst
18093 if ( $file_list and $make_list ) {
18095 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
18096 foreach my $file (@input_files, @files_actually_output) {
18097 my (undef, $directories, $file) = File::Spec->splitpath($file);
18098 my @directories = File::Spec->splitdir($directories);
18099 $file = join '/', @directories, $file;
18103 if (! open $ofh,">",$file_list) {
18104 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
18108 my $localtime = localtime $start_time;
18109 print $ofh <<"END";
18111 # $file_list -- File list for $0.
18113 # Autogenerated starting on $start_time ($localtime)
18114 # From options $command_line_arguments
18116 # - First section is input files
18117 # ($0 itself is not listed but is automatically considered an input)
18118 # - Section separator is /^=+\$/
18119 # - Second section is a list of output files.
18120 # - Lines matching /^\\s*#/ are treated as comments
18121 # which along with blank lines are ignored.
18127 print $ofh "$_\n" for sort(@input_files);
18128 print $ofh "\n=================================\n# Output files:\n\n";
18129 print $ofh "$_\n" for sort @files_actually_output;
18130 print $ofh "\n# ",scalar(@input_files)," input files\n",
18131 "# ",scalar(@files_actually_output)+1," output files\n\n",
18134 or Carp::my_carp("Failed to close $ofh: $!");
18136 print "Filelist has ",scalar(@input_files)," input files and ",
18137 scalar(@files_actually_output)+1," output files\n"
18138 if $verbosity >= $VERBOSE;
18142 # Output these warnings unless -q explicitly specified.
18143 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
18144 if (@unhandled_properties) {
18145 print "\nProperties and tables that unexpectedly have no code points\n";
18146 foreach my $property (sort @unhandled_properties) {
18147 print $property, "\n";
18151 if (%potential_files) {
18152 print "\nInput files that are not considered:\n";
18153 foreach my $file (sort keys %potential_files) {
18154 print File::Spec->abs2rel($file), "\n";
18157 print "\nAll done\n" if $verbosity >= $VERBOSE;
18161 # TRAILING CODE IS USED BY make_property_test_script()
18167 # If run outside the normal test suite on an ASCII platform, you can
18168 # just create a latin1_to_native() function that just returns its
18169 # inputs, because that's the only function used from test.pl
18172 # Test qr/\X/ and the \p{} regular expression constructs. This file is
18173 # constructed by mktables from the tables it generates, so if mktables is
18174 # buggy, this won't necessarily catch those bugs. Tests are generated for all
18175 # feasible properties; a few aren't currently feasible; see
18176 # is_code_point_usable() in mktables for details.
18178 # Standard test packages are not used because this manipulates SIG_WARN. It
18179 # exits 0 if every non-skipped test succeeded; -1 if any failed.
18185 my $expected = shift;
18188 my $warning_type = shift; # Type of warning message, like 'deprecated'
18190 my $line = (caller)[2];
18192 # Convert the code point to hex form
18193 my $string = sprintf "\"\\x{%04X}\"", $ord;
18197 # The first time through, use all warnings. If the input should generate
18198 # a warning, add another time through with them turned off
18199 push @tests, "no warnings '$warning_type';" if $warning_type;
18201 foreach my $no_warnings (@tests) {
18203 # Store any warning messages instead of outputting them
18204 local $SIG{__WARN__} = $SIG{__WARN__};
18205 my $warning_message;
18206 $SIG{__WARN__} = sub { $warning_message = $_[0] };
18210 # A string eval is needed because of the 'no warnings'.
18211 # Assumes no parens in the regular expression
18212 my $result = eval "$no_warnings
18213 my \$RegObj = qr($regex);
18214 $string =~ \$RegObj ? 1 : 0";
18215 if (not defined $result) {
18216 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
18219 elsif ($result ^ $expected) {
18220 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
18223 elsif ($warning_message) {
18224 if (! $warning_type || ($warning_type && $no_warnings)) {
18225 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
18229 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
18232 elsif ($warning_type && ! $no_warnings) {
18233 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
18237 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
18246 if (eval { 'x' =~ qr/$regex/; 1 }) {
18248 my $line = (caller)[2];
18249 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
18252 my $line = (caller)[2];
18253 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
18258 # GCBTest.txt character that separates grapheme clusters
18259 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
18260 utf8::upgrade($breakable_utf8);
18262 # GCBTest.txt character that indicates that the adjoining code points are part
18263 # of the same grapheme cluster
18264 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
18265 utf8::upgrade($nobreak_utf8);
18268 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
18269 # Each such line is a sequence of code points given by their hex numbers,
18270 # separated by the two characters defined just before this subroutine that
18271 # indicate that either there can or cannot be a break between the adjacent
18272 # code points. If there isn't a break, that means the sequence forms an
18273 # extended grapheme cluster, which means that \X should match the whole
18274 # thing. If there is a break, \X should stop there. This is all
18275 # converted by this routine into a match:
18276 # $string =~ /(\X)/,
18277 # Each \X should match the next cluster; and that is what is checked.
18279 my $template = shift;
18281 my $line = (caller)[2];
18283 # The line contains characters above the ASCII range, but in Latin1. It
18284 # may or may not be in utf8, and if it is, it may or may not know it. So,
18285 # convert these characters to 8 bits. If it knows it is in utf8, simply
18287 if (utf8::is_utf8($template)) {
18288 utf8::downgrade($template);
18291 # Otherwise, if it is in utf8, but doesn't know it, the next lines
18292 # convert the two problematic characters to their 8-bit equivalents.
18293 # If it isn't in utf8, they don't harm anything.
18295 $template =~ s/$nobreak_utf8/$nobreak/g;
18296 $template =~ s/$breakable_utf8/$breakable/g;
18299 # Get rid of the leading and trailing breakables
18300 $template =~ s/^ \s* $breakable \s* //x;
18301 $template =~ s/ \s* $breakable \s* $ //x;
18303 # And no-breaks become just a space.
18304 $template =~ s/ \s* $nobreak \s* / /xg;
18306 # Split the input into segments that are breakable between them.
18307 my @segments = split /\s*$breakable\s*/, $template;
18310 my $display_string = "";
18312 my @should_display;
18314 # Convert the code point sequence in each segment into a Perl string of
18316 foreach my $segment (@segments) {
18317 my @code_points = split /\s+/, $segment;
18318 my $this_string = "";
18319 my $this_display = "";
18320 foreach my $code_point (@code_points) {
18321 $this_string .= latin1_to_native(chr(hex $code_point));
18322 $this_display .= "\\x{$code_point}";
18325 # The next cluster should match the string in this segment.
18326 push @should_match, $this_string;
18327 push @should_display, $this_display;
18328 $string .= $this_string;
18329 $display_string .= $this_display;
18332 # If a string can be represented in both non-utf8 and utf8, test both cases
18334 for my $to_upgrade (0 .. 1) {
18338 # If already in utf8, would just be a repeat
18339 next UPGRADE if utf8::is_utf8($string);
18341 utf8::upgrade($string);
18344 # Finally, do the \X match.
18345 my @matches = $string =~ /(\X)/g;
18347 # Look through each matched cluster to verify that it matches what we
18349 my $min = (@matches < @should_match) ? @matches : @should_match;
18350 for my $i (0 .. $min - 1) {
18352 if ($matches[$i] eq $should_match[$i]) {
18353 print "ok $Tests - ";
18355 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18357 print "And \\X #", $i + 1,
18359 print " correctly matched $should_display[$i]; line $line\n";
18361 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18362 unpack("U*", $matches[$i]));
18363 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18365 " should have matched $should_display[$i]",
18366 " but instead matched $matches[$i]",
18367 ". Abandoning rest of line $line\n";
18372 # And the number of matches should equal the number of expected matches.
18374 if (@matches == @should_match) {
18375 print "ok $Tests - Nothing was left over; line $line\n";
18377 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18385 print "1..$Tests\n";
18386 exit($Fails ? -1 : 0);
18389 Error('\p{Script=InGreek}'); # Bug #69018
18390 Test_X("1100 $nobreak 1161"); # Bug #70940
18391 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18392 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18393 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726