3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr = do { no overloading; pack 'J', $self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted the change to main::objaddr, and subsequent commits that
17 # changed 0+$self to pack 'J', $self.)
20 BEGIN { # Get the time the script started running; do it at compilation to
21 # get it as close as possible
36 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
39 sub NON_ASCII_PLATFORM { ord("A") != 65 }
41 ##########################################################################
43 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
44 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
45 # a pod file and .t files, depending on option parameters.
47 # The structure of this file is:
48 # First these introductory comments; then
49 # code needed for everywhere, such as debugging stuff; then
50 # code to handle input parameters; then
51 # data structures likely to be of external interest (some of which depend on
52 # the input parameters, so follows them; then
53 # more data structures and subroutine and package (class) definitions; then
54 # the small actual loop to process the input files and finish up; then
55 # a __DATA__ section, for the .t tests
57 # This program works on all releases of Unicode so far. The outputs have been
58 # scrutinized most intently for release 5.1. The others have been checked for
59 # somewhat more than just sanity. It can handle all non-provisional Unicode
60 # character properties in those releases.
62 # This program is mostly about Unicode character (or code point) properties.
63 # A property describes some attribute or quality of a code point, like if it
64 # is lowercase or not, its name, what version of Unicode it was first defined
65 # in, or what its uppercase equivalent is. Unicode deals with these disparate
66 # possibilities by making all properties into mappings from each code point
67 # into some corresponding value. In the case of it being lowercase or not,
68 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
69 # property maps each Unicode code point to a single value, called a "property
70 # value". (Some more recently defined properties, map a code point to a set
73 # When using a property in a regular expression, what is desired isn't the
74 # mapping of the code point to its property's value, but the reverse (or the
75 # mathematical "inverse relation"): starting with the property value, "Does a
76 # code point map to it?" These are written in a "compound" form:
77 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
78 # files containing the lists of code points that map to each such regular
79 # expression property value, one file per list
81 # There is also a single form shortcut that Perl adds for many of the commonly
82 # used properties. This happens for all binary properties, plus script,
83 # general_category, and block properties.
85 # Thus the outputs of this program are files. There are map files, mostly in
86 # the 'To' directory; and there are list files for use in regular expression
87 # matching, all in subdirectories of the 'lib' directory, with each
88 # subdirectory being named for the property that the lists in it are for.
89 # Bookkeeping, test, and documentation files are also generated.
91 my $matches_directory = 'lib'; # Where match (\p{}) files go.
92 my $map_directory = 'To'; # Where map files go.
96 # The major data structures of this program are Property, of course, but also
97 # Table. There are two kinds of tables, very similar to each other.
98 # "Match_Table" is the data structure giving the list of code points that have
99 # a particular property value, mentioned above. There is also a "Map_Table"
100 # data structure which gives the property's mapping from code point to value.
101 # There are two structures because the match tables need to be combined in
102 # various ways, such as constructing unions, intersections, complements, etc.,
103 # and the map ones don't. And there would be problems, perhaps subtle, if
104 # a map table were inadvertently operated on in some of those ways.
105 # The use of separate classes with operations defined on one but not the other
106 # prevents accidentally confusing the two.
108 # At the heart of each table's data structure is a "Range_List", which is just
109 # an ordered list of "Ranges", plus ancillary information, and methods to
110 # operate on them. A Range is a compact way to store property information.
111 # Each range has a starting code point, an ending code point, and a value that
112 # is meant to apply to all the code points between the two end points,
113 # inclusive. For a map table, this value is the property value for those
114 # code points. Two such ranges could be written like this:
115 # 0x41 .. 0x5A, 'Upper',
116 # 0x61 .. 0x7A, 'Lower'
118 # Each range also has a type used as a convenience to classify the values.
119 # Most ranges in this program will be Type 0, or normal, but there are some
120 # ranges that have a non-zero type. These are used only in map tables, and
121 # are for mappings that don't fit into the normal scheme of things. Mappings
122 # that require a hash entry to communicate with utf8.c are one example;
123 # another example is mappings for charnames.pm to use which indicate a name
124 # that is algorithmically determinable from its code point (and the reverse).
125 # These are used to significantly compact these tables, instead of listing
126 # each one of the tens of thousands individually.
128 # In a match table, the value of a range is irrelevant (and hence the type as
129 # well, which will always be 0), and arbitrarily set to the null string.
130 # Using the example above, there would be two match tables for those two
131 # entries, one named Upper would contain the 0x41..0x5A range, and the other
132 # named Lower would contain 0x61..0x7A.
134 # Actually, there are two types of range lists, "Range_Map" is the one
135 # associated with map tables, and "Range_List" with match tables.
136 # Again, this is so that methods can be defined on one and not the others so
137 # as to prevent operating on them in incorrect ways.
139 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
140 # in the perl core. All tables could in theory be written, but some are
141 # suppressed because there is no current practical use for them. It is easy
142 # to change which get written by changing various lists that are near the top
143 # of the actual code in this file. The table data structures contain enough
144 # ancillary information to allow them to be treated as separate entities for
145 # writing, such as the path to each one's file. There is a heading in each
146 # map table that gives the format of its entries, and what the map is for all
147 # the code points missing from it. (This allows tables to be more compact.)
149 # The Property data structure contains one or more tables. All properties
150 # contain a map table (except the $perl property which is a
151 # pseudo-property containing only match tables), and any properties that
152 # are usable in regular expression matches also contain various matching
153 # tables, one for each value the property can have. A binary property can
154 # have two values, True and False (or Y and N, which are preferred by Unicode
155 # terminology). Thus each of these properties will have a map table that
156 # takes every code point and maps it to Y or N (but having ranges cuts the
157 # number of entries in that table way down), and two match tables, one
158 # which has a list of all the code points that map to Y, and one for all the
159 # code points that map to N. (For each binary property, a third table is also
160 # generated for the pseudo Perl property. It contains the identical code
161 # points as the Y table, but can be written in regular expressions, not in the
162 # compound form, but in a "single" form like \p{IsUppercase}.) Many
163 # properties are binary, but some properties have several possible values,
164 # some have many, and properties like Name have a different value for every
165 # named code point. Those will not, unless the controlling lists are changed,
166 # have their match tables written out. But all the ones which can be used in
167 # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
168 # a property would have either its map table or its match tables written but
169 # not both. Again, what gets written is controlled by lists which can easily
170 # be changed. Starting in 5.14, advantage was taken of this, and all the map
171 # tables needed to reconstruct the Unicode db are now written out, while
172 # suppressing the Unicode .txt files that contain the data. Our tables are
173 # much more compact than the .txt files, so a significant space savings was
174 # achieved. Also, tables are not written out that are trivially derivable
175 # from tables that do get written. So, there typically is no file containing
176 # the code points not matched by a binary property (the table for \P{} versus
177 # lowercase \p{}), since you just need to invert the True table to get the
180 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
181 # how many match tables there are and the content of the maps. This 'Type' is
182 # different than a range 'Type', so don't get confused by the two concepts
183 # having the same name.
185 # For information about the Unicode properties, see Unicode's UAX44 document:
187 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
189 # As stated earlier, this program will work on any release of Unicode so far.
190 # Most obvious problems in earlier data have NOT been corrected except when
191 # necessary to make Perl or this program work reasonably, and to keep out
192 # potential security issues. For example, no folding information was given in
193 # early releases, so this program substitutes lower case instead, just so that
194 # a regular expression with the /i option will do something that actually
195 # gives the right results in many cases. There are also a couple other
196 # corrections for version 1.1.5, commented at the point they are made. As an
197 # example of corrections that weren't made (but could be) is this statement
198 # from DerivedAge.txt: "The supplementary private use code points and the
199 # non-character code points were assigned in version 2.0, but not specifically
200 # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
201 # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
202 # further down in these introductory comments.
204 # This program works on all non-provisional properties as of the current
205 # Unicode release, though the files for some are suppressed for various
206 # reasons. You can change which are output by changing lists in this program.
208 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
209 # loose matchings rules (from Unicode TR18):
211 # The recommended names for UCD properties and property values are in
212 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
213 # [PropValue]. There are both abbreviated names and longer, more
214 # descriptive names. It is strongly recommended that both names be
215 # recognized, and that loose matching of property names be used,
216 # whereby the case distinctions, whitespace, hyphens, and underbar
219 # The program still allows Fuzzy to override its determination of if loose
220 # matching should be used, but it isn't currently used, as it is no longer
221 # needed; the calculations it makes are good enough.
223 # SUMMARY OF HOW IT WORKS:
227 # A list is constructed containing each input file that is to be processed
229 # Each file on the list is processed in a loop, using the associated handler
231 # The PropertyAliases.txt and PropValueAliases.txt files are processed
232 # first. These files name the properties and property values.
233 # Objects are created of all the property and property value names
234 # that the rest of the input should expect, including all synonyms.
235 # The other input files give mappings from properties to property
236 # values. That is, they list code points and say what the mapping
237 # is under the given property. Some files give the mappings for
238 # just one property; and some for many. This program goes through
239 # each file and populates the properties and their map tables from
240 # them. Some properties are listed in more than one file, and
241 # Unicode has set up a precedence as to which has priority if there
242 # is a conflict. Thus the order of processing matters, and this
243 # program handles the conflict possibility by processing the
244 # overriding input files last, so that if necessary they replace
246 # After this is all done, the program creates the property mappings not
247 # furnished by Unicode, but derivable from what it does give.
248 # The tables of code points that match each property value in each
249 # property that is accessible by regular expressions are created.
250 # The Perl-defined properties are created and populated. Many of these
251 # require data determined from the earlier steps
252 # Any Perl-defined synonyms are created, and name clashes between Perl
253 # and Unicode are reconciled and warned about.
254 # All the properties are written to files
255 # Any other files are written, and final warnings issued.
257 # For clarity, a number of operators have been overloaded to work on tables:
258 # ~ means invert (take all characters not in the set). The more
259 # conventional '!' is not used because of the possibility of confusing
260 # it with the actual boolean operation.
262 # - means subtraction
263 # & means intersection
264 # The precedence of these is the order listed. Parentheses should be
265 # copiously used. These are not a general scheme. The operations aren't
266 # defined for a number of things, deliberately, to avoid getting into trouble.
267 # Operations are done on references and affect the underlying structures, so
268 # that the copy constructors for them have been overloaded to not return a new
269 # clone, but the input object itself.
271 # The bool operator is deliberately not overloaded to avoid confusion with
272 # "should it mean if the object merely exists, or also is non-empty?".
274 # WHY CERTAIN DESIGN DECISIONS WERE MADE
276 # This program needs to be able to run under miniperl. Therefore, it uses a
277 # minimum of other modules, and hence implements some things itself that could
278 # be gotten from CPAN
280 # This program uses inputs published by the Unicode Consortium. These can
281 # change incompatibly between releases without the Perl maintainers realizing
282 # it. Therefore this program is now designed to try to flag these. It looks
283 # at the directories where the inputs are, and flags any unrecognized files.
284 # It keeps track of all the properties in the files it handles, and flags any
285 # that it doesn't know how to handle. It also flags any input lines that
286 # don't match the expected syntax, among other checks.
288 # It is also designed so if a new input file matches one of the known
289 # templates, one hopefully just needs to add it to a list to have it
292 # As mentioned earlier, some properties are given in more than one file. In
293 # particular, the files in the extracted directory are supposedly just
294 # reformattings of the others. But they contain information not easily
295 # derivable from the other files, including results for Unihan, which this
296 # program doesn't ordinarily look at, and for unassigned code points. They
297 # also have historically had errors or been incomplete. In an attempt to
298 # create the best possible data, this program thus processes them first to
299 # glean information missing from the other files; then processes those other
300 # files to override any errors in the extracted ones. Much of the design was
301 # driven by this need to store things and then possibly override them.
303 # It tries to keep fatal errors to a minimum, to generate something usable for
304 # testing purposes. It always looks for files that could be inputs, and will
305 # warn about any that it doesn't know how to handle (the -q option suppresses
308 # Why is there more than one type of range?
309 # This simplified things. There are some very specialized code points that
310 # have to be handled specially for output, such as Hangul syllable names.
311 # By creating a range type (done late in the development process), it
312 # allowed this to be stored with the range, and overridden by other input.
313 # Originally these were stored in another data structure, and it became a
314 # mess trying to decide if a second file that was for the same property was
315 # overriding the earlier one or not.
317 # Why are there two kinds of tables, match and map?
318 # (And there is a base class shared by the two as well.) As stated above,
319 # they actually are for different things. Development proceeded much more
320 # smoothly when I (khw) realized the distinction. Map tables are used to
321 # give the property value for every code point (actually every code point
322 # that doesn't map to a default value). Match tables are used for regular
323 # expression matches, and are essentially the inverse mapping. Separating
324 # the two allows more specialized methods, and error checks so that one
325 # can't just take the intersection of two map tables, for example, as that
328 # What about 'fate' and 'status'. The concept of a table's fate was created
329 # late when it became clear that something more was needed. The difference
330 # between this and 'status' is unclean, and could be improved if someone
331 # wanted to spend the effort.
335 # This program is written so it will run under miniperl. Occasionally changes
336 # will cause an error where the backtrace doesn't work well under miniperl.
337 # To diagnose the problem, you can instead run it under regular perl, if you
340 # There is a good trace facility. To enable it, first sub DEBUG must be set
341 # to return true. Then a line like
343 # local $to_trace = 1 if main::DEBUG;
345 # can be added to enable tracing in its lexical scope (plus dynamic) or until
346 # you insert another line:
348 # local $to_trace = 0 if main::DEBUG;
350 # To actually trace, use a line like "trace $a, @b, %c, ...;
352 # Some of the more complex subroutines already have trace statements in them.
353 # Permanent trace statements should be like:
355 # trace ... if main::DEBUG && $to_trace;
357 # If there is just one or a few files that you're debugging, you can easily
358 # cause most everything else to be skipped. Change the line
360 # my $debug_skip = 0;
362 # to 1, and every file whose object is in @input_file_objects and doesn't have
363 # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
364 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
366 # To compare the output tables, it may be useful to specify the -annotate
367 # flag. (As of this writing, this can't be done on a clean workspace, due to
368 # requirements in Text::Tabs used in this option; so first run mktables
369 # without this option.) This option adds comment lines to each table, one for
370 # each non-algorithmically named character giving, currently its code point,
371 # name, and graphic representation if printable (and you have a font that
372 # knows about it). This makes it easier to see what the particular code
373 # points are in each output table. Non-named code points are annotated with a
374 # description of their status, and contiguous ones with the same description
375 # will be output as a range rather than individually. Algorithmically named
376 # characters are also output as ranges, except when there are just a few
381 # The program would break if Unicode were to change its names so that
382 # interior white space, underscores, or dashes differences were significant
383 # within property and property value names.
385 # It might be easier to use the xml versions of the UCD if this program ever
386 # would need heavy revision, and the ability to handle old versions was not
389 # There is the potential for name collisions, in that Perl has chosen names
390 # that Unicode could decide it also likes. There have been such collisions in
391 # the past, with mostly Perl deciding to adopt the Unicode definition of the
392 # name. However in the 5.2 Unicode beta testing, there were a number of such
393 # collisions, which were withdrawn before the final release, because of Perl's
394 # and other's protests. These all involved new properties which began with
395 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
396 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
397 # Unicode document, so they are unlikely to be used by Unicode for another
398 # purpose. However, they might try something beginning with 'In', or use any
399 # of the other Perl-defined properties. This program will warn you of name
400 # collisions, and refuse to generate tables with them, but manual intervention
401 # will be required in this event. One scheme that could be implemented, if
402 # necessary, would be to have this program generate another file, or add a
403 # field to mktables.lst that gives the date of first definition of a property.
404 # Each new release of Unicode would use that file as a basis for the next
405 # iteration. And the Perl synonym addition code could sort based on the age
406 # of the property, so older properties get priority, and newer ones that clash
407 # would be refused; hence existing code would not be impacted, and some other
408 # synonym would have to be used for the new property. This is ugly, and
409 # manual intervention would certainly be easier to do in the short run; lets
410 # hope it never comes to this.
414 # This program can generate tables from the Unihan database. But it doesn't
415 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
416 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
417 # database was split into 8 different files, all beginning with the letters
418 # 'Unihan'. This program will read those file(s) if present, but it needs to
419 # know which of the many properties in the file(s) should have tables created
420 # for them. It will create tables for any properties listed in
421 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
422 # @cjk_properties array and the @cjk_property_values array. Thus, if a
423 # property you want is not in those files of the release you are building
424 # against, you must add it to those two arrays. Starting in 4.0, the
425 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
426 # is present in the directory, a table will be generated for that property.
427 # In 5.2, several more properties were added. For your convenience, the two
428 # arrays are initialized with all the 6.0 listed properties that are also in
429 # earlier releases. But these are commented out. You can just uncomment the
430 # ones you want, or use them as a template for adding entries for other
433 # You may need to adjust the entries to suit your purposes. setup_unihan(),
434 # and filter_unihan_line() are the functions where this is done. This program
435 # already does some adjusting to make the lines look more like the rest of the
436 # Unicode DB; You can see what that is in filter_unihan_line()
438 # There is a bug in the 3.2 data file in which some values for the
439 # kPrimaryNumeric property have commas and an unexpected comment. A filter
440 # could be added for these; or for a particular installation, the Unihan.txt
441 # file could be edited to fix them.
443 # HOW TO ADD A FILE TO BE PROCESSED
445 # A new file from Unicode needs to have an object constructed for it in
446 # @input_file_objects, probably at the end or at the end of the extracted
447 # ones. The program should warn you if its name will clash with others on
448 # restrictive file systems, like DOS. If so, figure out a better name, and
449 # add lines to the README.perl file giving that. If the file is a character
450 # property, it should be in the format that Unicode has implicitly
451 # standardized for such files for the more recently introduced ones.
452 # If so, the Input_file constructor for @input_file_objects can just be the
453 # file name and release it first appeared in. If not, then it should be
454 # possible to construct an each_line_handler() to massage the line into the
457 # For non-character properties, more code will be needed. You can look at
458 # the existing entries for clues.
460 # UNICODE VERSIONS NOTES
462 # The Unicode UCD has had a number of errors in it over the versions. And
463 # these remain, by policy, in the standard for that version. Therefore it is
464 # risky to correct them, because code may be expecting the error. So this
465 # program doesn't generally make changes, unless the error breaks the Perl
466 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
467 # for U+1105, which causes real problems for the algorithms for Jamo
468 # calculations, so it is changed here.
470 # But it isn't so clear cut as to what to do about concepts that are
471 # introduced in a later release; should they extend back to earlier releases
472 # where the concept just didn't exist? It was easier to do this than to not,
473 # so that's what was done. For example, the default value for code points not
474 # in the files for various properties was probably undefined until changed by
475 # some version. No_Block for blocks is such an example. This program will
476 # assign No_Block even in Unicode versions that didn't have it. This has the
477 # benefit that code being written doesn't have to special case earlier
478 # versions; and the detriment that it doesn't match the Standard precisely for
479 # the affected versions.
481 # Here are some observations about some of the issues in early versions:
483 # Prior to version 3.0, there were 3 character decompositions. These are not
484 # handled by Unicode::Normalize, nor will it compile when presented a version
485 # that has them. However, you can trivially get it to compile by simply
486 # ignoring those decompositions, by changing the croak to a carp. At the time
487 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
489 # croak("Weird Canonical Decomposition of U+$h");
491 # Simply change to a carp. It will compile, but will not know about any three
492 # character decomposition.
494 # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
495 # that the reason is that the CJK block starting at 4E00 was removed from
496 # PropList, and was not put back in until 3.1.0. The Perl extension (the
497 # single property name \p{alpha}) has the correct values. But the compound
498 # form is simply not generated until 3.1, as it can be argued that prior to
499 # this release, this was not an official property. The comments for
500 # filter_old_style_proplist() give more details.
502 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
503 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
504 # reason is that 3.2 introduced U+205F=medium math space, which was not
505 # classed as white space, but Perl figured out that it should have been. 4.0
506 # reclassified it correctly.
508 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
509 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
510 # became 202, and ATBL was left with no code points, as all the ones that
511 # mapped to 202 stayed mapped to 202. Thus if your program used the numeric
512 # name for the class, it would not have been affected, but if it used the
513 # mnemonic, it would have been.
515 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
516 # points which eventually came to have this script property value, instead
517 # mapped to "Unknown". But in the next release all these code points were
518 # moved to \p{sc=common} instead.
520 # The default for missing code points for BidiClass is complicated. Starting
521 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
522 # tries to do the best it can for earlier releases. It is done in
523 # process_PropertyAliases()
525 # In version 2.1.2, the entry in UnicodeData.txt:
526 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
528 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
529 # Without this change, there are casing problems for this character.
531 # Search for $string_compare_versions to see how to compare changes to
532 # properties between Unicode versions
534 ##############################################################################
536 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
538 my $MAX_LINE_WIDTH = 78;
540 # Debugging aid to skip most files so as to not be distracted by them when
541 # concentrating on the ones being debugged. Add
543 # to the constructor for those files you want processed when you set this.
544 # Files with a first version number of 0 are special: they are always
545 # processed regardless of the state of this flag. Generally, Jamo.txt and
546 # UnicodeData.txt must not be skipped if you want this program to not die
547 # before normal completion.
551 # Normally these are suppressed.
552 my $write_Unicode_deprecated_tables = 0;
554 # Set to 1 to enable tracing.
557 { # Closure for trace: debugging aid
558 my $print_caller = 1; # ? Include calling subroutine name
559 my $main_with_colon = 'main::';
560 my $main_colon_length = length($main_with_colon);
563 return unless $to_trace; # Do nothing if global flag not set
567 local $DB::trace = 0;
568 $DB::trace = 0; # Quiet 'used only once' message
572 # Loop looking up the stack to get the first non-trace caller
577 $line_number = $caller_line;
578 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
579 $caller = $main_with_colon unless defined $caller;
581 $caller_name = $caller;
584 $caller_name =~ s/.*:://;
585 if (substr($caller_name, 0, $main_colon_length)
588 $caller_name = substr($caller_name, $main_colon_length);
591 } until ($caller_name ne 'trace');
593 # If the stack was empty, we were called from the top level
594 $caller_name = 'main' if ($caller_name eq ""
595 || $caller_name eq 'trace');
598 foreach my $string (@input) {
599 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
600 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
601 $output .= simple_dumper($string);
604 $string = "$string" if ref $string;
605 $string = $UNDEF unless defined $string;
607 $string = '""' if $string eq "";
608 $output .= " " if $output ne ""
610 && substr($output, -1, 1) ne " "
611 && substr($string, 0, 1) ne " ";
616 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
617 print STDERR "$caller_name: " if $print_caller;
618 print STDERR $output, "\n";
623 # This is for a rarely used development feature that allows you to compare two
624 # versions of the Unicode standard without having to deal with changes caused
625 # by the code points introduced in the later version. Change the 0 to a
626 # string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
627 # code points introduced in that release and earlier will be used; later ones
628 # are thrown away. You use the version number of the earliest one you want to
629 # compare; then run this program on directory structures containing each
630 # release, and compare the outputs. These outputs will therefore include only
631 # the code points common to both releases, and you can see the changes caused
632 # just by the underlying release semantic changes. For versions earlier than
633 # 3.2, you must copy a version of DAge.txt into the directory.
634 my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
635 my $compare_versions = DEBUG
636 && $string_compare_versions
637 && pack "C*", split /\./, $string_compare_versions;
640 # Returns non-duplicated input values. From "Perl Best Practices:
641 # Encapsulated Cleverness". p. 455 in first edition.
644 # Arguably this breaks encapsulation, if the goal is to permit multiple
645 # distinct objects to stringify to the same value, and be interchangeable.
646 # However, for this program, no two objects stringify identically, and all
647 # lists passed to this function are either objects or strings. So this
648 # doesn't affect correctness, but it does give a couple of percent speedup.
650 return grep { ! $seen{$_}++ } @_;
653 $0 = File::Spec->canonpath($0);
655 my $make_test_script = 0; # ? Should we output a test script
656 my $make_norm_test_script = 0; # ? Should we output a normalization test script
657 my $write_unchanged_files = 0; # ? Should we update the output files even if
658 # we don't think they have changed
659 my $use_directory = ""; # ? Should we chdir somewhere.
660 my $pod_directory; # input directory to store the pod file.
661 my $pod_file = 'perluniprops';
662 my $t_path; # Path to the .t test file
663 my $file_list = 'mktables.lst'; # File to store input and output file names.
664 # This is used to speed up the build, by not
665 # executing the main body of the program if
666 # nothing on the list has changed since the
668 my $make_list = 1; # ? Should we write $file_list. Set to always
669 # make a list so that when the pumpking is
670 # preparing a release, s/he won't have to do
672 my $glob_list = 0; # ? Should we try to include unknown .txt files
674 my $output_range_counts = $debugging_build; # ? Should we include the number
675 # of code points in ranges in
677 my $annotate = 0; # ? Should character names be in the output
679 # Verbosity levels; 0 is quiet
680 my $NORMAL_VERBOSITY = 1;
684 my $verbosity = $NORMAL_VERBOSITY;
686 # Stored in mktables.lst so that if this program is called with different
687 # options, will regenerate even if the files otherwise look like they're
689 my $command_line_arguments = join " ", @ARGV;
693 my $arg = shift @ARGV;
695 $verbosity = $VERBOSE;
697 elsif ($arg eq '-p') {
698 $verbosity = $PROGRESS;
699 $| = 1; # Flush buffers as we go.
701 elsif ($arg eq '-q') {
704 elsif ($arg eq '-w') {
705 $write_unchanged_files = 1; # update the files even if havent changed
707 elsif ($arg eq '-check') {
708 my $this = shift @ARGV;
709 my $ok = shift @ARGV;
711 print "Skipping as check params are not the same.\n";
715 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
716 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
718 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
720 $make_test_script = 1;
722 elsif ($arg eq '-makenormtest')
724 $make_norm_test_script = 1;
726 elsif ($arg eq '-makelist') {
729 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
730 -d $use_directory or croak "Unknown directory '$use_directory'";
732 elsif ($arg eq '-L') {
734 # Existence not tested until have chdir'd
737 elsif ($arg eq '-globlist') {
740 elsif ($arg eq '-c') {
741 $output_range_counts = ! $output_range_counts
743 elsif ($arg eq '-annotate') {
745 $debugging_build = 1;
746 $output_range_counts = 1;
750 $with_c .= 'out' if $output_range_counts; # Complements the state
752 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
753 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
755 -c : Output comments $with_c number of code points in ranges
756 -q : Quiet Mode: Only output serious warnings.
757 -p : Set verbosity level to normal plus show progress.
758 -v : Set Verbosity level high: Show progress and non-serious
760 -w : Write files regardless
761 -C dir : Change to this directory before proceeding. All relative paths
762 except those specified by the -P and -T options will be done
763 with respect to this directory.
764 -P dir : Output $pod_file file to directory 'dir'.
765 -T path : Create a test script as 'path'; overrides -maketest
766 -L filelist : Use alternate 'filelist' instead of standard one
767 -globlist : Take as input all non-Test *.txt files in current and sub
769 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
771 -makelist : Rewrite the file list $file_list based on current setup
772 -annotate : Output an annotation for each character in the table files;
773 useful for debugging mktables, looking at diffs; but is slow
775 -check A B : Executes $0 only if A and B are the same
780 # Stores the most-recently changed file. If none have changed, can skip the
782 my $most_recent = (stat $0)[9]; # Do this before the chdir!
784 # Change directories now, because need to read 'version' early.
785 if ($use_directory) {
786 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
787 $pod_directory = File::Spec->rel2abs($pod_directory);
789 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
790 $t_path = File::Spec->rel2abs($t_path);
792 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
793 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
794 $pod_directory = File::Spec->abs2rel($pod_directory);
796 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
797 $t_path = File::Spec->abs2rel($t_path);
801 # Get Unicode version into regular and v-string. This is done now because
802 # various tables below get populated based on it. These tables are populated
803 # here to be near the top of the file, and so easily seeable by those needing
805 open my $VERSION, "<", "version"
806 or croak "$0: can't open required file 'version': $!\n";
807 my $string_version = <$VERSION>;
809 chomp $string_version;
810 my $v_version = pack "C*", split /\./, $string_version; # v string
812 # The following are the complete names of properties with property values that
813 # are known to not match any code points in some versions of Unicode, but that
814 # may change in the future so they should be matchable, hence an empty file is
815 # generated for them.
816 my @tables_that_may_be_empty;
817 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
818 if $v_version lt v6.3.0;
819 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
820 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
821 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
822 if $v_version ge v4.1.0;
823 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
824 if $v_version ge v6.0.0;
825 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
826 if $v_version ge v6.1.0;
827 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
828 if $v_version ge v6.2.0;
830 # The lists below are hashes, so the key is the item in the list, and the
831 # value is the reason why it is in the list. This makes generation of
832 # documentation easier.
834 my %why_suppressed; # No file generated for these.
836 # Files aren't generated for empty extraneous properties. This is arguable.
837 # Extraneous properties generally come about because a property is no longer
838 # used in a newer version of Unicode. If we generated a file without code
839 # points, programs that used to work on that property will still execute
840 # without errors. It just won't ever match (or will always match, with \P{}).
841 # This means that the logic is now likely wrong. I (khw) think its better to
842 # find this out by getting an error message. Just move them to the table
843 # above to change this behavior
844 my %why_suppress_if_empty_warn_if_not = (
846 # It is the only property that has ever officially been removed from the
847 # Standard. The database never contained any code points for it.
848 'Special_Case_Condition' => 'Obsolete',
850 # Apparently never official, but there were code points in some versions of
851 # old-style PropList.txt
852 'Non_Break' => 'Obsolete',
855 # These would normally go in the warn table just above, but they were changed
856 # a long time before this program was written, so warnings about them are
858 if ($v_version gt v3.2.0) {
859 push @tables_that_may_be_empty,
860 'Canonical_Combining_Class=Attached_Below_Left'
863 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
864 # unless explicitly added.
865 if ($v_version ge v5.2.0) {
866 my $unihan = 'Unihan; remove from list if using Unihan';
867 foreach my $table (qw (
871 kCompatibilityVariant
885 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
889 # Enum values for to_output_map() method in the Map_Table package.
890 my $EXTERNAL_MAP = 1;
891 my $INTERNAL_MAP = 2;
892 my $OUTPUT_ADJUSTED = 3;
894 # To override computed values for writing the map tables for these properties.
895 # The default for enum map tables is to write them out, so that the Unicode
896 # .txt files can be removed, but all the data to compute any property value
897 # for any code point is available in a more compact form.
898 my %global_to_output_map = (
899 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
900 # get stuck supporting it if things change. Since it is a STRING
901 # property, it normally would be listed in the pod, but INTERNAL_MAP
903 Unicode_1_Name => $INTERNAL_MAP,
905 Present_In => 0, # Suppress, as easily computed from Age
906 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is
907 # retained, but needed for
910 # Suppress, as mapping can be found instead from the
911 # Perl_Decomposition_Mapping file
912 Decomposition_Type => 0,
915 # Properties that this program ignores.
916 my @unimplemented_properties;
918 # With this release, it is automatically handled if the Unihan db is
920 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
922 # There are several types of obsolete properties defined by Unicode. These
923 # must be hand-edited for every new Unicode release.
924 my %why_deprecated; # Generates a deprecated warning message if used.
925 my %why_stabilized; # Documentation only
926 my %why_obsolete; # Documentation only
929 my $simple = 'Perl uses the more complete version';
930 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
932 my $other_properties = 'other properties';
933 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
934 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
937 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
938 'Jamo_Short_Name' => $contributory,
939 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
940 'Other_Alphabetic' => $contributory,
941 'Other_Default_Ignorable_Code_Point' => $contributory,
942 'Other_Grapheme_Extend' => $contributory,
943 'Other_ID_Continue' => $contributory,
944 'Other_ID_Start' => $contributory,
945 'Other_Lowercase' => $contributory,
946 'Other_Math' => $contributory,
947 'Other_Uppercase' => $contributory,
948 'Expands_On_NFC' => $why_no_expand,
949 'Expands_On_NFD' => $why_no_expand,
950 'Expands_On_NFKC' => $why_no_expand,
951 'Expands_On_NFKD' => $why_no_expand,
955 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
956 # contains the same information, but without the algorithmically
957 # determinable Hangul syllables'. This file is not published, so it's
958 # existence is not noted in the comment.
959 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
961 'Indic_Matra_Category' => "Provisional",
962 'Indic_Syllabic_Category' => "Provisional",
964 # Don't suppress ISO_Comment, as otherwise special handling is needed
965 # to differentiate between it and gc=c, which can be written as 'isc',
966 # which is the same characters as ISO_Comment's short name.
968 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
970 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
971 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
972 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
973 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
975 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
978 foreach my $property (
980 # The following are suppressed because they were made contributory
981 # or deprecated by Unicode before Perl ever thought about
990 # The following are suppressed because they have been marked
991 # as deprecated for a sufficient amount of time
993 'Other_Default_Ignorable_Code_Point',
994 'Other_Grapheme_Extend',
1001 $why_suppressed{$property} = $why_deprecated{$property};
1004 # Customize the message for all the 'Other_' properties
1005 foreach my $property (keys %why_deprecated) {
1006 next if (my $main_property = $property) !~ s/^Other_//;
1007 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1011 if ($write_Unicode_deprecated_tables) {
1012 foreach my $property (keys %why_suppressed) {
1013 delete $why_suppressed{$property} if $property =~
1014 / ^ Other | Grapheme /x;
1018 if ($v_version ge 4.0.0) {
1019 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1020 if ($v_version ge 6.0.0) {
1021 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1024 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1025 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1026 if ($v_version ge 6.0.0) {
1027 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1031 # Probably obsolete forever
1032 if ($v_version ge v4.1.0) {
1033 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1035 if ($v_version ge v6.0.0) {
1036 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1037 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1040 # This program can create files for enumerated-like properties, such as
1041 # 'Numeric_Type'. This file would be the same format as for a string
1042 # property, with a mapping from code point to its value, so you could look up,
1043 # for example, the script a code point is in. But no one so far wants this
1044 # mapping, or they have found another way to get it since this is a new
1045 # feature. So no file is generated except if it is in this list.
1046 my @output_mapped_properties = split "\n", <<END;
1049 # If you are using the Unihan database in a Unicode version before 5.2, you
1050 # need to add the properties that you want to extract from it to this table.
1051 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1052 # listed, commented out
1053 my @cjk_properties = split "\n", <<'END';
1054 #cjkAccountingNumeric; kAccountingNumeric
1055 #cjkOtherNumeric; kOtherNumeric
1056 #cjkPrimaryNumeric; kPrimaryNumeric
1057 #cjkCompatibilityVariant; kCompatibilityVariant
1058 #cjkIICore ; kIICore
1059 #cjkIRG_GSource; kIRG_GSource
1060 #cjkIRG_HSource; kIRG_HSource
1061 #cjkIRG_JSource; kIRG_JSource
1062 #cjkIRG_KPSource; kIRG_KPSource
1063 #cjkIRG_KSource; kIRG_KSource
1064 #cjkIRG_TSource; kIRG_TSource
1065 #cjkIRG_USource; kIRG_USource
1066 #cjkIRG_VSource; kIRG_VSource
1067 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1070 # Similarly for the property values. For your convenience, the lines in the
1071 # 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
1072 # '#' marks (for Unicode versions before 5.2)
1073 my @cjk_property_values = split "\n", <<'END';
1074 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1075 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1076 ## @missing: 0000..10FFFF; cjkIICore; <none>
1077 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1078 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1079 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1080 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1081 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1082 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1083 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1084 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1085 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1086 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1087 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1090 # The input files don't list every code point. Those not listed are to be
1091 # defaulted to some value. Below are hard-coded what those values are for
1092 # non-binary properties as of 5.1. Starting in 5.0, there are
1093 # machine-parsable comment lines in the files that give the defaults; so this
1094 # list shouldn't have to be extended. The claim is that all missing entries
1095 # for binary properties will default to 'N'. Unicode tried to change that in
1096 # 5.2, but the beta period produced enough protest that they backed off.
1098 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1099 # be in the form that it expects. The others may be synonyms.
1100 my $CODE_POINT = '<code point>';
1101 my %default_mapping = (
1102 Age => "Unassigned",
1103 # Bidi_Class => Complicated; set in code
1104 Bidi_Mirroring_Glyph => "",
1105 Block => 'No_Block',
1106 Canonical_Combining_Class => 0,
1107 Case_Folding => $CODE_POINT,
1108 Decomposition_Mapping => $CODE_POINT,
1109 Decomposition_Type => 'None',
1110 East_Asian_Width => "Neutral",
1111 FC_NFKC_Closure => $CODE_POINT,
1112 General_Category => 'Cn',
1113 Grapheme_Cluster_Break => 'Other',
1114 Hangul_Syllable_Type => 'NA',
1116 Jamo_Short_Name => "",
1117 Joining_Group => "No_Joining_Group",
1118 # Joining_Type => Complicated; set in code
1119 kIICore => 'N', # Is converted to binary
1120 #Line_Break => Complicated; set in code
1121 Lowercase_Mapping => $CODE_POINT,
1128 Numeric_Type => 'None',
1129 Numeric_Value => 'NaN',
1130 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1131 Sentence_Break => 'Other',
1132 Simple_Case_Folding => $CODE_POINT,
1133 Simple_Lowercase_Mapping => $CODE_POINT,
1134 Simple_Titlecase_Mapping => $CODE_POINT,
1135 Simple_Uppercase_Mapping => $CODE_POINT,
1136 Titlecase_Mapping => $CODE_POINT,
1137 Unicode_1_Name => "",
1138 Unicode_Radical_Stroke => "",
1139 Uppercase_Mapping => $CODE_POINT,
1140 Word_Break => 'Other',
1143 # Below are files that Unicode furnishes, but this program ignores, and why.
1144 # NormalizationCorrections.txt requires some more explanation. It documents
1145 # the cumulative fixes to erroneous normalizations in earlier Unicode
1146 # versions. Its main purpose is so that someone running on an earlier version
1147 # can use this file to override what got published in that earlier release.
1148 # It would be easy for mktables to read and handle this file. But all the
1149 # corrections in it should already be in the other files for the release it
1150 # is. To get it to actually mean something useful, someone would have to be
1151 # using an earlier Unicode release, and copy it to the files for that release
1152 # and recomplile. So far there has been no demand to do that, so this hasn't
1154 my %ignored_files = (
1155 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1156 'Index.txt' => 'Alphabetical index of Unicode characters',
1157 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1158 'NamesList.txt' => 'Annotated list of characters',
1159 'NamesList.html' => 'Describes the format and contents of F<NamesList.txt>',
1160 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1161 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1162 'ReadMe.txt' => 'Documentation',
1163 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1164 'StandardizedVariants.html' => 'Provides a visual display of the standard variant sequences derived from F<StandardizedVariants.txt>.',
1165 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1166 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1167 'USourceGlyphs.pdf' => 'Pictures of the characters in F<USourceData.txt>',
1168 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1169 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1170 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1171 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1174 my %skipped_files; # List of files that we skip
1176 ### End of externally interesting definitions, except for @input_file_objects
1179 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1180 # This file is machine-generated by $0 from the Unicode
1181 # database, Version $string_version. Any changes made here will be lost!
1184 my $INTERNAL_ONLY_HEADER = <<"EOF";
1186 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1187 # This file is for internal use by core Perl only. The format and even the
1188 # name or existence of this file are subject to change without notice. Don't
1189 # use it directly. Use Unicode::UCD to access the Unicode character data
1193 my $DEVELOPMENT_ONLY=<<"EOF";
1194 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1195 # This file contains information artificially constrained to code points
1196 # present in Unicode release $string_compare_versions.
1197 # IT CANNOT BE RELIED ON. It is for use during development only and should
1198 # not be used for production.
1202 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1203 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1204 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1206 # We work with above-Unicode code points, up to UV_MAX. But when you get
1207 # that high, above IV_MAX, some operations don't work, and you can easily get
1208 # overflow. Therefore for internal use, we use a much smaller number,
1209 # translating it to UV_MAX only for output. The exact number is immaterial
1210 # (all Unicode code points are treated exactly the same), but the algorithm
1211 # requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
1212 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1213 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1214 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1216 my $MAX_PLATFORM_CODEPOINT = ~0;
1218 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1219 # two must be 10; if there are 5, the first must not be a 0. Written this way
1220 # to decrease backtracking. The first regex allows the code point to be at
1221 # the end of a word, but to work properly, the word shouldn't end with a valid
1222 # hex character. The second one won't match a code point at the end of a
1223 # word, and doesn't have the run-on issue
1224 my $run_on_code_point_re =
1225 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1226 my $code_point_re = qr/\b$run_on_code_point_re/;
1228 # This matches the beginning of the line in the Unicode db files that give the
1229 # defaults for code points not listed (i.e., missing) in the file. The code
1230 # depends on this ending with a semi-colon, so it can assume it is a valid
1231 # field when the line is split() by semi-colons
1232 my $missing_defaults_prefix =
1233 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1235 # Property types. Unicode has more types, but these are sufficient for our
1237 my $UNKNOWN = -1; # initialized to illegal value
1238 my $NON_STRING = 1; # Either binary or enum
1240 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1241 # tables, additional true and false tables are
1242 # generated so that false is anything matching the
1243 # default value, and true is everything else.
1244 my $ENUM = 4; # Include catalog
1245 my $STRING = 5; # Anything else: string or misc
1247 # Some input files have lines that give default values for code points not
1248 # contained in the file. Sometimes these should be ignored.
1249 my $NO_DEFAULTS = 0; # Must evaluate to false
1250 my $NOT_IGNORED = 1;
1253 # Range types. Each range has a type. Most ranges are type 0, for normal,
1254 # and will appear in the main body of the tables in the output files, but
1255 # there are other types of ranges as well, listed below, that are specially
1256 # handled. There are pseudo-types as well that will never be stored as a
1257 # type, but will affect the calculation of the type.
1259 # 0 is for normal, non-specials
1260 my $MULTI_CP = 1; # Sequence of more than code point
1261 my $HANGUL_SYLLABLE = 2;
1262 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1263 my $NULL = 4; # The map is to the null string; utf8.c can't
1264 # handle these, nor is there an accepted syntax
1265 # for them in \p{} constructs
1266 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1267 # otherwise be $MULTI_CP type are instead type 0
1269 # process_generic_property_file() can accept certain overrides in its input.
1270 # Each of these must begin AND end with $CMD_DELIM.
1271 my $CMD_DELIM = "\a";
1272 my $REPLACE_CMD = 'replace'; # Override the Replace
1273 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1278 # Values for the Replace argument to add_range.
1279 # $NO # Don't replace; add only the code points not
1281 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1282 # the comments at the subroutine definition.
1283 my $UNCONDITIONALLY = 2; # Replace without conditions.
1284 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1286 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1288 my $CROAK = 6; # Die with an error if is already there
1290 # Flags to give property statuses. The phrases are to remind maintainers that
1291 # if the flag is changed, the indefinite article referring to it in the
1292 # documentation may need to be as well.
1294 my $DEPRECATED = 'D';
1295 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1296 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1297 my $DISCOURAGED = 'X';
1298 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1299 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1301 my $a_bold_stricter = "a 'B<$STRICTER>'";
1302 my $A_bold_stricter = "A 'B<$STRICTER>'";
1303 my $STABILIZED = 'S';
1304 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1305 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1307 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1308 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1310 my %status_past_participles = (
1311 $DISCOURAGED => 'discouraged',
1312 $STABILIZED => 'stabilized',
1313 $OBSOLETE => 'obsolete',
1314 $DEPRECATED => 'deprecated',
1317 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1318 # externally documented.
1319 my $ORDINARY = 0; # The normal fate.
1320 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1321 # but there is a file written that can be used to
1322 # reconstruct this table
1323 my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
1324 # for Perl's internal use only
1325 my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl.
1326 # Is for backwards compatibility for applications that
1327 # read the file directly, so it's format is
1329 my $SUPPRESSED = 4; # The file for this table is not written out, and as a
1330 # result, we don't bother to do many computations on
1332 my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the
1333 # computations anyway, as the values are needed for
1334 # things to work. This happens when we have Perl
1335 # extensions that depend on Unicode tables that
1336 # wouldn't normally be in a given Unicode version.
1338 # The format of the values of the tables:
1339 my $EMPTY_FORMAT = "";
1340 my $BINARY_FORMAT = 'b';
1341 my $DECIMAL_FORMAT = 'd';
1342 my $FLOAT_FORMAT = 'f';
1343 my $INTEGER_FORMAT = 'i';
1344 my $HEX_FORMAT = 'x';
1345 my $RATIONAL_FORMAT = 'r';
1346 my $STRING_FORMAT = 's';
1347 my $ADJUST_FORMAT = 'a';
1348 my $HEX_ADJUST_FORMAT = 'ax';
1349 my $DECOMP_STRING_FORMAT = 'c';
1350 my $STRING_WHITE_SPACE_LIST = 'sw';
1352 my %map_table_formats = (
1353 $BINARY_FORMAT => 'binary',
1354 $DECIMAL_FORMAT => 'single decimal digit',
1355 $FLOAT_FORMAT => 'floating point number',
1356 $INTEGER_FORMAT => 'integer',
1357 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1358 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1359 $STRING_FORMAT => 'string',
1360 $ADJUST_FORMAT => 'some entries need adjustment',
1361 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1362 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1363 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1366 # Unicode didn't put such derived files in a separate directory at first.
1367 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1368 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1369 my $AUXILIARY = 'auxiliary';
1371 # Hashes and arrays that will eventually go into Heavy.pl for the use of
1372 # utf8_heavy.pl and into UCD.pl for the use of UCD.pm
1373 my %loose_to_file_of; # loosely maps table names to their respective
1375 my %stricter_to_file_of; # same; but for stricter mapping.
1376 my %loose_property_to_file_of; # Maps a loose property name to its map file
1377 my @inline_definitions = "V0"; # Each element gives a definition of a unique
1378 # inversion list. When a definition is inlined,
1379 # its value in the hash it's in (one of the two
1380 # defined just above) will include an index into
1381 # this array. The 0th element is initialized to
1382 # the definition for a zero length invwersion list
1383 my %file_to_swash_name; # Maps the file name to its corresponding key name
1384 # in the hash %utf8::SwashInfo
1385 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1386 # their rational equivalent
1387 my %loose_property_name_of; # Loosely maps (non_string) property names to
1389 my %string_property_loose_to_name; # Same, for string properties.
1390 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1391 # the property name in standard loose form, and
1392 # 'value' is the default value for that property,
1393 # also in standard loose form.
1394 my %loose_to_standard_value; # loosely maps table names to the canonical
1396 my %ambiguous_names; # keys are alias names (in standard form) that
1397 # have more than one possible meaning.
1398 my %prop_aliases; # Keys are standard property name; values are each
1400 my %prop_value_aliases; # Keys of top level are standard property name;
1401 # values are keys to another hash, Each one is
1402 # one of the property's values, in standard form.
1403 # The values are that prop-val's aliases.
1404 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1406 # Most properties are immune to caseless matching, otherwise you would get
1407 # nonsensical results, as properties are a function of a code point, not
1408 # everything that is caselessly equivalent to that code point. For example,
1409 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1410 # be true because 's' and 'S' are equivalent caselessly. However,
1411 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1412 # extend that concept to those very few properties that are like this. Each
1413 # such property will match the full range caselessly. They are hard-coded in
1414 # the program; it's not worth trying to make it general as it's extremely
1415 # unlikely that they will ever change.
1416 my %caseless_equivalent_to;
1418 # These constants names and values were taken from the Unicode standard,
1419 # version 5.1, section 3.12. They are used in conjunction with Hangul
1420 # syllables. The '_string' versions are so generated tables can retain the
1421 # hex format, which is the more familiar value
1422 my $SBase_string = "0xAC00";
1423 my $SBase = CORE::hex $SBase_string;
1424 my $LBase_string = "0x1100";
1425 my $LBase = CORE::hex $LBase_string;
1426 my $VBase_string = "0x1161";
1427 my $VBase = CORE::hex $VBase_string;
1428 my $TBase_string = "0x11A7";
1429 my $TBase = CORE::hex $TBase_string;
1434 my $NCount = $VCount * $TCount;
1436 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1437 # with the above published constants.
1439 my %Jamo_L; # Leading consonants
1440 my %Jamo_V; # Vowels
1441 my %Jamo_T; # Trailing consonants
1443 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1444 # The key is the base name of the code point, and the value is an
1445 # array giving all the ranges that use this base name. Each range
1446 # is actually a hash giving the 'low' and 'high' values of it.
1447 my %names_ending_in_code_point;
1448 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1449 # removed from the names
1450 # Inverse mapping. The list of ranges that have these kinds of
1451 # names. Each element contains the low, high, and base names in an
1453 my @code_points_ending_in_code_point;
1455 # To hold Unicode's normalization test suite
1456 my @normalization_tests;
1458 # Boolean: does this Unicode version have the hangul syllables, and are we
1459 # writing out a table for them?
1460 my $has_hangul_syllables = 0;
1462 # Does this Unicode version have code points whose names end in their
1463 # respective code points, and are we writing out a table for them? 0 for no;
1464 # otherwise points to first property that a table is needed for them, so that
1465 # if multiple tables are needed, we don't create duplicates
1466 my $needing_code_points_ending_in_code_point = 0;
1468 my @backslash_X_tests; # List of tests read in for testing \X
1469 my @unhandled_properties; # Will contain a list of properties found in
1470 # the input that we didn't process.
1471 my @match_properties; # Properties that have match tables, to be
1473 my @map_properties; # Properties that get map files written
1474 my @named_sequences; # NamedSequences.txt contents.
1475 my %potential_files; # Generated list of all .txt files in the directory
1476 # structure so we can warn if something is being
1478 my @files_actually_output; # List of files we generated.
1479 my @more_Names; # Some code point names are compound; this is used
1480 # to store the extra components of them.
1481 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1482 # the minimum before we consider it equivalent to a
1483 # candidate rational
1484 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1486 # These store references to certain commonly used property objects
1496 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1497 my $has_In_conflicts = 0;
1498 my $has_Is_conflicts = 0;
1500 sub internal_file_to_platform ($) {
1501 # Convert our file paths which have '/' separators to those of the
1505 return undef unless defined $file;
1507 return File::Spec->join(split '/', $file);
1510 sub file_exists ($) { # platform independent '-e'. This program internally
1511 # uses slash as a path separator.
1513 return 0 if ! defined $file;
1514 return -e internal_file_to_platform($file);
1518 # Returns the address of the blessed input object.
1519 # It doesn't check for blessedness because that would do a string eval
1520 # every call, and the program is structured so that this is never called
1521 # for a non-blessed object.
1523 no overloading; # If overloaded, numifying below won't work.
1525 # Numifying a ref gives its address.
1526 return pack 'J', $_[0];
1529 # These are used only if $annotate is true.
1530 # The entire range of Unicode characters is examined to populate these
1531 # after all the input has been processed. But most can be skipped, as they
1532 # have the same descriptive phrases, such as being unassigned
1533 my @viacode; # Contains the 1 million character names
1534 my @printable; # boolean: And are those characters printable?
1535 my @annotate_char_type; # Contains a type of those characters, specifically
1536 # for the purposes of annotation.
1537 my $annotate_ranges; # A map of ranges of code points that have the same
1538 # name for the purposes of annotation. They map to the
1539 # upper edge of the range, so that the end point can
1540 # be immediately found. This is used to skip ahead to
1541 # the end of a range, and avoid processing each
1542 # individual code point in it.
1543 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1544 # characters, but excluding those which are
1545 # also noncharacter code points
1547 # The annotation types are an extension of the regular range types, though
1548 # some of the latter are folded into one. Make the new types negative to
1549 # avoid conflicting with the regular types
1550 my $SURROGATE_TYPE = -1;
1551 my $UNASSIGNED_TYPE = -2;
1552 my $PRIVATE_USE_TYPE = -3;
1553 my $NONCHARACTER_TYPE = -4;
1554 my $CONTROL_TYPE = -5;
1555 my $ABOVE_UNICODE_TYPE = -6;
1556 my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
1558 sub populate_char_info ($) {
1559 # Used only with the $annotate option. Populates the arrays with the
1560 # input code point's info that are needed for outputting more detailed
1561 # comments. If calling context wants a return, it is the end point of
1562 # any contiguous range of characters that share essentially the same info
1565 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1567 $viacode[$i] = $perl_charname->value_of($i) || "";
1569 # A character is generally printable if Unicode says it is,
1570 # but below we make sure that most Unicode general category 'C' types
1572 $printable[$i] = $print->contains($i);
1574 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1576 # Only these two regular types are treated specially for annotations
1578 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1579 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1581 # Give a generic name to all code points that don't have a real name.
1582 # We output ranges, if applicable, for these. Also calculate the end
1583 # point of the range.
1585 if (! $viacode[$i]) {
1587 if ($i > $MAX_UNICODE_CODEPOINT) {
1588 $viacode[$i] = 'Above-Unicode';
1589 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1591 $end = $MAX_WORKING_CODEPOINT;
1593 elsif ($gc-> table('Private_use')->contains($i)) {
1594 $viacode[$i] = 'Private Use';
1595 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1597 $end = $gc->table('Private_Use')->containing_range($i)->end;
1599 elsif ((defined ($nonchar =
1600 Property::property_ref('Noncharacter_Code_Point'))
1601 && $nonchar->table('Y')->contains($i)))
1603 $viacode[$i] = 'Noncharacter';
1604 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1606 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1607 containing_range($i)->end;
1609 elsif ($gc-> table('Control')->contains($i)) {
1610 $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1611 $annotate_char_type[$i] = $CONTROL_TYPE;
1614 elsif ($gc-> table('Unassigned')->contains($i)) {
1615 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1617 if ($v_version lt v2.0.0) { # No blocks in earliest releases
1618 $viacode[$i] = 'Unassigned';
1619 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1622 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1624 # Because we name the unassigned by the blocks they are in, it
1625 # can't go past the end of that block, and it also can't go
1626 # past the unassigned range it is in. The special table makes
1627 # sure that the non-characters, which are unassigned, are
1629 $end = min($block->containing_range($i)->end,
1630 $unassigned_sans_noncharacters->
1631 containing_range($i)->end);
1634 elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
1635 $viacode[$i] = $gc->value_of($i);
1636 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1639 elsif ($gc-> table('Surrogate')->contains($i)) {
1640 $viacode[$i] = 'Surrogate';
1641 $annotate_char_type[$i] = $SURROGATE_TYPE;
1643 $end = $gc->table('Surrogate')->containing_range($i)->end;
1646 Carp::my_carp_bug("Can't figure out how to annotate "
1647 . sprintf("U+%04X", $i)
1648 . ". Proceeding anyway.");
1649 $viacode[$i] = 'UNKNOWN';
1650 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1655 # Here, has a name, but if it's one in which the code point number is
1656 # appended to the name, do that.
1657 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1658 $viacode[$i] .= sprintf("-%04X", $i);
1659 $end = $perl_charname->containing_range($i)->end;
1662 # And here, has a name, but if it's a hangul syllable one, replace it with
1663 # the correct name from the Unicode algorithm
1664 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1666 my $SIndex = $i - $SBase;
1667 my $L = $LBase + $SIndex / $NCount;
1668 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1669 my $T = $TBase + $SIndex % $TCount;
1670 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1671 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1672 $end = $perl_charname->containing_range($i)->end;
1675 return if ! defined wantarray;
1676 return $i if ! defined $end; # If not a range, return the input
1678 # Save this whole range so can find the end point quickly
1679 $annotate_ranges->add_map($i, $end, $end);
1684 # Commented code below should work on Perl 5.8.
1685 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1686 ## the native perl version of it (which is what would operate under miniperl)
1687 ## is extremely slow, as it does a string eval every call.
1688 #my $has_fast_scalar_util = $^X !~ /miniperl/
1689 # && defined eval "require Scalar::Util";
1692 # # Returns the address of the blessed input object. Uses the XS version if
1693 # # available. It doesn't check for blessedness because that would do a
1694 # # string eval every call, and the program is structured so that this is
1695 # # never called for a non-blessed object.
1697 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1699 # # Check at least that is a ref.
1700 # my $pkg = ref($_[0]) or return undef;
1702 # # Change to a fake package to defeat any overloaded stringify
1703 # bless $_[0], 'main::Fake';
1705 # # Numifying a ref gives its address.
1706 # my $addr = pack 'J', $_[0];
1708 # # Return to original class
1709 # bless $_[0], $pkg;
1716 return $a if $a >= $b;
1723 return $a if $a <= $b;
1727 sub clarify_number ($) {
1728 # This returns the input number with underscores inserted every 3 digits
1729 # in large (5 digits or more) numbers. Input must be entirely digits, not
1733 my $pos = length($number) - 3;
1734 return $number if $pos <= 1;
1736 substr($number, $pos, 0) = '_';
1742 sub clarify_code_point_count ($) {
1743 # This is like clarify_number(), but the input is assumed to be a count of
1744 # code points, rather than a generic number.
1749 if ($number > $MAX_UNICODE_CODEPOINTS) {
1750 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1751 return "All above-Unicode code points" if $number == 0;
1752 $append = " + all above-Unicode code points";
1754 return clarify_number($number) . $append;
1759 # These routines give a uniform treatment of messages in this program. They
1760 # are placed in the Carp package to cause the stack trace to not include them,
1761 # although an alternative would be to use another package and set @CARP_NOT
1764 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1766 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1767 # and overload trying to load Scalar:Util under miniperl. See
1768 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1769 undef $overload::VERSION;
1772 my $message = shift || "";
1773 my $nofold = shift || 0;
1776 $message = main::join_lines($message);
1777 $message =~ s/^$0: *//; # Remove initial program name
1778 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1779 $message = "\n$0: $message;";
1781 # Fold the message with program name, semi-colon end punctuation
1782 # (which looks good with the message that carp appends to it), and a
1783 # hanging indent for continuation lines.
1784 $message = main::simple_fold($message, "", 4) unless $nofold;
1785 $message =~ s/\n$//; # Remove the trailing nl so what carp
1786 # appends is to the same line
1789 return $message if defined wantarray; # If a caller just wants the msg
1796 # This is called when it is clear that the problem is caused by a bug in
1799 my $message = shift;
1800 $message =~ s/^$0: *//;
1801 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1806 sub carp_too_few_args {
1808 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1812 my $args_ref = shift;
1815 my_carp_bug("Need at least $count arguments to "
1817 . ". Instead got: '"
1818 . join ', ', @$args_ref
1819 . "'. No action taken.");
1823 sub carp_extra_args {
1824 my $args_ref = shift;
1825 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1827 unless (ref $args_ref) {
1828 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1831 my ($package, $file, $line) = caller;
1832 my $subroutine = (caller 1)[3];
1835 if (ref $args_ref eq 'HASH') {
1836 foreach my $key (keys %$args_ref) {
1837 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1839 $list = join ', ', each %{$args_ref};
1841 elsif (ref $args_ref eq 'ARRAY') {
1842 foreach my $arg (@$args_ref) {
1843 $arg = $UNDEF unless defined $arg;
1845 $list = join ', ', @$args_ref;
1848 my_carp_bug("Can't cope with ref "
1850 . " . argument to 'carp_extra_args'. Not checking arguments.");
1854 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1862 # This program uses the inside-out method for objects, as recommended in
1863 # "Perl Best Practices". (This is the best solution still, since this has
1864 # to run under miniperl.) This closure aids in generating those. There
1865 # are two routines. setup_package() is called once per package to set
1866 # things up, and then set_access() is called for each hash representing a
1867 # field in the object. These routines arrange for the object to be
1868 # properly destroyed when no longer used, and for standard accessor
1869 # functions to be generated. If you need more complex accessors, just
1870 # write your own and leave those accesses out of the call to set_access().
1871 # More details below.
1873 my %constructor_fields; # fields that are to be used in constructors; see
1876 # The values of this hash will be the package names as keys to other
1877 # hashes containing the name of each field in the package as keys, and
1878 # references to their respective hashes as values.
1882 # Sets up the package, creating standard DESTROY and dump methods
1883 # (unless already defined). The dump method is used in debugging by
1885 # The optional parameters are:
1886 # a) a reference to a hash, that gets populated by later
1887 # set_access() calls with one of the accesses being
1888 # 'constructor'. The caller can then refer to this, but it is
1889 # not otherwise used by these two routines.
1890 # b) a reference to a callback routine to call during destruction
1891 # of the object, before any fields are actually destroyed
1894 my $constructor_ref = delete $args{'Constructor_Fields'};
1895 my $destroy_callback = delete $args{'Destroy_Callback'};
1896 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1899 my $package = (caller)[0];
1901 $package_fields{$package} = \%fields;
1902 $constructor_fields{$package} = $constructor_ref;
1904 unless ($package->can('DESTROY')) {
1905 my $destroy_name = "${package}::DESTROY";
1908 # Use typeglob to give the anonymous subroutine the name we want
1909 *$destroy_name = sub {
1911 my $addr = do { no overloading; pack 'J', $self; };
1913 $self->$destroy_callback if $destroy_callback;
1914 foreach my $field (keys %{$package_fields{$package}}) {
1915 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1916 delete $package_fields{$package}{$field}{$addr};
1922 unless ($package->can('dump')) {
1923 my $dump_name = "${package}::dump";
1927 return dump_inside_out($self, $package_fields{$package}, @_);
1934 # Arrange for the input field to be garbage collected when no longer
1935 # needed. Also, creates standard accessor functions for the field
1936 # based on the optional parameters-- none if none of these parameters:
1937 # 'addable' creates an 'add_NAME()' accessor function.
1938 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1940 # 'settable' creates a 'set_NAME()' accessor function.
1941 # 'constructor' doesn't create an accessor function, but adds the
1942 # field to the hash that was previously passed to
1944 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1945 # 'add' etc. all mean 'addable'.
1946 # The read accessor function will work on both array and scalar
1947 # values. If another accessor in the parameter list is 'a', the read
1948 # access assumes an array. You can also force it to be array access
1949 # by specifying 'readable_array' instead of 'readable'
1951 # A sort-of 'protected' access can be set-up by preceding the addable,
1952 # readable or settable with some initial portion of 'protected_' (but,
1953 # the underscore is required), like 'p_a', 'pro_set', etc. The
1954 # "protection" is only by convention. All that happens is that the
1955 # accessor functions' names begin with an underscore. So instead of
1956 # calling set_foo, the call is _set_foo. (Real protection could be
1957 # accomplished by having a new subroutine, end_package, called at the
1958 # end of each package, and then storing the __LINE__ ranges and
1959 # checking them on every accessor. But that is way overkill.)
1961 # We create anonymous subroutines as the accessors and then use
1962 # typeglobs to assign them to the proper package and name
1964 my $name = shift; # Name of the field
1965 my $field = shift; # Reference to the inside-out hash containing the
1968 my $package = (caller)[0];
1970 if (! exists $package_fields{$package}) {
1971 croak "$0: Must call 'setup_package' before 'set_access'";
1974 # Stash the field so DESTROY can get it.
1975 $package_fields{$package}{$name} = $field;
1977 # Remaining arguments are the accessors. For each...
1978 foreach my $access (@_) {
1979 my $access = lc $access;
1983 # Match the input as far as it goes.
1984 if ($access =~ /^(p[^_]*)_/) {
1986 if (substr('protected_', 0, length $protected)
1990 # Add 1 for the underscore not included in $protected
1991 $access = substr($access, length($protected) + 1);
1999 if (substr('addable', 0, length $access) eq $access) {
2000 my $subname = "${package}::${protected}add_$name";
2003 # add_ accessor. Don't add if already there, which we
2004 # determine using 'eq' for scalars and '==' otherwise.
2007 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2010 my $addr = do { no overloading; pack 'J', $self; };
2011 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2013 return if grep { $value == $_ } @{$field->{$addr}};
2016 return if grep { $value eq $_ } @{$field->{$addr}};
2018 push @{$field->{$addr}}, $value;
2022 elsif (substr('constructor', 0, length $access) eq $access) {
2024 Carp::my_carp_bug("Can't set-up 'protected' constructors")
2027 $constructor_fields{$package}{$name} = $field;
2030 elsif (substr('readable_array', 0, length $access) eq $access) {
2032 # Here has read access. If one of the other parameters for
2033 # access is array, or this one specifies array (by being more
2034 # than just 'readable_'), then create a subroutine that
2035 # assumes the data is an array. Otherwise just a scalar
2036 my $subname = "${package}::${protected}$name";
2037 if (grep { /^a/i } @_
2038 or length($access) > length('readable_'))
2043 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2044 my $addr = do { no overloading; pack 'J', $_[0]; };
2045 if (ref $field->{$addr} ne 'ARRAY') {
2046 my $type = ref $field->{$addr};
2047 $type = 'scalar' unless $type;
2048 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
2051 return scalar @{$field->{$addr}} unless wantarray;
2053 # Make a copy; had problems with caller modifying the
2054 # original otherwise
2055 my @return = @{$field->{$addr}};
2061 # Here not an array value, a simpler function.
2065 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2067 return $field->{pack 'J', $_[0]};
2071 elsif (substr('settable', 0, length $access) eq $access) {
2072 my $subname = "${package}::${protected}set_$name";
2077 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2078 Carp::carp_extra_args(\@_) if @_ > 2;
2080 # $self is $_[0]; $value is $_[1]
2082 $field->{pack 'J', $_[0]} = $_[1];
2087 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2096 # All input files use this object, which stores various attributes about them,
2097 # and provides for convenient, uniform handling. The run method wraps the
2098 # processing. It handles all the bookkeeping of opening, reading, and closing
2099 # the file, returning only significant input lines.
2101 # Each object gets a handler which processes the body of the file, and is
2102 # called by run(). All character property files must use the generic,
2103 # default handler, which has code scrubbed to handle things you might not
2104 # expect, including automatic EBCDIC handling. For files that don't deal with
2105 # mapping code points to a property value, such as test files,
2106 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2107 # handler to be a custom one. Such a handler should basically be a
2108 # while(next_line()) {...} loop.
2110 # You can also set up handlers to
2111 # 1) call before the first line is read, for pre processing
2112 # 2) call to adjust each line of the input before the main handler gets
2113 # them. This can be automatically generated, if appropriately simple
2114 # enough, by specifiying a Properties parameter in the constructor.
2115 # 3) call upon EOF before the main handler exits its loop
2116 # 4) call at the end, for post processing
2118 # $_ is used to store the input line, and is to be filtered by the
2119 # each_line_handler()s. So, if the format of the line is not in the desired
2120 # format for the main handler, these are used to do that adjusting. They can
2121 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2122 # so the $_ output of one is used as the input to the next. None of the other
2123 # handlers are stackable, but could easily be changed to be so.
2125 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2126 # which insert the parameters as lines to be processed before the next input
2127 # file line is read. This allows the EOF handler to flush buffers, for
2128 # example. The difference between the two routines is that the lines inserted
2129 # by insert_lines() are subjected to the each_line_handler()s. (So if you
2130 # called it from such a handler, you would get infinite recursion.) Lines
2131 # inserted by insert_adjusted_lines() go directly to the main handler without
2132 # any adjustments. If the post-processing handler calls any of these, there
2133 # will be no effect. Some error checking for these conditions could be added,
2134 # but it hasn't been done.
2136 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2137 # to prevent further processing of the line. This routine will output the
2138 # message as a warning once, and then keep a count of the lines that have the
2139 # same message, and output that count at the end of the file's processing.
2140 # This keeps the number of messages down to a manageable amount.
2142 # get_missings() should be called to retrieve any @missing input lines.
2143 # Messages will be raised if this isn't done if the options aren't to ignore
2146 sub trace { return main::trace(@_); }
2149 # Keep track of fields that are to be put into the constructor.
2150 my %constructor_fields;
2152 main::setup_package(Constructor_Fields => \%constructor_fields);
2154 my %file; # Input file name, required
2155 main::set_access('file', \%file, qw{ c r });
2157 my %first_released; # Unicode version file was first released in, required
2158 main::set_access('first_released', \%first_released, qw{ c r });
2160 my %handler; # Subroutine to process the input file, defaults to
2161 # 'process_generic_property_file'
2162 main::set_access('handler', \%handler, qw{ c });
2165 # name of property this file is for. defaults to none, meaning not
2166 # applicable, or is otherwise determinable, for example, from each line.
2167 main::set_access('property', \%property, qw{ c r });
2170 # If this is true, the file is optional. If not present, no warning is
2171 # output. If it is present, the string given by this parameter is
2172 # evaluated, and if false the file is not processed.
2173 main::set_access('optional', \%optional, 'c', 'r');
2176 # This is used for debugging, to skip processing of all but a few input
2177 # files. Add 'non_skip => 1' to the constructor for those files you want
2178 # processed when you set the $debug_skip global.
2179 main::set_access('non_skip', \%non_skip, 'c');
2182 # This is used to skip processing of this input file semi-permanently,
2183 # when it evaluates to true. The value should be the reason the file is
2184 # being skipped. It is used for files that we aren't planning to process
2185 # anytime soon, but want to allow to be in the directory and not raise a
2186 # message that we are not handling. Mostly for test files. This is in
2187 # contrast to the non_skip element, which is supposed to be used very
2188 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2189 # pretty much will never look at can be placed in the global
2190 # %ignored_files instead. Ones used here will be added to %skipped files
2191 main::set_access('skip', \%skip, 'c');
2193 my %each_line_handler;
2194 # list of subroutines to look at and filter each non-comment line in the
2195 # file. defaults to none. The subroutines are called in order, each is
2196 # to adjust $_ for the next one, and the final one adjusts it for
2198 main::set_access('each_line_handler', \%each_line_handler, 'c');
2200 my %properties; # Optional ordered list of the properties that occur in each
2201 # meaningful line of the input file. If present, an appropriate
2202 # each_line_handler() is automatically generated and pushed onto the stack
2203 # of such handlers. This is useful when a file contains multiple
2204 # proerties per line, but no other special considerations are necessary.
2205 # The special value "<ignored>" means to discard the corresponding input
2207 # Any @missing lines in the file should also match this syntax; no such
2208 # files exist as of 6.3. But if it happens in a future release, the code
2209 # could be expanded to properly parse them.
2210 main::set_access('properties', \%properties, qw{ c r });
2212 my %has_missings_defaults;
2213 # ? Are there lines in the file giving default values for code points
2214 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2215 # the norm, but IGNORED means it has such lines, but the handler doesn't
2216 # use them. Having these three states allows us to catch changes to the
2217 # UCD that this program should track. XXX This could be expanded to
2218 # specify the syntax for such lines, like %properties above.
2219 main::set_access('has_missings_defaults',
2220 \%has_missings_defaults, qw{ c r });
2223 # Subroutine to call before doing anything else in the file. If undef, no
2224 # such handler is called.
2225 main::set_access('pre_handler', \%pre_handler, qw{ c });
2228 # Subroutine to call upon getting an EOF on the input file, but before
2229 # that is returned to the main handler. This is to allow buffers to be
2230 # flushed. The handler is expected to call insert_lines() or
2231 # insert_adjusted() with the buffered material
2232 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2235 # Subroutine to call after all the lines of the file are read in and
2236 # processed. If undef, no such handler is called.
2237 main::set_access('post_handler', \%post_handler, qw{ c });
2239 my %progress_message;
2240 # Message to print to display progress in lieu of the standard one
2241 main::set_access('progress_message', \%progress_message, qw{ c });
2244 # cache open file handle, internal. Is undef if file hasn't been
2245 # processed at all, empty if has;
2246 main::set_access('handle', \%handle);
2249 # cache of lines added virtually to the file, internal
2250 main::set_access('added_lines', \%added_lines);
2253 # cache of lines added virtually to the file, internal
2254 main::set_access('remapped_lines', \%remapped_lines);
2257 # cache of errors found, internal
2258 main::set_access('errors', \%errors);
2261 # storage of '@missing' defaults lines
2262 main::set_access('missings', \%missings);
2265 sub _next_line_with_remapped_range;
2270 my $self = bless \do{ my $anonymous_scalar }, $class;
2271 my $addr = do { no overloading; pack 'J', $self; };
2274 $handler{$addr} = \&main::process_generic_property_file;
2275 $non_skip{$addr} = 0;
2277 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2278 $handle{$addr} = undef;
2279 $added_lines{$addr} = [ ];
2280 $remapped_lines{$addr} = [ ];
2281 $each_line_handler{$addr} = [ ];
2282 $errors{$addr} = { };
2283 $missings{$addr} = [ ];
2285 # Two positional parameters.
2286 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2287 $file{$addr} = main::internal_file_to_platform(shift);
2288 $first_released{$addr} = shift;
2290 # The rest of the arguments are key => value pairs
2291 # %constructor_fields has been set up earlier to list all possible
2292 # ones. Either set or push, depending on how the default has been set
2295 foreach my $key (keys %args) {
2296 my $argument = $args{$key};
2298 # Note that the fields are the lower case of the constructor keys
2299 my $hash = $constructor_fields{lc $key};
2300 if (! defined $hash) {
2301 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2304 if (ref $hash->{$addr} eq 'ARRAY') {
2305 if (ref $argument eq 'ARRAY') {
2306 foreach my $argument (@{$argument}) {
2307 next if ! defined $argument;
2308 push @{$hash->{$addr}}, $argument;
2312 push @{$hash->{$addr}}, $argument if defined $argument;
2316 $hash->{$addr} = $argument;
2321 # If the file has a property for it, it means that the property is not
2322 # listed in the file's entries. So add a handler to the list of line
2323 # handlers to insert the property name into the lines, to provide a
2324 # uniform interface to the final processing subroutine.
2325 # the final code doesn't have to worry about that.
2326 if ($property{$addr}) {
2327 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2330 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2331 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2334 # If skipping, set to optional, and add to list of ignored files,
2335 # including its reason
2337 $optional{$addr} = 1;
2338 $skipped_files{$file{$addr}} = $skip{$addr}
2340 elsif ($properties{$addr}) {
2342 # Add a handler for each line in the input so that it creates a
2343 # separate input line for each property in those input lines, thus
2344 # making them suitable for process_generic_property_file().
2346 push @{$each_line_handler{$addr}},
2349 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2351 my @fields = split /\s*;\s*/, $_, -1;
2353 if (@fields - 1 > @{$properties{$addr}}) {
2354 $file->carp_bad_line('Extra fields');
2358 my $range = shift @fields; # 0th element is always the
2361 # The next fields in the input line correspond
2362 # respectively to the stored properties.
2363 for my $i (0 .. @{$properties{$addr}} - 1) {
2364 my $property_name = $properties{$addr}[$i];
2365 next if $property_name eq '<ignored>';
2366 $file->insert_adjusted_lines(
2367 "$range; $property_name; $fields[$i]");
2375 { # On non-ascii platforms, we use a special handler
2378 *next_line = (main::NON_ASCII_PLATFORM)
2379 ? *_next_line_with_remapped_range
2389 qw("") => "_operator_stringify",
2390 "." => \&main::_operator_dot,
2391 ".=" => \&main::_operator_dot_equal,
2394 sub _operator_stringify {
2397 return __PACKAGE__ . " object for " . $self->file;
2400 # flag to make sure extracted files are processed early
2401 my $seen_non_extracted_non_age = 0;
2404 # Process the input object $self. This opens and closes the file and
2405 # calls all the handlers for it. Currently, this can only be called
2406 # once per file, as it destroy's the EOF handler
2409 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2411 my $addr = do { no overloading; pack 'J', $self; };
2413 my $file = $file{$addr};
2415 # Don't process if not expecting this file (because released later
2416 # than this Unicode version), and isn't there. This means if someone
2417 # copies it into an earlier version's directory, we will go ahead and
2419 return if $first_released{$addr} gt $v_version && ! -e $file;
2421 # If in debugging mode and this file doesn't have the non-skip
2422 # flag set, and isn't one of the critical files, skip it.
2424 && $first_released{$addr} ne v0
2425 && ! $non_skip{$addr})
2427 print "Skipping $file in debugging\n" if $verbosity;
2431 # File could be optional
2432 if ($optional{$addr}) {
2433 return unless -e $file;
2434 my $result = eval $optional{$addr};
2435 if (! defined $result) {
2436 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2441 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2447 if (! defined $file || ! -e $file) {
2449 # If the file doesn't exist, see if have internal data for it
2450 # (based on first_released being 0).
2451 if ($first_released{$addr} eq v0) {
2452 $handle{$addr} = 'pretend_is_open';
2455 if (! $optional{$addr} # File could be optional
2456 && $v_version ge $first_released{$addr})
2458 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2465 # Here, the file exists. Some platforms may change the case of
2467 if ($seen_non_extracted_non_age) {
2468 if ($file =~ /$EXTRACTED/i) {
2469 Carp::my_carp_bug(main::join_lines(<<END
2470 $file should be processed just after the 'Prop...Alias' files, and before
2471 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2472 have subtle problems
2477 elsif ($EXTRACTED_DIR
2478 && $first_released{$addr} ne v0
2479 && $file !~ /$EXTRACTED/i
2480 && lc($file) ne 'dage.txt')
2482 # We don't set this (by the 'if' above) if we have no
2483 # extracted directory, so if running on an early version,
2484 # this test won't work. Not worth worrying about.
2485 $seen_non_extracted_non_age = 1;
2488 # And mark the file as having being processed, and warn if it
2489 # isn't a file we are expecting. As we process the files,
2490 # they are deleted from the hash, so any that remain at the
2491 # end of the program are files that we didn't process.
2492 my $fkey = File::Spec->rel2abs($file);
2493 my $expecting = delete $potential_files{lc($fkey)};
2495 Carp::my_carp("Was not expecting '$file'.") if
2497 && ! defined $handle{$addr};
2499 # Having deleted from expected files, we can quit if not to do
2500 # anything. Don't print progress unless really want verbosity
2502 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2506 # Open the file, converting the slashes used in this program
2507 # into the proper form for the OS
2509 if (not open $file_handle, "<", $file) {
2510 Carp::my_carp("Can't open $file. Skipping: $!");
2513 $handle{$addr} = $file_handle; # Cache the open file handle
2515 if ($v_version ge v3.2.0
2516 && lc($file) ne 'unicodedata.txt'
2518 # Unihan files used another format until v7
2519 && ($v_version ge v7.0.0 || $file !~ /^Unihan/i))
2521 $_ = <$file_handle>;
2522 if ($_ !~ / - $string_version \. /x) {
2525 die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
2530 if ($verbosity >= $PROGRESS) {
2531 if ($progress_message{$addr}) {
2532 print "$progress_message{$addr}\n";
2535 # If using a virtual file, say so.
2536 print "Processing ", (-e $file)
2538 : "substitute $file",
2544 # Call any special handler for before the file.
2545 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2547 # Then the main handler
2548 &{$handler{$addr}}($self);
2550 # Then any special post-file handler.
2551 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2553 # If any errors have been accumulated, output the counts (as the first
2554 # error message in each class was output when it was encountered).
2555 if ($errors{$addr}) {
2558 foreach my $error (keys %{$errors{$addr}}) {
2559 $total += $errors{$addr}->{$error};
2560 delete $errors{$addr}->{$error};
2565 = "A total of $total lines had errors in $file. ";
2567 $message .= ($types == 1)
2568 ? '(Only the first one was displayed.)'
2569 : '(Only the first of each type was displayed.)';
2570 Carp::my_carp($message);
2574 if (@{$missings{$addr}}) {
2575 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2578 # If a real file handle, close it.
2579 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2581 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2582 # the file, as opposed to undef
2587 # Sets $_ to be the next logical input line, if any. Returns non-zero
2588 # if such a line exists. 'logical' means that any lines that have
2589 # been added via insert_lines() will be returned in $_ before the file
2593 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2595 my $addr = do { no overloading; pack 'J', $self; };
2597 # Here the file is open (or if the handle is not a ref, is an open
2598 # 'virtual' file). Get the next line; any inserted lines get priority
2599 # over the file itself.
2603 while (1) { # Loop until find non-comment, non-empty line
2604 #local $to_trace = 1 if main::DEBUG;
2605 my $inserted_ref = shift @{$added_lines{$addr}};
2606 if (defined $inserted_ref) {
2607 ($adjusted, $_) = @{$inserted_ref};
2608 trace $adjusted, $_ if main::DEBUG && $to_trace;
2609 return 1 if $adjusted;
2612 last if ! ref $handle{$addr}; # Don't read unless is real file
2613 last if ! defined ($_ = readline $handle{$addr});
2616 trace $_ if main::DEBUG && $to_trace;
2618 # See if this line is the comment line that defines what property
2619 # value that code points that are not listed in the file should
2620 # have. The format or existence of these lines is not guaranteed
2621 # by Unicode since they are comments, but the documentation says
2622 # that this was added for machine-readability, so probably won't
2623 # change. This works starting in Unicode Version 5.0. They look
2626 # @missing: 0000..10FFFF; Not_Reordered
2627 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2628 # @missing: 0000..10FFFF; ; NaN
2630 # Save the line for a later get_missings() call.
2631 if (/$missing_defaults_prefix/) {
2632 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2633 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2635 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2636 my @defaults = split /\s* ; \s*/x, $_;
2638 # The first field is the @missing, which ends in a
2639 # semi-colon, so can safely shift.
2642 # Some of these lines may have empty field placeholders
2643 # which get in the way. An example is:
2644 # @missing: 0000..10FFFF; ; NaN
2645 # Remove them. Process starting from the top so the
2646 # splice doesn't affect things still to be looked at.
2647 for (my $i = @defaults - 1; $i >= 0; $i--) {
2648 next if $defaults[$i] ne "";
2649 splice @defaults, $i, 1;
2652 # What's left should be just the property (maybe) and the
2653 # default. Having only one element means it doesn't have
2657 if (@defaults >= 1) {
2658 if (@defaults == 1) {
2659 $default = $defaults[0];
2662 $property = $defaults[0];
2663 $default = $defaults[1];
2669 || ($default =~ /^</
2670 && $default !~ /^<code *point>$/i
2671 && $default !~ /^<none>$/i
2672 && $default !~ /^<script>$/i))
2674 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2678 # If the property is missing from the line, it should
2679 # be the one for the whole file
2680 $property = $property{$addr} if ! defined $property;
2682 # Change <none> to the null string, which is what it
2683 # really means. If the default is the code point
2684 # itself, set it to <code point>, which is what
2685 # Unicode uses (but sometimes they've forgotten the
2687 if ($default =~ /^<none>$/i) {
2690 elsif ($default =~ /^<code *point>$/i) {
2691 $default = $CODE_POINT;
2693 elsif ($default =~ /^<script>$/i) {
2695 # Special case this one. Currently is from
2696 # ScriptExtensions.txt, and means for all unlisted
2697 # code points, use their Script property values.
2698 # For the code points not listed in that file, the
2699 # default value is 'Unknown'.
2700 $default = "Unknown";
2703 # Store them as a sub-arrays with both components.
2704 push @{$missings{$addr}}, [ $default, $property ];
2708 # There is nothing for the caller to process on this comment
2713 # Remove comments and trailing space, and skip this line if the
2719 # Call any handlers for this line, and skip further processing of
2720 # the line if the handler sets the line to null.
2721 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2726 # Here the line is ok. return success.
2728 } # End of looping through lines.
2730 # If there is an EOF handler, call it (only once) and if it generates
2731 # more lines to process go back in the loop to handle them.
2732 if ($eof_handler{$addr}) {
2733 &{$eof_handler{$addr}}($self);
2734 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2735 goto LINE if $added_lines{$addr};
2738 # Return failure -- no more lines.
2743 sub _next_line_with_remapped_range {
2745 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2747 # like _next_line(), but for use on non-ASCII platforms. It sets $_
2748 # to be the next logical input line, if any. Returns non-zero if such
2749 # a line exists. 'logical' means that any lines that have been added
2750 # via insert_lines() will be returned in $_ before the file is read
2753 # The difference from _next_line() is that this remaps the Unicode
2754 # code points in the input to those of the native platform. Each
2755 # input line contains a single code point, or a single contiguous
2756 # range of them This routine splits each range into its individual
2757 # code points and caches them. It returns the cached values,
2758 # translated into their native equivalents, one at a time, for each
2759 # call, before reading the next line. Since native values can only be
2760 # a single byte wide, no translation is needed for code points above
2761 # 0xFF, and ranges that are entirely above that number are not split.
2762 # If an input line contains the range 254-1000, it would be split into
2763 # three elements: 254, 255, and 256-1000. (The downstream table
2764 # insertion code will sort and coalesce the individual code points
2765 # into appropriate ranges.)
2767 my $addr = do { no overloading; pack 'J', $self; };
2771 # Look in cache before reading the next line. Return any cached
2773 my $inserted = shift @{$remapped_lines{$addr}};
2774 if (defined $inserted) {
2775 trace $inserted if main::DEBUG && $to_trace;
2776 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
2777 trace $_ if main::DEBUG && $to_trace;
2781 # Get the next line.
2782 return 0 unless _next_line($self);
2784 # If there is a special handler for it, return the line,
2785 # untranslated. This should happen only for files that are
2786 # special, not being code-point related, such as property names.
2787 return 1 if $handler{$addr}
2788 != \&main::process_generic_property_file;
2790 my ($range, $property_name, $map, @remainder)
2791 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
2794 || ! defined $property_name
2795 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
2797 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored");
2801 my $high = (defined $2) ? hex $2 : $low;
2803 # If the input maps the range to another code point, remap the
2804 # target if it is between 0 and 255.
2807 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
2808 $tail = "$property_name; $map";
2809 $_ = "$range; $tail";
2812 $tail = $property_name;
2815 # If entire range is above 255, just return it, unchanged (except
2816 # any mapped-to code point, already changed above)
2817 return 1 if $low > 255;
2819 # Cache an entry for every code point < 255. For those in the
2820 # range above 255, return a dummy entry for just that portion of
2821 # the range. Note that this will be out-of-order, but that is not
2823 foreach my $code_point ($low .. $high) {
2824 if ($code_point > 255) {
2825 $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
2828 push @{$remapped_lines{$addr}}, "$code_point; $tail";
2830 } # End of looping through lines.
2835 # Not currently used, not fully tested.
2837 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2838 # # record. Not callable from an each_line_handler(), nor does it call
2839 # # an each_line_handler() on the line.
2842 # my $addr = do { no overloading; pack 'J', $self; };
2844 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2845 # my ($adjusted, $line) = @{$inserted_ref};
2846 # next if $adjusted;
2848 # # Remove comments and trailing space, and return a non-empty
2851 # $line =~ s/\s+$//;
2852 # return $line if $line ne "";
2855 # return if ! ref $handle{$addr}; # Don't read unless is real file
2856 # while (1) { # Loop until find non-comment, non-empty line
2857 # local $to_trace = 1 if main::DEBUG;
2858 # trace $_ if main::DEBUG && $to_trace;
2859 # return if ! defined (my $line = readline $handle{$addr});
2861 # push @{$added_lines{$addr}}, [ 0, $line ];
2864 # $line =~ s/\s+$//;
2865 # return $line if $line ne "";
2873 # Lines can be inserted so that it looks like they were in the input
2874 # file at the place it was when this routine is called. See also
2875 # insert_adjusted_lines(). Lines inserted via this routine go through
2876 # any each_line_handler()
2880 # Each inserted line is an array, with the first element being 0 to
2881 # indicate that this line hasn't been adjusted, and needs to be
2884 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2888 sub insert_adjusted_lines {
2889 # Lines can be inserted so that it looks like they were in the input
2890 # file at the place it was when this routine is called. See also
2891 # insert_lines(). Lines inserted via this routine are already fully
2892 # adjusted, ready to be processed; each_line_handler()s handlers will
2893 # not be called. This means this is not a completely general
2894 # facility, as only the last each_line_handler on the stack should
2895 # call this. It could be made more general, by passing to each of the
2896 # line_handlers their position on the stack, which they would pass on
2897 # to this routine, and that would replace the boolean first element in
2898 # the anonymous array pushed here, so that the next_line routine could
2899 # use that to call only those handlers whose index is after it on the
2900 # stack. But this is overkill for what is needed now.
2903 trace $_[0] if main::DEBUG && $to_trace;
2905 # Each inserted line is an array, with the first element being 1 to
2906 # indicate that this line has been adjusted
2908 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2913 # Returns the stored up @missings lines' values, and clears the list.
2914 # The values are in an array, consisting of the default in the first
2915 # element, and the property in the 2nd. However, since these lines
2916 # can be stacked up, the return is an array of all these arrays.
2919 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2921 my $addr = do { no overloading; pack 'J', $self; };
2923 # If not accepting a list return, just return the first one.
2924 return shift @{$missings{$addr}} unless wantarray;
2926 my @return = @{$missings{$addr}};
2927 undef @{$missings{$addr}};
2931 sub _insert_property_into_line {
2932 # Add a property field to $_, if this file requires it.
2935 my $addr = do { no overloading; pack 'J', $self; };
2936 my $property = $property{$addr};
2937 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2939 $_ =~ s/(;|$)/; $property$1/;
2944 # Output consistent error messages, using either a generic one, or the
2945 # one given by the optional parameter. To avoid gazillions of the
2946 # same message in case the syntax of a file is way off, this routine
2947 # only outputs the first instance of each message, incrementing a
2948 # count so the totals can be output at the end of the file.
2951 my $message = shift;
2952 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2954 my $addr = do { no overloading; pack 'J', $self; };
2956 $message = 'Unexpected line' unless $message;
2958 # No trailing punctuation so as to fit with our addenda.
2959 $message =~ s/[.:;,]$//;
2961 # If haven't seen this exact message before, output it now. Otherwise
2962 # increment the count of how many times it has occurred
2963 unless ($errors{$addr}->{$message}) {
2964 Carp::my_carp("$message in '$_' in "
2966 . " at line $.. Skipping this line;");
2967 $errors{$addr}->{$message} = 1;
2970 $errors{$addr}->{$message}++;
2973 # Clear the line to prevent any further (meaningful) processing of it.
2980 package Multi_Default;
2982 # Certain properties in early versions of Unicode had more than one possible
2983 # default for code points missing from the files. In these cases, one
2984 # default applies to everything left over after all the others are applied,
2985 # and for each of the others, there is a description of which class of code
2986 # points applies to it. This object helps implement this by storing the
2987 # defaults, and for all but that final default, an eval string that generates
2988 # the class that it applies to.
2993 main::setup_package();
2996 # The defaults structure for the classes
2997 main::set_access('class_defaults', \%class_defaults);
3000 # The default that applies to everything left over.
3001 main::set_access('other_default', \%other_default, 'r');
3005 # The constructor is called with default => eval pairs, terminated by
3006 # the left-over default. e.g.
3007 # Multi_Default->new(
3008 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3010 # 'R' => 'some other expression that evaluates to code points',
3018 my $self = bless \do{my $anonymous_scalar}, $class;
3019 my $addr = do { no overloading; pack 'J', $self; };
3022 my $default = shift;
3024 $class_defaults{$addr}->{$default} = $eval;
3027 $other_default{$addr} = shift;
3032 sub get_next_defaults {
3033 # Iterates and returns the next class of defaults.
3035 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3037 my $addr = do { no overloading; pack 'J', $self; };
3039 return each %{$class_defaults{$addr}};
3045 # An alias is one of the names that a table goes by. This class defines them
3046 # including some attributes. Everything is currently setup in the
3052 main::setup_package();
3055 main::set_access('name', \%name, 'r');
3058 # Should this name match loosely or not.
3059 main::set_access('loose_match', \%loose_match, 'r');
3061 my %make_re_pod_entry;
3062 # Some aliases should not get their own entries in the re section of the
3063 # pod, because they are covered by a wild-card, and some we want to
3064 # discourage use of. Binary
3065 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3068 # Is this documented to be accessible via Unicode::UCD
3069 main::set_access('ucd', \%ucd, 'r', 's');
3072 # Aliases have a status, like deprecated, or even suppressed (which means
3073 # they don't appear in documentation). Enum
3074 main::set_access('status', \%status, 'r');
3077 # Similarly, some aliases should not be considered as usable ones for
3078 # external use, such as file names, or we don't want documentation to
3079 # recommend them. Boolean
3080 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3085 my $self = bless \do { my $anonymous_scalar }, $class;
3086 my $addr = do { no overloading; pack 'J', $self; };
3088 $name{$addr} = shift;
3089 $loose_match{$addr} = shift;
3090 $make_re_pod_entry{$addr} = shift;
3091 $ok_as_filename{$addr} = shift;
3092 $status{$addr} = shift;
3093 $ucd{$addr} = shift;
3095 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3097 # Null names are never ok externally
3098 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3106 # A range is the basic unit for storing code points, and is described in the
3107 # comments at the beginning of the program. Each range has a starting code
3108 # point; an ending code point (not less than the starting one); a value
3109 # that applies to every code point in between the two end-points, inclusive;
3110 # and an enum type that applies to the value. The type is for the user's
3111 # convenience, and has no meaning here, except that a non-zero type is
3112 # considered to not obey the normal Unicode rules for having standard forms.
3114 # The same structure is used for both map and match tables, even though in the
3115 # latter, the value (and hence type) is irrelevant and could be used as a
3116 # comment. In map tables, the value is what all the code points in the range
3117 # map to. Type 0 values have the standardized version of the value stored as
3118 # well, so as to not have to recalculate it a lot.
3120 sub trace { return main::trace(@_); }
3124 main::setup_package();
3127 main::set_access('start', \%start, 'r', 's');
3130 main::set_access('end', \%end, 'r', 's');
3133 main::set_access('value', \%value, 'r');
3136 main::set_access('type', \%type, 'r');
3139 # The value in internal standard form. Defined only if the type is 0.
3140 main::set_access('standard_form', \%standard_form);
3142 # Note that if these fields change, the dump() method should as well
3145 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3148 my $self = bless \do { my $anonymous_scalar }, $class;
3149 my $addr = do { no overloading; pack 'J', $self; };
3151 $start{$addr} = shift;
3152 $end{$addr} = shift;
3156 my $value = delete $args{'Value'}; # Can be 0
3157 $value = "" unless defined $value;
3158 $value{$addr} = $value;
3160 $type{$addr} = delete $args{'Type'} || 0;
3162 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3169 qw("") => "_operator_stringify",
3170 "." => \&main::_operator_dot,
3171 ".=" => \&main::_operator_dot_equal,
3174 sub _operator_stringify {
3176 my $addr = do { no overloading; pack 'J', $self; };
3178 # Output it like '0041..0065 (value)'
3179 my $return = sprintf("%04X", $start{$addr})
3181 . sprintf("%04X", $end{$addr});
3182 my $value = $value{$addr};
3183 my $type = $type{$addr};
3185 $return .= "$value";
3186 $return .= ", Type=$type" if $type != 0;
3193 # Calculate the standard form only if needed, and cache the result.
3194 # The standard form is the value itself if the type is special.
3195 # This represents a considerable CPU and memory saving - at the time
3196 # of writing there are 368676 non-special objects, but the standard
3197 # form is only requested for 22047 of them - ie about 6%.
3200 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3202 my $addr = do { no overloading; pack 'J', $self; };
3204 return $standard_form{$addr} if defined $standard_form{$addr};
3206 my $value = $value{$addr};
3207 return $value if $type{$addr};
3208 return $standard_form{$addr} = main::standardize($value);
3212 # Human, not machine readable. For machine readable, comment out this
3213 # entire routine and let the standard one take effect.
3216 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3218 my $addr = do { no overloading; pack 'J', $self; };
3220 my $return = $indent
3221 . sprintf("%04X", $start{$addr})
3223 . sprintf("%04X", $end{$addr})
3224 . " '$value{$addr}';";
3225 if (! defined $standard_form{$addr}) {
3226 $return .= "(type=$type{$addr})";
3228 elsif ($standard_form{$addr} ne $value{$addr}) {
3229 $return .= "(standard '$standard_form{$addr}')";
3235 package _Range_List_Base;
3237 # Base class for range lists. A range list is simply an ordered list of
3238 # ranges, so that the ranges with the lowest starting numbers are first in it.
3240 # When a new range is added that is adjacent to an existing range that has the
3241 # same value and type, it merges with it to form a larger range.
3243 # Ranges generally do not overlap, except that there can be multiple entries
3244 # of single code point ranges. This is because of NameAliases.txt.
3246 # In this program, there is a standard value such that if two different
3247 # values, have the same standard value, they are considered equivalent. This
3248 # value was chosen so that it gives correct results on Unicode data
3250 # There are a number of methods to manipulate range lists, and some operators
3251 # are overloaded to handle them.
3253 sub trace { return main::trace(@_); }
3259 # Max is initialized to a negative value that isn't adjacent to 0, for
3263 main::setup_package();
3266 # The list of ranges
3267 main::set_access('ranges', \%ranges, 'readable_array');
3270 # The highest code point in the list. This was originally a method, but
3271 # actual measurements said it was used a lot.
3272 main::set_access('max', \%max, 'r');
3274 my %each_range_iterator;
3275 # Iterator position for each_range()
3276 main::set_access('each_range_iterator', \%each_range_iterator);
3279 # Name of parent this is attached to, if any. Solely for better error
3281 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3283 my %_search_ranges_cache;
3284 # A cache of the previous result from _search_ranges(), for better
3286 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3292 # Optional initialization data for the range list.
3293 my $initialize = delete $args{'Initialize'};
3297 # Use _union() to initialize. _union() returns an object of this
3298 # class, which means that it will call this constructor recursively.
3299 # But it won't have this $initialize parameter so that it won't
3300 # infinitely loop on this.
3301 return _union($class, $initialize, %args) if defined $initialize;
3303 $self = bless \do { my $anonymous_scalar }, $class;
3304 my $addr = do { no overloading; pack 'J', $self; };
3306 # Optional parent object, only for debug info.
3307 $owner_name_of{$addr} = delete $args{'Owner'};
3308 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3310 # Stringify, in case it is an object.
3311 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3313 # This is used only for error messages, and so a colon is added
3314 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3316 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3318 $max{$addr} = $max_init;
3320 $_search_ranges_cache{$addr} = 0;
3321 $ranges{$addr} = [];
3328 qw("") => "_operator_stringify",
3329 "." => \&main::_operator_dot,
3330 ".=" => \&main::_operator_dot_equal,
3333 sub _operator_stringify {
3335 my $addr = do { no overloading; pack 'J', $self; };
3337 return "Range_List attached to '$owner_name_of{$addr}'"
3338 if $owner_name_of{$addr};
3339 return "anonymous Range_List " . \$self;
3343 # Returns the union of the input code points. It can be called as
3344 # either a constructor or a method. If called as a method, the result
3345 # will be a new() instance of the calling object, containing the union
3346 # of that object with the other parameter's code points; if called as
3347 # a constructor, the first parameter gives the class that the new object
3348 # should be, and the second parameter gives the code points to go into
3350 # In either case, there are two parameters looked at by this routine;
3351 # any additional parameters are passed to the new() constructor.
3353 # The code points can come in the form of some object that contains
3354 # ranges, and has a conventionally named method to access them; or
3355 # they can be an array of individual code points (as integers); or
3356 # just a single code point.
3358 # If they are ranges, this routine doesn't make any effort to preserve
3359 # the range values and types of one input over the other. Therefore
3360 # this base class should not allow _union to be called from other than
3361 # initialization code, so as to prevent two tables from being added
3362 # together where the range values matter. The general form of this
3363 # routine therefore belongs in a derived class, but it was moved here
3364 # to avoid duplication of code. The failure to overload this in this
3365 # class keeps it safe.
3367 # It does make the effort during initialization to accept tables with
3368 # multiple values for the same code point, and to preserve the order
3369 # of these. If there is only one input range or range set, it doesn't
3370 # sort (as it should already be sorted to the desired order), and will
3371 # accept multiple values per code point. Otherwise it will merge
3372 # multiple values into a single one.
3375 my @args; # Arguments to pass to the constructor
3379 # If a method call, will start the union with the object itself, and
3380 # the class of the new object will be the same as self.
3387 # Add the other required parameter.
3389 # Rest of parameters are passed on to the constructor
3391 # Accumulate all records from both lists.
3393 my $input_count = 0;
3394 for my $arg (@args) {
3395 #local $to_trace = 0 if main::DEBUG;
3396 trace "argument = $arg" if main::DEBUG && $to_trace;
3397 if (! defined $arg) {
3399 if (defined $self) {
3401 $message .= $owner_name_of{pack 'J', $self};
3403 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
3407 $arg = [ $arg ] if ! ref $arg;
3408 my $type = ref $arg;
3409 if ($type eq 'ARRAY') {
3410 foreach my $element (@$arg) {
3411 push @records, Range->new($element, $element);
3415 elsif ($arg->isa('Range')) {
3416 push @records, $arg;
3419 elsif ($arg->can('ranges')) {
3420 push @records, $arg->ranges;
3425 if (defined $self) {
3427 $message .= $owner_name_of{pack 'J', $self};
3429 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3434 # Sort with the range containing the lowest ordinal first, but if
3435 # two ranges start at the same code point, sort with the bigger range
3436 # of the two first, because it takes fewer cycles.
3437 if ($input_count > 1) {
3438 @records = sort { ($a->start <=> $b->start)
3440 # if b is shorter than a, b->end will be
3441 # less than a->end, and we want to select
3442 # a, so want to return -1
3443 ($b->end <=> $a->end)
3447 my $new = $class->new(@_);
3449 # Fold in records so long as they add new information.
3450 for my $set (@records) {
3451 my $start = $set->start;
3452 my $end = $set->end;
3453 my $value = $set->value;
3454 my $type = $set->type;
3455 if ($start > $new->max) {
3456 $new->_add_delete('+', $start, $end, $value, Type => $type);
3458 elsif ($end > $new->max) {
3459 $new->_add_delete('+', $new->max +1, $end, $value,
3462 elsif ($input_count == 1) {
3463 # Here, overlaps existing range, but is from a single input,
3464 # so preserve the multiple values from that input.
3465 $new->_add_delete('+', $start, $end, $value, Type => $type,
3466 Replace => $MULTIPLE_AFTER);
3473 sub range_count { # Return the number of ranges in the range list
3475 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3478 return scalar @{$ranges{pack 'J', $self}};
3482 # Returns the minimum code point currently in the range list, or if
3483 # the range list is empty, 2 beyond the max possible. This is a
3484 # method because used so rarely, that not worth saving between calls,
3485 # and having to worry about changing it as ranges are added and
3489 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3491 my $addr = do { no overloading; pack 'J', $self; };
3493 # If the range list is empty, return a large value that isn't adjacent
3494 # to any that could be in the range list, for simpler tests
3495 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3496 return $ranges{$addr}->[0]->start;
3500 # Boolean: Is argument in the range list? If so returns $i such that:
3501 # range[$i]->end < $codepoint <= range[$i+1]->end
3502 # which is one beyond what you want; this is so that the 0th range
3503 # doesn't return false
3505 my $codepoint = shift;
3506 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3508 my $i = $self->_search_ranges($codepoint);
3509 return 0 unless defined $i;
3511 # The search returns $i, such that
3512 # range[$i-1]->end < $codepoint <= range[$i]->end
3513 # So is in the table if and only iff it is at least the start position
3516 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3520 sub containing_range {
3521 # Returns the range object that contains the code point, undef if none
3524 my $codepoint = shift;
3525 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3527 my $i = $self->contains($codepoint);
3530 # contains() returns 1 beyond where we should look
3532 return $ranges{pack 'J', $self}->[$i-1];
3536 # Returns the value associated with the code point, undef if none
3539 my $codepoint = shift;
3540 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3542 my $range = $self->containing_range($codepoint);
3543 return unless defined $range;
3545 return $range->value;
3549 # Returns the type of the range containing the code point, undef if
3550 # the code point is not in the table
3553 my $codepoint = shift;
3554 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3556 my $range = $self->containing_range($codepoint);
3557 return unless defined $range;
3559 return $range->type;
3562 sub _search_ranges {
3563 # Find the range in the list which contains a code point, or where it
3564 # should go if were to add it. That is, it returns $i, such that:
3565 # range[$i-1]->end < $codepoint <= range[$i]->end
3566 # Returns undef if no such $i is possible (e.g. at end of table), or
3567 # if there is an error.
3570 my $code_point = shift;
3571 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3573 my $addr = do { no overloading; pack 'J', $self; };
3575 return if $code_point > $max{$addr};
3576 my $r = $ranges{$addr}; # The current list of ranges
3577 my $range_list_size = scalar @$r;
3580 use integer; # want integer division
3582 # Use the cached result as the starting guess for this one, because,
3583 # an experiment on 5.1 showed that 90% of the time the cache was the
3584 # same as the result on the next call (and 7% it was one less).
3585 $i = $_search_ranges_cache{$addr};
3586 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3587 # from an intervening deletion
3588 #local $to_trace = 1 if main::DEBUG;
3589 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3590 return $i if $code_point <= $r->[$i]->end
3591 && ($i == 0 || $r->[$i-1]->end < $code_point);
3593 # Here the cache doesn't yield the correct $i. Try adding 1.
3594 if ($i < $range_list_size - 1
3595 && $r->[$i]->end < $code_point &&
3596 $code_point <= $r->[$i+1]->end)
3599 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3600 $_search_ranges_cache{$addr} = $i;
3604 # Here, adding 1 also didn't work. We do a binary search to
3605 # find the correct position, starting with current $i
3607 my $upper = $range_list_size - 1;
3609 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3611 if ($code_point <= $r->[$i]->end) {
3613 # Here we have met the upper constraint. We can quit if we
3614 # also meet the lower one.
3615 last if $i == 0 || $r->[$i-1]->end < $code_point;
3617 $upper = $i; # Still too high.
3622 # Here, $r[$i]->end < $code_point, so look higher up.
3626 # Split search domain in half to try again.
3627 my $temp = ($upper + $lower) / 2;
3629 # No point in continuing unless $i changes for next time
3633 # We can't reach the highest element because of the averaging.
3634 # So if one below the upper edge, force it there and try one
3636 if ($i == $range_list_size - 2) {
3638 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3639 $i = $range_list_size - 1;
3641 # Change $lower as well so if fails next time through,
3642 # taking the average will yield the same $i, and we will
3643 # quit with the error message just below.
3647 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3651 } # End of while loop
3653 if (main::DEBUG && $to_trace) {
3654 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3655 trace "i= [ $i ]", $r->[$i];
3656 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3659 # Here we have found the offset. Cache it as a starting point for the
3661 $_search_ranges_cache{$addr} = $i;
3666 # Add, replace or delete ranges to or from a list. The $type
3667 # parameter gives which:
3668 # '+' => insert or replace a range, returning a list of any changed
3670 # '-' => delete a range, returning a list of any deleted ranges.
3672 # The next three parameters give respectively the start, end, and
3673 # value associated with the range. 'value' should be null unless the
3676 # The range list is kept sorted so that the range with the lowest
3677 # starting position is first in the list, and generally, adjacent
3678 # ranges with the same values are merged into a single larger one (see
3679 # exceptions below).
3681 # There are more parameters; all are key => value pairs:
3682 # Type gives the type of the value. It is only valid for '+'.
3683 # All ranges have types; if this parameter is omitted, 0 is
3684 # assumed. Ranges with type 0 are assumed to obey the
3685 # Unicode rules for casing, etc; ranges with other types are
3686 # not. Otherwise, the type is arbitrary, for the caller's
3687 # convenience, and looked at only by this routine to keep
3688 # adjacent ranges of different types from being merged into
3689 # a single larger range, and when Replace =>
3690 # $IF_NOT_EQUIVALENT is specified (see just below).
3691 # Replace determines what to do if the range list already contains
3692 # ranges which coincide with all or portions of the input
3693 # range. It is only valid for '+':
3694 # => $NO means that the new value is not to replace
3695 # any existing ones, but any empty gaps of the
3696 # range list coinciding with the input range
3697 # will be filled in with the new value.
3698 # => $UNCONDITIONALLY means to replace the existing values with
3699 # this one unconditionally. However, if the
3700 # new and old values are identical, the
3701 # replacement is skipped to save cycles
3702 # => $IF_NOT_EQUIVALENT means to replace the existing values
3703 # (the default) with this one if they are not equivalent.
3704 # Ranges are equivalent if their types are the
3705 # same, and they are the same string; or if
3706 # both are type 0 ranges, if their Unicode
3707 # standard forms are identical. In this last
3708 # case, the routine chooses the more "modern"
3709 # one to use. This is because some of the
3710 # older files are formatted with values that
3711 # are, for example, ALL CAPs, whereas the
3712 # derived files have a more modern style,
3713 # which looks better. By looking for this
3714 # style when the pre-existing and replacement
3715 # standard forms are the same, we can move to
3717 # => $MULTIPLE_BEFORE means that if this range duplicates an
3718 # existing one, but has a different value,
3719 # don't replace the existing one, but insert
3720 # this, one so that the same range can occur
3721 # multiple times. They are stored LIFO, so
3722 # that the final one inserted is the first one
3723 # returned in an ordered search of the table.
3724 # If this is an exact duplicate, including the
3725 # value, the original will be moved to be
3726 # first, before any other duplicate ranges
3727 # with different values.
3728 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3729 # FIFO, so that this one is inserted after all
3730 # others that currently exist. If this is an
3731 # exact duplicate, including value, of an
3732 # existing range, this one is discarded
3733 # (leaving the existing one in its original,
3734 # higher priority position
3735 # => anything else is the same as => $IF_NOT_EQUIVALENT
3737 # "same value" means identical for non-type-0 ranges, and it means
3738 # having the same standard forms for type-0 ranges.
3740 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3743 my $operation = shift; # '+' for add/replace; '-' for delete;
3750 $value = "" if not defined $value; # warning: $value can be "0"
3752 my $replace = delete $args{'Replace'};
3753 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3755 my $type = delete $args{'Type'};
3756 $type = 0 unless defined $type;
3758 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3760 my $addr = do { no overloading; pack 'J', $self; };
3762 if ($operation ne '+' && $operation ne '-') {
3763 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3766 unless (defined $start && defined $end) {
3767 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3770 unless ($end >= $start) {
3771 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3774 #local $to_trace = 1 if main::DEBUG;
3776 if ($operation eq '-') {
3777 if ($replace != $IF_NOT_EQUIVALENT) {
3778 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3779 $replace = $IF_NOT_EQUIVALENT;
3782 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3786 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3791 my $r = $ranges{$addr}; # The current list of ranges
3792 my $range_list_size = scalar @$r; # And its size
3793 my $max = $max{$addr}; # The current high code point in
3794 # the list of ranges
3796 # Do a special case requiring fewer machine cycles when the new range
3797 # starts after the current highest point. The Unicode input data is
3798 # structured so this is common.
3799 if ($start > $max) {
3801 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3802 return if $operation eq '-'; # Deleting a non-existing range is a
3805 # If the new range doesn't logically extend the current final one
3806 # in the range list, create a new range at the end of the range
3807 # list. (max cleverly is initialized to a negative number not
3808 # adjacent to 0 if the range list is empty, so even adding a range
3809 # to an empty range list starting at 0 will have this 'if'
3811 if ($start > $max + 1 # non-adjacent means can't extend.
3812 || @{$r}[-1]->value ne $value # values differ, can't extend.
3813 || @{$r}[-1]->type != $type # types differ, can't extend.
3815 push @$r, Range->new($start, $end,
3821 # Here, the new range starts just after the current highest in
3822 # the range list, and they have the same type and value.
3823 # Extend the current range to incorporate the new one.
3824 @{$r}[-1]->set_end($end);
3827 # This becomes the new maximum.
3832 #local $to_trace = 0 if main::DEBUG;
3834 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3836 # Here, the input range isn't after the whole rest of the range list.
3837 # Most likely 'splice' will be needed. The rest of the routine finds
3838 # the needed splice parameters, and if necessary, does the splice.
3839 # First, find the offset parameter needed by the splice function for
3840 # the input range. Note that the input range may span multiple
3841 # existing ones, but we'll worry about that later. For now, just find
3842 # the beginning. If the input range is to be inserted starting in a
3843 # position not currently in the range list, it must (obviously) come
3844 # just after the range below it, and just before the range above it.
3845 # Slightly less obviously, it will occupy the position currently
3846 # occupied by the range that is to come after it. More formally, we
3847 # are looking for the position, $i, in the array of ranges, such that:
3849 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3851 # (The ordered relationships within existing ranges are also shown in
3852 # the equation above). However, if the start of the input range is
3853 # within an existing range, the splice offset should point to that
3854 # existing range's position in the list; that is $i satisfies a
3855 # somewhat different equation, namely:
3857 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3859 # More briefly, $start can come before or after r[$i]->start, and at
3860 # this point, we don't know which it will be. However, these
3861 # two equations share these constraints:
3863 # r[$i-1]->end < $start <= r[$i]->end
3865 # And that is good enough to find $i.
3867 my $i = $self->_search_ranges($start);
3869 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3873 # The search function returns $i such that:
3875 # r[$i-1]->end < $start <= r[$i]->end
3877 # That means that $i points to the first range in the range list
3878 # that could possibly be affected by this operation. We still don't
3879 # know if the start of the input range is within r[$i], or if it
3880 # points to empty space between r[$i-1] and r[$i].
3881 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3883 # Special case the insertion of data that is not to replace any
3885 if ($replace == $NO) { # If $NO, has to be operation '+'
3886 #local $to_trace = 1 if main::DEBUG;
3887 trace "Doesn't replace" if main::DEBUG && $to_trace;
3889 # Here, the new range is to take effect only on those code points
3890 # that aren't already in an existing range. This can be done by
3891 # looking through the existing range list and finding the gaps in
3892 # the ranges that this new range affects, and then calling this
3893 # function recursively on each of those gaps, leaving untouched
3894 # anything already in the list. Gather up a list of the changed
3895 # gaps first so that changes to the internal state as new ranges
3896 # are added won't be a problem.
3899 # First, if the starting point of the input range is outside an
3900 # existing one, there is a gap from there to the beginning of the
3901 # existing range -- add a span to fill the part that this new
3903 if ($start < $r->[$i]->start) {
3904 push @gap_list, Range->new($start,
3906 $r->[$i]->start - 1),
3908 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3911 # Then look through the range list for other gaps until we reach
3912 # the highest range affected by the input one.
3914 for ($j = $i+1; $j < $range_list_size; $j++) {
3915 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3916 last if $end < $r->[$j]->start;
3918 # If there is a gap between when this range starts and the
3919 # previous one ends, add a span to fill it. Note that just
3920 # because there are two ranges doesn't mean there is a
3921 # non-zero gap between them. It could be that they have
3922 # different values or types
3923 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3925 Range->new($r->[$j-1]->end + 1,
3926 $r->[$j]->start - 1,
3928 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3932 # Here, we have either found an existing range in the range list,
3933 # beyond the area affected by the input one, or we fell off the
3934 # end of the loop because the input range affects the whole rest
3935 # of the range list. In either case, $j is 1 higher than the
3936 # highest affected range. If $j == $i, it means that there are no
3937 # affected ranges, that the entire insertion is in the gap between
3938 # r[$i-1], and r[$i], which we already have taken care of before
3940 # On the other hand, if there are affected ranges, it might be
3941 # that there is a gap that needs filling after the final such
3942 # range to the end of the input range
3943 if ($r->[$j-1]->end < $end) {
3944 push @gap_list, Range->new(main::max($start,
3945 $r->[$j-1]->end + 1),
3948 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3951 # Call recursively to fill in all the gaps.
3952 foreach my $gap (@gap_list) {
3953 $self->_add_delete($operation,
3963 # Here, we have taken care of the case where $replace is $NO.
3964 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3965 # If inserting a multiple record, this is where it goes, before the
3966 # first (if any) existing one if inserting LIFO. (If this is to go
3967 # afterwards, FIFO, we below move the pointer to there.) These imply
3968 # an insertion, and no change to any existing ranges. Note that $i
3969 # can be -1 if this new range doesn't actually duplicate any existing,
3970 # and comes at the beginning of the list.
3971 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3973 if ($start != $end) {
3974 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3978 # If the new code point is within a current range ...
3979 if ($end >= $r->[$i]->start) {
3981 # Don't add an exact duplicate, as it isn't really a multiple
3982 my $existing_value = $r->[$i]->value;
3983 my $existing_type = $r->[$i]->type;
3984 return if $value eq $existing_value && $type eq $existing_type;
3986 # If the multiple value is part of an existing range, we want
3987 # to split up that range, so that only the single code point
3988 # is affected. To do this, we first call ourselves
3989 # recursively to delete that code point from the table, having
3990 # preserved its current data above. Then we call ourselves
3991 # recursively again to add the new multiple, which we know by
3992 # the test just above is different than the current code
3993 # point's value, so it will become a range containing a single
3994 # code point: just itself. Finally, we add back in the
3995 # pre-existing code point, which will again be a single code
3996 # point range. Because 'i' likely will have changed as a
3997 # result of these operations, we can't just continue on, but
3998 # do this operation recursively as well. If we are inserting
3999 # LIFO, the pre-existing code point needs to go after the new
4000 # one, so use MULTIPLE_AFTER; and vice versa.
4001 if ($r->[$i]->start != $r->[$i]->end) {
4002 $self->_add_delete('-', $start, $end, "");
4003 $self->_add_delete('+', $start, $end, $value, Type => $type);
4004 return $self->_add_delete('+',
4007 Type => $existing_type,
4008 Replace => ($replace == $MULTIPLE_BEFORE)
4010 : $MULTIPLE_BEFORE);
4014 # If to place this new record after, move to beyond all existing
4015 # ones; but don't add this one if identical to any of them, as it
4016 # isn't really a multiple. This leaves the original order, so
4017 # that the current request is ignored. The reasoning is that the
4018 # previous request that wanted this record to have high priority
4019 # should have precedence.
4020 if ($replace == $MULTIPLE_AFTER) {
4021 while ($i < @$r && $r->[$i]->start == $start) {
4022 return if $value eq $r->[$i]->value
4023 && $type eq $r->[$i]->type;
4028 # If instead we are to place this new record before any
4029 # existing ones, remove any identical ones that come after it.
4030 # This changes the existing order so that the new one is
4031 # first, as is being requested.
4032 for (my $j = $i + 1;
4033 $j < @$r && $r->[$j]->start == $start;
4036 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4038 last; # There should only be one instance, so no
4039 # need to keep looking
4044 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4045 my @return = splice @$r,
4052 if (main::DEBUG && $to_trace) {
4053 trace "After splice:";
4054 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4055 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4056 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
4057 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4058 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4059 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4064 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
4065 # leaves delete, insert, and replace either unconditionally or if not
4066 # equivalent. $i still points to the first potential affected range.
4067 # Now find the highest range affected, which will determine the length
4068 # parameter to splice. (The input range can span multiple existing
4069 # ones.) If this isn't a deletion, while we are looking through the
4070 # range list, see also if this is a replacement rather than a clean
4071 # insertion; that is if it will change the values of at least one
4072 # existing range. Start off assuming it is an insert, until find it
4074 my $clean_insert = $operation eq '+';
4075 my $j; # This will point to the highest affected range
4077 # For non-zero types, the standard form is the value itself;
4078 my $standard_form = ($type) ? $value : main::standardize($value);
4080 for ($j = $i; $j < $range_list_size; $j++) {
4081 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4083 # If find a range that it doesn't overlap into, we can stop
4085 last if $end < $r->[$j]->start;
4087 # Here, overlaps the range at $j. If the values don't match,
4088 # and so far we think this is a clean insertion, it becomes a
4089 # non-clean insertion, i.e., a 'change' or 'replace' instead.
4090 if ($clean_insert) {
4091 if ($r->[$j]->standard_form ne $standard_form) {
4093 if ($replace == $CROAK) {
4094 main::croak("The range to add "
4095 . sprintf("%04X", $start)
4097 . sprintf("%04X", $end)
4098 . " with value '$value' overlaps an existing range $r->[$j]");
4103 # Here, the two values are essentially the same. If the
4104 # two are actually identical, replacing wouldn't change
4105 # anything so skip it.
4106 my $pre_existing = $r->[$j]->value;
4107 if ($pre_existing ne $value) {
4109 # Here the new and old standardized values are the
4110 # same, but the non-standardized values aren't. If
4111 # replacing unconditionally, then replace
4112 if( $replace == $UNCONDITIONALLY) {
4117 # Here, are replacing conditionally. Decide to
4118 # replace or not based on which appears to look
4119 # the "nicest". If one is mixed case and the
4120 # other isn't, choose the mixed case one.
4121 my $new_mixed = $value =~ /[A-Z]/
4122 && $value =~ /[a-z]/;
4123 my $old_mixed = $pre_existing =~ /[A-Z]/
4124 && $pre_existing =~ /[a-z]/;
4126 if ($old_mixed != $new_mixed) {
4127 $clean_insert = 0 if $new_mixed;
4128 if (main::DEBUG && $to_trace) {
4129 if ($clean_insert) {
4130 trace "Retaining $pre_existing over $value";
4133 trace "Replacing $pre_existing with $value";
4139 # Here casing wasn't different between the two.
4140 # If one has hyphens or underscores and the
4141 # other doesn't, choose the one with the
4143 my $new_punct = $value =~ /[-_]/;
4144 my $old_punct = $pre_existing =~ /[-_]/;
4146 if ($old_punct != $new_punct) {
4147 $clean_insert = 0 if $new_punct;
4148 if (main::DEBUG && $to_trace) {
4149 if ($clean_insert) {
4150 trace "Retaining $pre_existing over $value";
4153 trace "Replacing $pre_existing with $value";
4156 } # else existing one is just as "good";
4157 # retain it to save cycles.
4163 } # End of loop looking for highest affected range.
4165 # Here, $j points to one beyond the highest range that this insertion
4166 # affects (hence to beyond the range list if that range is the final
4167 # one in the range list).
4169 # The splice length is all the affected ranges. Get it before
4170 # subtracting, for efficiency, so we don't have to later add 1.
4171 my $length = $j - $i;
4173 $j--; # $j now points to the highest affected range.
4174 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4176 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4177 # $j points to the highest affected range. But it can be < $i or even
4178 # -1. These happen only if the insertion is entirely in the gap
4179 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
4180 # above exited first time through with $end < $r->[$i]->start. (And
4181 # then we subtracted one from j) This implies also that $start <
4182 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4183 # $start, so the entire input range is in the gap.
4186 # Here the entire input range is in the gap before $i.
4188 if (main::DEBUG && $to_trace) {
4190 trace "Entire range is between $r->[$i-1] and $r->[$i]";
4193 trace "Entire range is before $r->[$i]";
4196 return if $operation ne '+'; # Deletion of a non-existent range is
4201 # Here part of the input range is not in the gap before $i. Thus,
4202 # there is at least one affected one, and $j points to the highest
4205 # At this point, here is the situation:
4206 # This is not an insertion of a multiple, nor of tentative ($NO)
4208 # $i points to the first element in the current range list that
4209 # may be affected by this operation. In fact, we know
4210 # that the range at $i is affected because we are in
4211 # the else branch of this 'if'
4212 # $j points to the highest affected range.
4214 # r[$i-1]->end < $start <= r[$i]->end
4216 # r[$i-1]->end < $start <= $end <= r[$j]->end
4219 # $clean_insert is a boolean which is set true if and only if
4220 # this is a "clean insertion", i.e., not a change nor a
4221 # deletion (multiple was handled above).
4223 # We now have enough information to decide if this call is a no-op
4224 # or not. It is a no-op if this is an insertion of already
4227 if (main::DEBUG && $to_trace && $clean_insert
4229 && $start >= $r->[$i]->start)
4233 return if $clean_insert
4234 && $i == $j # more than one affected range => not no-op
4236 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4237 # Further, $start and/or $end is >= r[$i]->start
4238 # The test below hence guarantees that
4239 # r[$i]->start < $start <= $end <= r[$i]->end
4240 # This means the input range is contained entirely in
4241 # the one at $i, so is a no-op
4242 && $start >= $r->[$i]->start;
4245 # Here, we know that some action will have to be taken. We have
4246 # calculated the offset and length (though adjustments may be needed)
4247 # for the splice. Now start constructing the replacement list.
4249 my $splice_start = $i;
4254 # See if should extend any adjacent ranges.
4255 if ($operation eq '-') { # Don't extend deletions
4256 $extends_below = $extends_above = 0;
4258 else { # Here, should extend any adjacent ranges. See if there are
4260 $extends_below = ($i > 0
4261 # can't extend unless adjacent
4262 && $r->[$i-1]->end == $start -1
4263 # can't extend unless are same standard value
4264 && $r->[$i-1]->standard_form eq $standard_form
4265 # can't extend unless share type
4266 && $r->[$i-1]->type == $type);
4267 $extends_above = ($j+1 < $range_list_size
4268 && $r->[$j+1]->start == $end +1
4269 && $r->[$j+1]->standard_form eq $standard_form
4270 && $r->[$j+1]->type == $type);
4272 if ($extends_below && $extends_above) { # Adds to both
4273 $splice_start--; # start replace at element below
4274 $length += 2; # will replace on both sides
4275 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4277 # The result will fill in any gap, replacing both sides, and
4278 # create one large range.
4279 @replacement = Range->new($r->[$i-1]->start,
4286 # Here we know that the result won't just be the conglomeration of
4287 # a new range with both its adjacent neighbors. But it could
4288 # extend one of them.
4290 if ($extends_below) {
4292 # Here the new element adds to the one below, but not to the
4293 # one above. If inserting, and only to that one range, can
4294 # just change its ending to include the new one.
4295 if ($length == 0 && $clean_insert) {
4296 $r->[$i-1]->set_end($end);
4297 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4301 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4302 $splice_start--; # start replace at element below
4303 $length++; # will replace the element below
4304 $start = $r->[$i-1]->start;
4307 elsif ($extends_above) {
4309 # Here the new element adds to the one above, but not below.
4310 # Mirror the code above
4311 if ($length == 0 && $clean_insert) {
4312 $r->[$j+1]->set_start($start);
4313 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4317 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4318 $length++; # will replace the element above
4319 $end = $r->[$j+1]->end;
4323 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4325 # Finally, here we know there will have to be a splice.
4326 # If the change or delete affects only the highest portion of the
4327 # first affected range, the range will have to be split. The
4328 # splice will remove the whole range, but will replace it by a new
4329 # range containing just the unaffected part. So, in this case,
4330 # add to the replacement list just this unaffected portion.
4331 if (! $extends_below
4332 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4335 Range->new($r->[$i]->start,
4337 Value => $r->[$i]->value,
4338 Type => $r->[$i]->type);
4341 # In the case of an insert or change, but not a delete, we have to
4342 # put in the new stuff; this comes next.
4343 if ($operation eq '+') {
4344 push @replacement, Range->new($start,
4350 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4351 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4353 # And finally, if we're changing or deleting only a portion of the
4354 # highest affected range, it must be split, as the lowest one was.
4355 if (! $extends_above
4356 && $j >= 0 # Remember that j can be -1 if before first
4358 && $end >= $r->[$j]->start
4359 && $end < $r->[$j]->end)
4362 Range->new($end + 1,
4364 Value => $r->[$j]->value,
4365 Type => $r->[$j]->type);
4369 # And do the splice, as calculated above
4370 if (main::DEBUG && $to_trace) {
4371 trace "replacing $length element(s) at $i with ";
4372 foreach my $replacement (@replacement) {
4373 trace " $replacement";
4375 trace "Before splice:";
4376 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4377 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4378 trace "i =[", $i, "]", $r->[$i];
4379 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4380 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4383 my @return = splice @$r, $splice_start, $length, @replacement;
4385 if (main::DEBUG && $to_trace) {
4386 trace "After splice:";
4387 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4388 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4389 trace "i =[", $i, "]", $r->[$i];
4390 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4391 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4392 trace "removed ", @return if @return;
4395 # An actual deletion could have changed the maximum in the list.
4396 # There was no deletion if the splice didn't return something, but
4397 # otherwise recalculate it. This is done too rarely to worry about
4399 if ($operation eq '-' && @return) {
4401 $max{$addr} = $r->[-1]->end;
4404 $max{$addr} = $max_init;
4410 sub reset_each_range { # reset the iterator for each_range();
4412 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4415 undef $each_range_iterator{pack 'J', $self};
4420 # Iterate over each range in a range list. Results are undefined if
4421 # the range list is changed during the iteration.
4424 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4426 my $addr = do { no overloading; pack 'J', $self; };
4428 return if $self->is_empty;
4430 $each_range_iterator{$addr} = -1
4431 if ! defined $each_range_iterator{$addr};
4432 $each_range_iterator{$addr}++;
4433 return $ranges{$addr}->[$each_range_iterator{$addr}]
4434 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4435 undef $each_range_iterator{$addr};
4439 sub count { # Returns count of code points in range list
4441 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4443 my $addr = do { no overloading; pack 'J', $self; };
4446 foreach my $range (@{$ranges{$addr}}) {
4447 $count += $range->end - $range->start + 1;
4452 sub delete_range { # Delete a range
4457 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4459 return $self->_add_delete('-', $start, $end, "");
4462 sub is_empty { # Returns boolean as to if a range list is empty
4464 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4467 return scalar @{$ranges{pack 'J', $self}} == 0;
4471 # Quickly returns a scalar suitable for separating tables into
4472 # buckets, i.e. it is a hash function of the contents of a table, so
4473 # there are relatively few conflicts.
4476 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4478 my $addr = do { no overloading; pack 'J', $self; };
4480 # These are quickly computable. Return looks like 'min..max;count'
4481 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4483 } # End closure for _Range_List_Base
4486 use parent '-norequire', '_Range_List_Base';
4488 # A Range_List is a range list for match tables; i.e. the range values are
4489 # not significant. Thus a number of operations can be safely added to it,
4490 # such as inversion, intersection. Note that union is also an unsafe
4491 # operation when range values are cared about, and that method is in the base
4492 # class, not here. But things are set up so that that method is callable only
4493 # during initialization. Only in this derived class, is there an operation
4494 # that combines two tables. A Range_Map can thus be used to initialize a
4495 # Range_List, and its mappings will be in the list, but are not significant to
4498 sub trace { return main::trace(@_); }
4504 '+' => sub { my $self = shift;
4507 return $self->_union($other)
4509 '+=' => sub { my $self = shift;
4511 my $reversed = shift;
4514 Carp::my_carp_bug("Bad news. Can't cope with '"
4518 . "'. undef returned.");
4522 return $self->_union($other)
4524 '&' => sub { my $self = shift;
4527 return $self->_intersect($other, 0);
4529 '&=' => sub { my $self = shift;
4531 my $reversed = shift;
4534 Carp::my_carp_bug("Bad news. Can't cope with '"
4538 . "'. undef returned.");
4542 return $self->_intersect($other, 0);
4549 # Returns a new Range_List that gives all code points not in $self.
4553 my $new = Range_List->new;
4555 # Go through each range in the table, finding the gaps between them
4556 my $max = -1; # Set so no gap before range beginning at 0
4557 for my $range ($self->ranges) {
4558 my $start = $range->start;
4559 my $end = $range->end;
4561 # If there is a gap before this range, the inverse will contain
4563 if ($start > $max + 1) {
4564 $new->add_range($max + 1, $start - 1);
4569 # And finally, add the gap from the end of the table to the max
4570 # possible code point
4571 if ($max < $MAX_WORKING_CODEPOINT) {
4572 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4578 # Returns a new Range_List with the argument deleted from it. The
4579 # argument can be a single code point, a range, or something that has
4580 # a range, with the _range_list() method on it returning them
4584 my $reversed = shift;
4585 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4588 Carp::my_carp_bug("Bad news. Can't cope with '"
4592 . "'. undef returned.");
4596 my $new = Range_List->new(Initialize => $self);
4598 if (! ref $other) { # Single code point
4599 $new->delete_range($other, $other);
4601 elsif ($other->isa('Range')) {
4602 $new->delete_range($other->start, $other->end);
4604 elsif ($other->can('_range_list')) {
4605 foreach my $range ($other->_range_list->ranges) {
4606 $new->delete_range($range->start, $range->end);
4610 Carp::my_carp_bug("Can't cope with a "
4612 . " argument to '-'. Subtraction ignored."
4621 # Returns either a boolean giving whether the two inputs' range lists
4622 # intersect (overlap), or a new Range_List containing the intersection
4623 # of the two lists. The optional final parameter being true indicates
4624 # to do the check instead of the intersection.
4626 my $a_object = shift;
4627 my $b_object = shift;
4628 my $check_if_overlapping = shift;
4629 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4630 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4632 if (! defined $b_object) {
4634 $message .= $a_object->_owner_name_of if defined $a_object;
4635 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4639 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4640 # Thus the intersection could be much more simply be written:
4641 # return ~(~$a_object + ~$b_object);
4642 # But, this is slower, and when taking the inverse of a large
4643 # range_size_1 table, back when such tables were always stored that
4644 # way, it became prohibitively slow, hence the code was changed to the
4647 if ($b_object->isa('Range')) {
4648 $b_object = Range_List->new(Initialize => $b_object,
4649 Owner => $a_object->_owner_name_of);
4651 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4653 my @a_ranges = $a_object->ranges;
4654 my @b_ranges = $b_object->ranges;
4656 #local $to_trace = 1 if main::DEBUG;
4657 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4659 # Start with the first range in each list
4661 my $range_a = $a_ranges[$a_i];
4663 my $range_b = $b_ranges[$b_i];
4665 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4666 if ! $check_if_overlapping;
4668 # If either list is empty, there is no intersection and no overlap
4669 if (! defined $range_a || ! defined $range_b) {
4670 return $check_if_overlapping ? 0 : $new;
4672 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4674 # Otherwise, must calculate the intersection/overlap. Start with the
4675 # very first code point in each list
4676 my $a = $range_a->start;
4677 my $b = $range_b->start;
4679 # Loop through all the ranges of each list; in each iteration, $a and
4680 # $b are the current code points in their respective lists
4683 # If $a and $b are the same code point, ...
4686 # it means the lists overlap. If just checking for overlap
4687 # know the answer now,
4688 return 1 if $check_if_overlapping;
4690 # The intersection includes this code point plus anything else
4691 # common to both current ranges.
4693 my $end = main::min($range_a->end, $range_b->end);
4694 if (! $check_if_overlapping) {
4695 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4696 $new->add_range($start, $end);
4699 # Skip ahead to the end of the current intersect
4702 # If the current intersect ends at the end of either range (as
4703 # it must for at least one of them), the next possible one
4704 # will be the beginning code point in it's list's next range.
4705 if ($a == $range_a->end) {
4706 $range_a = $a_ranges[++$a_i];
4707 last unless defined $range_a;
4708 $a = $range_a->start;
4710 if ($b == $range_b->end) {
4711 $range_b = $b_ranges[++$b_i];
4712 last unless defined $range_b;
4713 $b = $range_b->start;
4716 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4720 # Not equal, but if the range containing $a encompasses $b,
4721 # change $a to be the middle of the range where it does equal
4722 # $b, so the next iteration will get the intersection
4723 if ($range_a->end >= $b) {
4728 # Here, the current range containing $a is entirely below
4729 # $b. Go try to find a range that could contain $b.
4730 $a_i = $a_object->_search_ranges($b);
4732 # If no range found, quit.
4733 last unless defined $a_i;
4735 # The search returns $a_i, such that
4736 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4737 # Set $a to the beginning of this new range, and repeat.
4738 $range_a = $a_ranges[$a_i];
4739 $a = $range_a->start;
4742 else { # Here, $b < $a.
4744 # Mirror image code to the leg just above
4745 if ($range_b->end >= $a) {
4749 $b_i = $b_object->_search_ranges($a);
4750 last unless defined $b_i;
4751 $range_b = $b_ranges[$b_i];
4752 $b = $range_b->start;
4755 } # End of looping through ranges.
4757 # Intersection fully computed, or now know that there is no overlap
4758 return $check_if_overlapping ? 0 : $new;
4762 # Returns boolean giving whether the two arguments overlap somewhere
4766 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4768 return $self->_intersect($other, 1);
4772 # Add a range to the list.
4777 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4779 return $self->_add_delete('+', $start, $end, "");
4782 sub matches_identically_to {
4783 # Return a boolean as to whether or not two Range_Lists match identical
4784 # sets of code points.
4788 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4790 # These are ordered in increasing real time to figure out (at least
4791 # until a patch changes that and doesn't change this)
4792 return 0 if $self->max != $other->max;
4793 return 0 if $self->min != $other->min;
4794 return 0 if $self->range_count != $other->range_count;
4795 return 0 if $self->count != $other->count;
4797 # Here they could be identical because all the tests above passed.
4798 # The loop below is somewhat simpler since we know they have the same
4799 # number of elements. Compare range by range, until reach the end or
4800 # find something that differs.
4801 my @a_ranges = $self->ranges;
4802 my @b_ranges = $other->ranges;
4803 for my $i (0 .. @a_ranges - 1) {
4804 my $a = $a_ranges[$i];
4805 my $b = $b_ranges[$i];
4806 trace "self $a; other $b" if main::DEBUG && $to_trace;
4807 return 0 if ! defined $b
4808 || $a->start != $b->start
4809 || $a->end != $b->end;
4814 sub is_code_point_usable {
4815 # This used only for making the test script. See if the input
4816 # proposed trial code point is one that Perl will handle. If second
4817 # parameter is 0, it won't select some code points for various
4818 # reasons, noted below.
4821 my $try_hard = shift;
4822 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4824 return 0 if $code < 0; # Never use a negative
4826 # shun null. I'm (khw) not sure why this was done, but NULL would be
4827 # the character very frequently used.
4828 return $try_hard if $code == 0x0000;
4830 # shun non-character code points.
4831 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4832 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4834 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
4835 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4840 sub get_valid_code_point {
4841 # Return a code point that's part of the range list. Returns nothing
4842 # if the table is empty or we can't find a suitable code point. This
4843 # used only for making the test script.
4846 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4848 my $addr = do { no overloading; pack 'J', $self; };
4850 # On first pass, don't choose less desirable code points; if no good
4851 # one is found, repeat, allowing a less desirable one to be selected.
4852 for my $try_hard (0, 1) {
4854 # Look through all the ranges for a usable code point.
4855 for my $set (reverse $self->ranges) {
4857 # Try the edge cases first, starting with the end point of the
4859 my $end = $set->end;
4860 return $end if is_code_point_usable($end, $try_hard);
4861 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
4863 # End point didn't, work. Start at the beginning and try
4864 # every one until find one that does work.
4865 for my $trial ($set->start .. $end - 1) {
4866 return $trial if is_code_point_usable($trial, $try_hard);
4870 return (); # If none found, give up.
4873 sub get_invalid_code_point {
4874 # Return a code point that's not part of the table. Returns nothing
4875 # if the table covers all code points or a suitable code point can't
4876 # be found. This used only for making the test script.
4879 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4881 # Just find a valid code point of the inverse, if any.
4882 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4884 } # end closure for Range_List
4887 use parent '-norequire', '_Range_List_Base';
4889 # A Range_Map is a range list in which the range values (called maps) are
4890 # significant, and hence shouldn't be manipulated by our other code, which
4891 # could be ambiguous or lose things. For example, in taking the union of two
4892 # lists, which share code points, but which have differing values, which one
4893 # has precedence in the union?
4894 # It turns out that these operations aren't really necessary for map tables,
4895 # and so this class was created to make sure they aren't accidentally
4901 # Add a range containing a mapping value to the list
4904 # Rest of parameters passed on
4906 return $self->_add_delete('+', @_);
4910 # Adds entry to a range list which can duplicate an existing entry
4913 my $code_point = shift;
4916 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4917 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4919 return $self->add_map($code_point, $code_point,
4920 $value, Replace => $replace);
4922 } # End of closure for package Range_Map
4924 package _Base_Table;
4926 # A table is the basic data structure that gets written out into a file for
4927 # use by the Perl core. This is the abstract base class implementing the
4928 # common elements from the derived ones. A list of the methods to be
4929 # furnished by an implementing class is just after the constructor.
4931 sub standardize { return main::standardize($_[0]); }
4932 sub trace { return main::trace(@_); }
4936 main::setup_package();
4939 # Object containing the ranges of the table.
4940 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4943 # The full table name.
4944 main::set_access('full_name', \%full_name, 'r');
4947 # The table name, almost always shorter
4948 main::set_access('name', \%name, 'r');
4951 # The shortest of all the aliases for this table, with underscores removed
4952 main::set_access('short_name', \%short_name);
4954 my %nominal_short_name_length;
4955 # The length of short_name before removing underscores
4956 main::set_access('nominal_short_name_length',
4957 \%nominal_short_name_length);
4960 # The complete name, including property.
4961 main::set_access('complete_name', \%complete_name, 'r');
4964 # Parent property this table is attached to.
4965 main::set_access('property', \%property, 'r');
4968 # Ordered list of alias objects of the table's name. The first ones in
4969 # the list are output first in comments
4970 main::set_access('aliases', \%aliases, 'readable_array');
4973 # A comment associated with the table for human readers of the files
4974 main::set_access('comment', \%comment, 's');
4977 # A comment giving a short description of the table's meaning for human
4978 # readers of the files.
4979 main::set_access('description', \%description, 'readable_array');
4982 # A comment giving a short note about the table for human readers of the
4984 main::set_access('note', \%note, 'readable_array');
4987 # Enum; there are a number of possibilities for what happens to this
4988 # table: it could be normal, or suppressed, or not for external use. See
4989 # values at definition for $SUPPRESSED.
4990 main::set_access('fate', \%fate, 'r');
4992 my %find_table_from_alias;
4993 # The parent property passes this pointer to a hash which this class adds
4994 # all its aliases to, so that the parent can quickly take an alias and
4996 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4999 # After this table is made equivalent to another one; we shouldn't go
5000 # changing the contents because that could mean it's no longer equivalent
5001 main::set_access('locked', \%locked, 'r');
5004 # This gives the final path to the file containing the table. Each
5005 # directory in the path is an element in the array
5006 main::set_access('file_path', \%file_path, 'readable_array');
5009 # What is the table's status, normal, $OBSOLETE, etc. Enum
5010 main::set_access('status', \%status, 'r');
5013 # A comment about its being obsolete, or whatever non normal status it has
5014 main::set_access('status_info', \%status_info, 'r');
5016 my %caseless_equivalent;
5017 # The table this is equivalent to under /i matching, if any.
5018 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5021 # Is the table to be output with each range only a single code point?
5022 # This is done to avoid breaking existing code that may have come to rely
5023 # on this behavior in previous versions of this program.)
5024 main::set_access('range_size_1', \%range_size_1, 'r', 's');
5027 # A boolean set iff this table is a Perl extension to the Unicode
5029 main::set_access('perl_extension', \%perl_extension, 'r');
5031 my %output_range_counts;
5032 # A boolean set iff this table is to have comments written in the
5033 # output file that contain the number of code points in the range.
5034 # The constructor can override the global flag of the same name.
5035 main::set_access('output_range_counts', \%output_range_counts, 'r');
5037 my %write_as_invlist;
5038 # A boolean set iff the output file for this table is to be in the form of
5039 # an inversion list/map.
5040 main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5043 # The format of the entries of the table. This is calculated from the
5044 # data in the table (or passed in the constructor). This is an enum e.g.,
5045 # $STRING_FORMAT. It is marked protected as it should not be generally
5046 # used to override calculations.
5047 main::set_access('format', \%format, 'r', 'p_s');
5050 # All arguments are key => value pairs, which you can see below, most
5051 # of which match fields documented above. Otherwise: Re_Pod_Entry,
5052 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5053 # documented in the Alias package
5055 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5059 my $self = bless \do { my $anonymous_scalar }, $class;
5060 my $addr = do { no overloading; pack 'J', $self; };
5064 $name{$addr} = delete $args{'Name'};
5065 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5066 $full_name{$addr} = delete $args{'Full_Name'};
5067 my $complete_name = $complete_name{$addr}
5068 = delete $args{'Complete_Name'};
5069 $format{$addr} = delete $args{'Format'};
5070 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5071 $property{$addr} = delete $args{'_Property'};
5072 $range_list{$addr} = delete $args{'_Range_List'};
5073 $status{$addr} = delete $args{'Status'} || $NORMAL;
5074 $status_info{$addr} = delete $args{'_Status_Info'} || "";
5075 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5076 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5077 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5078 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5079 my $ucd = delete $args{'UCD'};
5081 my $description = delete $args{'Description'};
5082 my $ok_as_filename = delete $args{'OK_as_Filename'};
5083 my $loose_match = delete $args{'Fuzzy'};
5084 my $note = delete $args{'Note'};
5085 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5086 my $perl_extension = delete $args{'Perl_Extension'};
5088 # Shouldn't have any left over
5089 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5091 # Can't use || above because conceivably the name could be 0, and
5092 # can't use // operator in case this program gets used in Perl 5.8
5093 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5094 $output_range_counts{$addr} = $output_range_counts if
5095 ! defined $output_range_counts{$addr};
5097 $aliases{$addr} = [ ];
5098 $comment{$addr} = [ ];
5099 $description{$addr} = [ ];
5101 $file_path{$addr} = [ ];
5102 $locked{$addr} = "";
5104 push @{$description{$addr}}, $description if $description;
5105 push @{$note{$addr}}, $note if $note;
5107 if ($fate{$addr} == $PLACEHOLDER) {
5109 # A placeholder table doesn't get documented, is a perl extension,
5110 # and quite likely will be empty
5111 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5112 $perl_extension = 1 if ! defined $perl_extension;
5113 $ucd = 0 if ! defined $ucd;
5114 push @tables_that_may_be_empty, $complete_name{$addr};
5115 $self->add_comment(<<END);
5116 This is a placeholder because it is not in Version $string_version of Unicode,
5117 but is needed by the Perl core to work gracefully. Because it is not in this
5118 version of Unicode, it will not be listed in $pod_file.pod
5121 elsif (exists $why_suppressed{$complete_name}
5122 # Don't suppress if overridden
5123 && ! grep { $_ eq $complete_name{$addr} }
5124 @output_mapped_properties)
5126 $fate{$addr} = $SUPPRESSED;
5128 elsif ($fate{$addr} == $SUPPRESSED
5129 && ! exists $why_suppressed{$property{$addr}->complete_name})
5131 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
5132 # perhaps Fate => [ $SUPPRESSED, "reason" ]
5135 # If hasn't set its status already, see if it is on one of the
5136 # lists of properties or tables that have particular statuses; if
5137 # not, is normal. The lists are prioritized so the most serious
5138 # ones are checked first
5139 if (! $status{$addr}) {
5140 if (exists $why_deprecated{$complete_name}) {
5141 $status{$addr} = $DEPRECATED;
5143 elsif (exists $why_stabilized{$complete_name}) {
5144 $status{$addr} = $STABILIZED;
5146 elsif (exists $why_obsolete{$complete_name}) {
5147 $status{$addr} = $OBSOLETE;
5150 # Existence above doesn't necessarily mean there is a message
5151 # associated with it. Use the most serious message.
5152 if ($status{$addr}) {
5153 if ($why_deprecated{$complete_name}) {
5155 = $why_deprecated{$complete_name};
5157 elsif ($why_stabilized{$complete_name}) {
5159 = $why_stabilized{$complete_name};
5161 elsif ($why_obsolete{$complete_name}) {
5163 = $why_obsolete{$complete_name};
5168 $perl_extension{$addr} = $perl_extension || 0;
5170 # Don't list a property by default that is internal only
5171 if ($fate{$addr} > $MAP_PROXIED) {
5172 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5173 $ucd = 0 if ! defined $ucd;
5176 $ucd = 1 if ! defined $ucd;
5179 # By convention what typically gets printed only or first is what's
5180 # first in the list, so put the full name there for good output
5181 # clarity. Other routines rely on the full name being first on the
5183 $self->add_alias($full_name{$addr},
5184 OK_as_Filename => $ok_as_filename,
5185 Fuzzy => $loose_match,
5186 Re_Pod_Entry => $make_re_pod_entry,
5187 Status => $status{$addr},
5191 # Then comes the other name, if meaningfully different.
5192 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5193 $self->add_alias($name{$addr},
5194 OK_as_Filename => $ok_as_filename,
5195 Fuzzy => $loose_match,
5196 Re_Pod_Entry => $make_re_pod_entry,
5197 Status => $status{$addr},
5205 # Here are the methods that are required to be defined by any derived
5208 handle_special_range
5212 # write() knows how to write out normal ranges, but it calls
5213 # handle_special_range() when it encounters a non-normal one.
5214 # append_to_body() is called by it after it has handled all
5215 # ranges to add anything after the main portion of the table.
5216 # And finally, pre_body() is called after all this to build up
5217 # anything that should appear before the main portion of the
5218 # table. Doing it this way allows things in the middle to
5219 # affect what should appear before the main portion of the
5224 Carp::my_carp_bug( __LINE__
5225 . ": Must create method '$sub()' for "
5233 "." => \&main::_operator_dot,
5234 ".=" => \&main::_operator_dot_equal,
5235 '!=' => \&main::_operator_not_equal,
5236 '==' => \&main::_operator_equal,
5240 # Returns the array of ranges associated with this table.
5243 return $range_list{pack 'J', shift}->ranges;
5247 # Add a synonym for this table.
5249 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5252 my $name = shift; # The name to add.
5253 my $pointer = shift; # What the alias hash should point to. For
5254 # map tables, this is the parent property;
5255 # for match tables, it is the table itself.
5258 my $loose_match = delete $args{'Fuzzy'};
5260 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5261 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5263 my $ok_as_filename = delete $args{'OK_as_Filename'};
5264 $ok_as_filename = 1 unless defined $ok_as_filename;
5266 my $status = delete $args{'Status'};
5267 $status = $NORMAL unless defined $status;
5269 # An internal name does not get documented, unless overridden by the
5271 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5273 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5275 # Capitalize the first letter of the alias unless it is one of the CJK
5276 # ones which specifically begins with a lower 'k'. Do this because
5277 # Unicode has varied whether they capitalize first letters or not, and
5278 # have later changed their minds and capitalized them, but not the
5279 # other way around. So do it always and avoid changes from release to
5281 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5283 my $addr = do { no overloading; pack 'J', $self; };
5285 # Figure out if should be loosely matched if not already specified.
5286 if (! defined $loose_match) {
5288 # Is a loose_match if isn't null, and doesn't begin with an
5289 # underscore and isn't just a number
5291 && substr($name, 0, 1) ne '_'
5292 && $name !~ qr{^[0-9_.+-/]+$})
5301 # If this alias has already been defined, do nothing.
5302 return if defined $find_table_from_alias{$addr}->{$name};
5304 # That includes if it is standardly equivalent to an existing alias,
5305 # in which case, add this name to the list, so won't have to search
5307 my $standard_name = main::standardize($name);
5308 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5309 $find_table_from_alias{$addr}->{$name}
5310 = $find_table_from_alias{$addr}->{$standard_name};
5314 # Set the index hash for this alias for future quick reference.
5315 $find_table_from_alias{$addr}->{$name} = $pointer;
5316 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5317 local $to_trace = 0 if main::DEBUG;
5318 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5319 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5322 # Put the new alias at the end of the list of aliases unless the final
5323 # element begins with an underscore (meaning it is for internal perl
5324 # use) or is all numeric, in which case, put the new one before that
5325 # one. This floats any all-numeric or underscore-beginning aliases to
5326 # the end. This is done so that they are listed last in output lists,
5327 # to encourage the user to use a better name (either more descriptive
5328 # or not an internal-only one) instead. This ordering is relied on
5329 # implicitly elsewhere in this program, like in short_name()
5330 my $list = $aliases{$addr};
5331 my $insert_position = (@$list == 0
5332 || (substr($list->[-1]->name, 0, 1) ne '_'
5333 && $list->[-1]->name =~ /\D/))
5339 Alias->new($name, $loose_match, $make_re_pod_entry,
5340 $ok_as_filename, $status, $ucd);
5342 # This name may be shorter than any existing ones, so clear the cache
5343 # of the shortest, so will have to be recalculated.
5345 undef $short_name{pack 'J', $self};
5350 # Returns a name suitable for use as the base part of a file name.
5351 # That is, shorter wins. It can return undef if there is no suitable
5352 # name. The name has all non-essential underscores removed.
5354 # The optional second parameter is a reference to a scalar in which
5355 # this routine will store the length the returned name had before the
5356 # underscores were removed, or undef if the return is undef.
5358 # The shortest name can change if new aliases are added. So using
5359 # this should be deferred until after all these are added. The code
5360 # that does that should clear this one's cache.
5361 # Any name with alphabetics is preferred over an all numeric one, even
5365 my $nominal_length_ptr = shift;
5366 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5368 my $addr = do { no overloading; pack 'J', $self; };
5370 # For efficiency, don't recalculate, but this means that adding new
5371 # aliases could change what the shortest is, so the code that does
5372 # that needs to undef this.
5373 if (defined $short_name{$addr}) {
5374 if ($nominal_length_ptr) {
5375 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5377 return $short_name{$addr};
5380 # Look at each alias
5381 foreach my $alias ($self->aliases()) {
5383 # Don't use an alias that isn't ok to use for an external name.
5384 next if ! $alias->ok_as_filename;
5386 my $name = main::Standardize($alias->name);
5387 trace $self, $name if main::DEBUG && $to_trace;
5389 # Take the first one, or a shorter one that isn't numeric. This
5390 # relies on numeric aliases always being last in the array
5391 # returned by aliases(). Any alpha one will have precedence.
5392 if (! defined $short_name{$addr}
5394 && length($name) < length($short_name{$addr})))
5396 # Remove interior underscores.
5397 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5399 $nominal_short_name_length{$addr} = length $name;
5403 # If the short name isn't a nice one, perhaps an equivalent table has
5405 if (! defined $short_name{$addr}
5406 || $short_name{$addr} eq ""
5407 || $short_name{$addr} eq "_")
5410 foreach my $follower ($self->children) { # All equivalents
5411 my $follower_name = $follower->short_name;
5412 next unless defined $follower_name;
5414 # Anything (except undefined) is better than underscore or
5416 if (! defined $return || $return eq "_") {
5417 $return = $follower_name;
5421 # If the new follower name isn't "_" and is shorter than the
5422 # current best one, prefer the new one.
5423 next if $follower_name eq "_";
5424 next if length $follower_name > length $return;
5425 $return = $follower_name;
5427 $short_name{$addr} = $return if defined $return;
5430 # If no suitable external name return undef
5431 if (! defined $short_name{$addr}) {
5432 $$nominal_length_ptr = undef if $nominal_length_ptr;
5436 # Don't allow a null short name.
5437 if ($short_name{$addr} eq "") {
5438 $short_name{$addr} = '_';
5439 $nominal_short_name_length{$addr} = 1;
5442 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5444 if ($nominal_length_ptr) {
5445 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5447 return $short_name{$addr};
5451 # Returns the external name that this table should be known by. This
5452 # is usually the short_name, but not if the short_name is undefined,
5453 # in which case the external_name is arbitrarily set to the
5457 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5459 my $short = $self->short_name;
5460 return $short if defined $short;
5465 sub add_description { # Adds the parameter as a short description.
5468 my $description = shift;
5470 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5473 push @{$description{pack 'J', $self}}, $description;
5478 sub add_note { # Adds the parameter as a short note.
5483 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5486 push @{$note{pack 'J', $self}}, $note;
5491 sub add_comment { # Adds the parameter as a comment.
5493 return unless $debugging_build;
5496 my $comment = shift;
5497 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5502 push @{$comment{pack 'J', $self}}, $comment;
5508 # Return the current comment for this table. If called in list
5509 # context, returns the array of comments. In scalar, returns a string
5510 # of each element joined together with a period ending each.
5513 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5515 my $addr = do { no overloading; pack 'J', $self; };
5516 my @list = @{$comment{$addr}};
5517 return @list if wantarray;
5519 foreach my $sentence (@list) {
5520 $return .= '. ' if $return;
5521 $return .= $sentence;
5524 $return .= '.' if $return;
5529 # Initialize the table with the argument which is any valid
5530 # initialization for range lists.
5533 my $addr = do { no overloading; pack 'J', $self; };
5534 my $initialization = shift;
5535 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5537 # Replace the current range list with a new one of the same exact
5539 my $class = ref $range_list{$addr};
5540 $range_list{$addr} = $class->new(Owner => $self,
5541 Initialize => $initialization);
5547 # The header that is output for the table in the file it is written
5551 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5554 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5559 sub merge_single_annotation_line ($$$) {
5560 my ($output, $annotation, $annotation_column) = @_;
5562 # This appends an annotation comment, $annotation, to $output,
5563 # starting in or after column $annotation_column, removing any
5564 # pre-existing comment from $output.
5566 $annotation =~ s/^ \s* \# \ //x;
5567 $output =~ s/ \s* ( \# \N* )? \n //x;
5568 $output = Text::Tabs::expand($output);
5570 my $spaces = $annotation_column - length $output;
5571 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment
5573 $output = sprintf "%s%*s# %s",
5578 return Text::Tabs::unexpand $output;
5582 # Write a representation of the table to its file. It calls several
5583 # functions furnished by sub-classes of this abstract base class to
5584 # handle non-normal ranges, to add stuff before the table, and at its
5585 # end. If the table is to be written so that adjustments are
5586 # required, this does that conversion.
5589 my $use_adjustments = shift; # ? output in adjusted format or not
5590 my $suppress_value = shift; # Optional, if the value associated with
5591 # a range equals this one, don't write
5593 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5595 my $addr = do { no overloading; pack 'J', $self; };
5596 my $write_as_invlist = $write_as_invlist{$addr};
5598 # Start with the header
5599 my @HEADER = $self->header;
5602 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5605 # Things discovered processing the main body of the document may
5606 # affect what gets output before it, therefore pre_body() isn't called
5607 # until after all other processing of the table is done.
5609 # The main body looks like a 'here' document. If there are comments,
5610 # get rid of them when processing it.
5612 if ($annotate || $output_range_counts) {
5613 # Use the line below in Perls that don't have /r
5614 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5615 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5617 push @OUT, "return <<'END';\n";
5620 if ($range_list{$addr}->is_empty) {
5622 # This is a kludge for empty tables to silence a warning in
5623 # utf8.c, which can't really deal with empty tables, but it can
5624 # deal with a table that matches nothing, as the inverse of 'All'
5626 push @OUT, "!utf8::All\n";
5628 elsif ($self->name eq 'N'
5630 # To save disk space and table cache space, avoid putting out
5631 # binary N tables, but instead create a file which just inverts
5632 # the Y table. Since the file will still exist and occupy a
5633 # certain number of blocks, might as well output the whole
5634 # thing if it all will fit in one block. The number of
5635 # ranges below is an approximate number for that.
5636 && ($self->property->type == $BINARY
5637 || $self->property->type == $FORCED_BINARY)
5638 # && $self->property->tables == 2 Can't do this because the
5639 # non-binary properties, like NFDQC aren't specifiable
5641 && $range_list{$addr}->ranges > 15
5642 && ! $annotate) # Under --annotate, want to see everything
5644 push @OUT, "!utf8::" . $self->property->name . "\n";
5647 my $range_size_1 = $range_size_1{$addr};
5649 # To make it more readable, use a minimum indentation
5652 # These are used only in $annotate option
5653 my $format; # e.g. $HEX_ADJUST_FORMAT
5654 my $include_name; # ? Include the character's name in the
5656 my $include_cp; # ? Include its code point
5659 $comment_indent = ($self->isa('Map_Table'))
5661 : ($write_as_invlist)
5666 $format = $self->format;
5668 # The name of the character is output only for tables that
5669 # don't already include the name in the output.
5670 my $property = $self->property;
5672 ! ($property == $perl_charname
5673 || $property == main::property_ref('Unicode_1_Name')
5674 || $property == main::property_ref('Name')
5675 || $property == main::property_ref('Name_Alias')
5678 # Don't include the code point in the annotation where all
5679 # lines are a single code point, so it can be easily found in
5681 $include_cp = ! $range_size_1;
5683 if (! $self->isa('Map_Table')) {
5684 $comment_indent = ($write_as_invlist) ? 8 : 16;
5687 $comment_indent = 16;
5689 # There are just a few short ranges in this table, so no
5690 # need to include the code point in the annotation.
5691 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5693 # We're trying to get this to look good, as the whole
5694 # point is to make human-readable tables. It is easier to
5695 # read if almost all the annotation comments begin in the
5696 # same column. Map tables have varying width maps, so can
5697 # create a jagged comment appearance. This code does a
5698 # preliminary pass through these tables looking for the
5699 # maximum width map in each, and causing the comments to
5700 # begin just to the right of that. However, if the
5701 # comments begin too far to the right of most lines, it's
5702 # hard to line them up horizontally with their real data.
5703 # Therefore we ignore the longest outliers
5704 my $ignore_longest_X_percent = 2; # Discard longest X%
5706 # Each key in this hash is a width of at least one of the
5707 # maps in the table. Its value is how many lines have
5711 # We won't space things further left than one tab stop
5712 # after the rest of the line; initializing it to that
5713 # number saves some work.
5714 my $max_map_width = 8;
5716 # Fill in the %widths hash
5718 for my $set ($range_list{$addr}->ranges) {
5719 my $value = $set->value;
5721 # These range types don't appear in the main table
5722 next if $set->type == 0
5723 && defined $suppress_value
5724 && $value eq $suppress_value;
5725 next if $set->type == $MULTI_CP
5726 || $set->type == $NULL;
5728 # Include 2 spaces before the beginning of the
5730 my $this_width = length($value) + 2;
5732 # Ranges of the remaining non-zero types usually
5733 # occupy just one line (maybe occasionally two, but
5734 # this doesn't have to be dead accurate). This is
5735 # because these ranges are like "unassigned code
5737 my $count = ($set->type != 0)
5739 : $set->end - $set->start + 1;
5740 $widths{$this_width} += $count;
5742 $max_map_width = $this_width
5743 if $max_map_width < $this_width;
5746 # If the widest map gives us less than two tab stops
5747 # worth, just take it as-is.
5748 if ($max_map_width > 16) {
5750 # Otherwise go through %widths until we have included
5751 # the desired percentage of lines in the whole table.
5752 my $running_total = 0;
5753 foreach my $width (sort { $a <=> $b } keys %widths)
5755 $running_total += $widths{$width};
5757 if ($running_total * 100 / $total
5758 >= 100 - $ignore_longest_X_percent)
5760 $max_map_width = $width;
5765 $comment_indent += $max_map_width;
5769 # Values for previous time through the loop. Initialize to
5770 # something that won't be adjacent to the first iteration;
5771 # only $previous_end matters for that.
5773 my $previous_end = -2;
5776 # Values for next time through the portion of the loop that splits
5777 # the range. 0 in $next_start means there is no remaining portion
5783 my $invlist_count = 0;
5785 my $output_value_in_hex = $self->isa('Map_Table')
5786 && ($self->format eq $HEX_ADJUST_FORMAT
5787 || $self->to_output_map == $EXTERNAL_MAP);
5788 # Use leading zeroes just for files whose format should not be
5789 # changed from what it has been. Otherwise, they just take up
5790 # space and time to process.
5791 my $hex_format = ($self->isa('Map_Table')
5792 && $self->to_output_map == $EXTERNAL_MAP)
5796 # The values for some of these tables are stored in mktables as
5797 # hex strings. Normally, these are just output as strings without
5798 # change, but when we are doing adjustments, we have to operate on
5799 # these numerically, so we convert those to decimal to do that,
5800 # and back to hex for output
5801 my $convert_map_to_from_hex = 0;
5802 my $output_map_in_hex = 0;
5803 if ($self->isa('Map_Table')) {
5804 $convert_map_to_from_hex
5805 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
5806 || ($annotate && $self->format eq $HEX_FORMAT);
5807 $output_map_in_hex = $convert_map_to_from_hex
5808 || $self->format eq $HEX_FORMAT;
5811 # To store any annotations about the characters.
5814 # Output each range as part of the here document.
5816 for my $set ($range_list{$addr}->ranges) {
5817 if ($set->type != 0) {
5818 $self->handle_special_range($set);
5821 my $start = $set->start;
5822 my $end = $set->end;
5823 my $value = $set->value;
5825 # Don't output ranges whose value is the one to suppress
5826 next RANGE if defined $suppress_value
5827 && $value eq $suppress_value;
5829 $value = CORE::hex $value if $convert_map_to_from_hex;
5832 { # This bare block encloses the scope where we may need to
5833 # 'redo' to. Consider a table that is to be written out
5834 # using single item ranges. This is given in the
5835 # $range_size_1 boolean. To accomplish this, we split the
5836 # range each time through the loop into two portions, the
5837 # first item, and the rest. We handle that first item
5838 # this time in the loop, and 'redo' to repeat the process
5839 # for the rest of the range.
5841 # We may also have to do it, with other special handling,
5842 # if the table has adjustments. Consider the table that
5843 # contains the lowercasing maps. mktables stores the
5844 # ASCII range ones as 26 ranges:
5845 # ord('A') => ord('a'), .. ord('Z') => ord('z')
5846 # For compactness, the table that gets written has this as
5848 # ( ord('A') .. ord('Z') ) => ord('a')
5849 # and the software that reads the tables is smart enough
5850 # to "connect the dots". This change is accomplished in
5851 # this loop by looking to see if the current iteration
5852 # fits the paradigm of the previous iteration, and if so,
5853 # we merge them by replacing the final output item with
5854 # the merged data. Repeated 25 times, this gets A-Z. But
5855 # we also have to make sure we don't screw up cases where
5856 # we have internally stored
5857 # ( 0x1C4 .. 0x1C6 ) => 0x1C5
5858 # This single internal range has to be output as 3 ranges,
5859 # which is done by splitting, like we do for $range_size_1
5860 # tables. (There are very few of such ranges that need to
5861 # be split, so the gain of doing the combining of other
5862 # ranges far outweighs the splitting of these.) The
5863 # values to use for the redo at the end of this block are
5864 # set up just below in the scalars whose names begin with
5867 if (($use_adjustments || $range_size_1) && $end != $start)
5869 $next_start = $start + 1;
5871 $next_value = $value;
5875 if ($use_adjustments && ! $range_size_1) {
5877 # If this range is adjacent to the previous one, and
5878 # the values in each are integers that are also
5879 # adjacent (differ by 1), then this range really
5880 # extends the previous one that is already in element
5881 # $OUT[-1]. So we pop that element, and pretend that
5882 # the range starts with whatever it started with.
5883 # $offset is incremented by 1 each time so that it
5884 # gives the current offset from the first element in
5885 # the accumulating range, and we keep in $value the
5886 # value of that first element.
5887 if ($start == $previous_end + 1
5888 && $value =~ /^ -? \d+ $/xa
5889 && $previous_value =~ /^ -? \d+ $/xa
5890 && ($value == ($previous_value + ++$offset)))
5893 $start = $previous_start;
5894 $value = $previous_value;
5898 if (@annotation == 1) {
5899 $OUT[-1] = merge_single_annotation_line(
5900 $OUT[-1], $annotation[0], $comment_indent);
5903 push @OUT, @annotation;
5908 # Save the current values for the next time through
5910 $previous_start = $start;
5911 $previous_end = $end;
5912 $previous_value = $value;
5915 if ($write_as_invlist) {
5917 # Inversion list format has a single number per line,
5918 # the starting code point of a range that matches the
5920 push @OUT, $start, "\n";
5923 # Add a comment with the size of the range, if
5925 if ($output_range_counts{$addr}) {
5926 $OUT[-1] = merge_single_annotation_line(
5929 . main::clarify_code_point_count($end - $start + 1)
5934 elsif ($start != $end) { # If there is a range
5935 if ($end == $MAX_WORKING_CODEPOINT) {
5936 push @OUT, sprintf "$hex_format\t$hex_format",
5938 $MAX_PLATFORM_CODEPOINT;
5941 push @OUT, sprintf "$hex_format\t$hex_format",
5944 if (length $value) {
5945 if ($convert_map_to_from_hex) {
5946 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
5949 $OUT[-1] .= "\t$value\n";
5953 # Add a comment with the size of the range, if
5955 if ($output_range_counts{$addr}) {
5956 $OUT[-1] = merge_single_annotation_line(
5959 . main::clarify_code_point_count($end - $start + 1)
5964 else { # Here to output a single code point per line.
5966 # Use any passed in subroutine to output.
5967 if (ref $range_size_1 eq 'CODE') {
5968 for my $i ($start .. $end) {
5969 push @OUT, &{$range_size_1}($i, $value);
5974 # Here, caller is ok with default output.
5975 for (my $i = $start; $i <= $end; $i++) {
5976 if ($convert_map_to_from_hex) {
5978 sprintf "$hex_format\t\t$hex_format\n",
5982 push @OUT, sprintf $hex_format, $i;
5983 $OUT[-1] .= "\t\t$value" if $value ne "";
5991 for (my $i = $start; $i <= $end; $i++) {
5992 my $annotation = "";
5994 # Get character information if don't have it already
5995 main::populate_char_info($i)
5996 if ! defined $viacode[$i];
5997 my $type = $annotate_char_type[$i];
5999 # Figure out if should output the next code points
6000 # as part of a range or not. If this is not in an
6001 # annotation range, then won't output as a range,
6002 # so returns $i. Otherwise use the end of the
6003 # annotation range, but no further than the
6004 # maximum possible end point of the loop.
6009 $annotate_ranges->value_of($i) || $i,
6012 # Use a range if it is a range, and either is one
6013 # of the special annotation ranges, or the range
6014 # is at most 3 long. This last case causes the
6015 # algorithmically named code points to be output
6016 # individually in spans of at most 3, as they are
6017 # the ones whose $type is > 0.
6018 if ($range_end != $i
6019 && ( $type < 0 || $range_end - $i > 2))
6021 # Here is to output a range. We don't allow a
6022 # caller-specified output format--just use the
6024 my $range_name = $viacode[$i];
6026 # For the code points which end in their hex
6027 # value, we eliminate that from the output
6028 # annotation, and capitalize only the first
6029 # letter of each word.
6030 if ($type == $CP_IN_NAME) {
6031 my $hex = sprintf $hex_format, $i;
6032 $range_name =~ s/-$hex$//;
6033 my @words = split " ", $range_name;
6034 for my $word (@words) {
6036 ucfirst(lc($word)) if $word ne 'CJK';
6038 $range_name = join " ", @words;
6040 elsif ($type == $HANGUL_SYLLABLE) {
6041 $range_name = "Hangul Syllable";
6044 if ($i != $start || $range_end < $end) {
6045 if ($range_end < $MAX_WORKING_CODEPOINT)
6047 $annotation = sprintf "%04X..%04X",
6051 $annotation = sprintf "%04X..INFINITY",
6055 else { # Indent if not displaying code points
6056 $annotation = " " x 4;
6058 $annotation .= " $range_name" if $range_name;
6060 # Include the number of code points in the
6063 main::clarify_code_point_count($range_end - $i + 1);
6064 $annotation .= " [$count]\n";
6066 # Skip to the end of the range
6069 else { # Not in a range.
6072 # When outputting the names of each character,
6073 # use the character itself if printable
6074 $comment .= "'" . main::display_chr($i) . "' "
6077 my $output_value = $value;
6079 # Determine the annotation
6080 if ($format eq $DECOMP_STRING_FORMAT) {
6082 # This is very specialized, with the type
6083 # of decomposition beginning the line
6084 # enclosed in <...>, and the code points
6085 # that the code point decomposes to
6086 # separated by blanks. Create two
6087 # strings, one of the printable
6088 # characters, and one of their official
6090 (my $map = $output_value)
6091 =~ s/ \ * < .*? > \ +//x;
6095 foreach my $to (split " ", $map) {
6096 $to = CORE::hex $to;
6097 $to_name .= " + " if $to_name;
6098 $to_chr .= main::display_chr($to);
6099 main::populate_char_info($to)
6100 if ! defined $viacode[$to];
6101 $to_name .= $viacode[$to];
6105 "=> '$to_chr'; $viacode[$i] => $to_name";
6108 $output_value += $i - $start
6110 # Don't try to adjust a
6112 && $output_value !~ /[-\D]/;
6114 if ($output_map_in_hex) {
6115 main::populate_char_info($output_value)
6116 if ! defined $viacode[$output_value];
6118 . main::display_chr($output_value)
6119 . "'; " if $printable[$output_value];
6121 if ($include_name && $viacode[$i]) {
6122 $comment .= " " if $comment;
6123 $comment .= $viacode[$i];
6125 if ($output_map_in_hex) {
6127 " => $viacode[$output_value]"
6128 if $viacode[$output_value];
6129 $output_value = sprintf($hex_format,
6135 $annotation = sprintf "%04X", $i;
6136 if ($use_adjustments) {
6137 $annotation .= " => $output_value";
6141 if ($comment ne "") {
6142 $annotation .= " " if $annotation ne "";
6143 $annotation .= $comment;
6145 $annotation .= "\n" if $annotation ne "";
6148 if ($annotation ne "") {
6149 push @annotation, (" " x $comment_indent)
6154 # If not adjusting, we don't have to go through the
6155 # loop again to know that the annotation comes next
6157 if (! $use_adjustments) {
6158 if (@annotation == 1) {
6159 $OUT[-1] = merge_single_annotation_line(
6160 $OUT[-1], $annotation[0], $comment_indent);
6163 push @OUT, map { Text::Tabs::unexpand $_ }
6170 # Add the beginning of the range that doesn't match the
6171 # property, except if the just added match range extends
6172 # to infinity. We do this after any annotations for the
6174 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6175 push @OUT, $end + 1, "\n";
6179 # If we split the range, set up so the next time through
6180 # we get the remainder, and redo.
6182 $start = $next_start;
6184 $value = $next_value;
6189 } # End of loop through all the table's ranges
6191 push @OUT, @annotation; # Add orphaned annotation, if any
6193 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6196 # Add anything that goes after the main body, but within the here
6198 my $append_to_body = $self->append_to_body;
6199 push @OUT, $append_to_body if $append_to_body;
6201 # And finish the here document.
6204 # Done with the main portion of the body. Can now figure out what
6205 # should appear before it in the file.
6206 my $pre_body = $self->pre_body;
6207 push @HEADER, $pre_body, "\n" if $pre_body;
6209 # All these files should have a .pl suffix added to them.
6210 my @file_with_pl = @{$file_path{$addr}};
6211 $file_with_pl[-1] .= '.pl';
6213 main::write(\@file_with_pl,
6214 $annotate, # utf8 iff annotating
6220 sub set_status { # Set the table's status
6222 my $status = shift; # The status enum value
6223 my $info = shift; # Any message associated with it.
6224 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6226 my $addr = do { no overloading; pack 'J', $self; };
6228 $status{$addr} = $status;
6229 $status_info{$addr} = $info;
6233 sub set_fate { # Set the fate of a table
6237 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6239 my $addr = do { no overloading; pack 'J', $self; };
6241 return if $fate{$addr} == $fate; # If no-op
6243 # Can only change the ordinary fate, except if going to $MAP_PROXIED
6244 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6246 $fate{$addr} = $fate;
6248 # Don't document anything to do with a non-normal fated table
6249 if ($fate != $ORDINARY) {
6250 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6251 foreach my $alias ($self->aliases) {
6252 $alias->set_ucd($put_in_pod);
6254 # MAP_PROXIED doesn't affect the match tables
6255 next if $fate == $MAP_PROXIED;
6256 $alias->set_make_re_pod_entry($put_in_pod);
6260 # Save the reason for suppression for output
6261 if ($fate == $SUPPRESSED && defined $reason) {
6262 $why_suppressed{$complete_name{$addr}} = $reason;
6269 # Don't allow changes to the table from now on. This stores a stack
6270 # trace of where it was called, so that later attempts to modify it
6271 # can immediately show where it got locked.
6274 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6276 my $addr = do { no overloading; pack 'J', $self; };
6278 $locked{$addr} = "";
6280 my $line = (caller(0))[2];
6283 # Accumulate the stack trace
6285 my ($pkg, $file, $caller_line, $caller) = caller $i++;
6287 last unless defined $caller;
6289 $locked{$addr} .= " called from $caller() at line $line\n";
6290 $line = $caller_line;
6292 $locked{$addr} .= " called from main at line $line\n";
6297 sub carp_if_locked {
6298 # Return whether a table is locked or not, and, by the way, complain
6302 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6304 my $addr = do { no overloading; pack 'J', $self; };
6306 return 0 if ! $locked{$addr};
6307 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6311 sub set_file_path { # Set the final directory path for this table
6313 # Rest of parameters passed on
6316 @{$file_path{pack 'J', $self}} = @_;
6320 # Accessors for the range list stored in this table. First for
6329 matches_identically_to
6342 return $self->_range_list->$sub(@_);
6346 # Then for ones that should fail if locked
6356 return if $self->carp_if_locked;
6358 return $self->_range_list->$sub(@_);
6365 use parent '-norequire', '_Base_Table';
6367 # A Map Table is a table that contains the mappings from code points to
6368 # values. There are two weird cases:
6369 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6370 # are written in the table's file at the end of the table nonetheless. It
6371 # requires specially constructed code to handle these; utf8.c can not read
6372 # these in, so they should not go in $map_directory. As of this writing,
6373 # the only case that these happen is for named sequences used in
6374 # charnames.pm. But this code doesn't enforce any syntax on these, so
6375 # something else could come along that uses it.
6376 # 2) Specials are anything that doesn't fit syntactically into the body of the
6377 # table. The ranges for these have a map type of non-zero. The code below
6378 # knows about and handles each possible type. In most cases, these are
6379 # written as part of the header.
6381 # A map table deliberately can't be manipulated at will unlike match tables.
6382 # This is because of the ambiguities having to do with what to do with
6383 # overlapping code points. And there just isn't a need for those things;
6384 # what one wants to do is just query, add, replace, or delete mappings, plus
6385 # write the final result.
6386 # However, there is a method to get the list of possible ranges that aren't in
6387 # this table to use for defaulting missing code point mappings. And,
6388 # map_add_or_replace_non_nulls() does allow one to add another table to this
6389 # one, but it is clearly very specialized, and defined that the other's
6390 # non-null values replace this one's if there is any overlap.
6392 sub trace { return main::trace(@_); }
6396 main::setup_package();
6399 # Many input files omit some entries; this gives what the mapping for the
6400 # missing entries should be
6401 main::set_access('default_map', \%default_map, 'r');
6403 my %anomalous_entries;
6404 # Things that go in the body of the table which don't fit the normal
6405 # scheme of things, like having a range. Not much can be done with these
6406 # once there except to output them. This was created to handle named
6408 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6409 main::set_access('anomalous_entries', # Append singular, read plural
6410 \%anomalous_entries,
6413 my %replacement_property;
6414 # Certain files are unused by Perl itself, and are kept only for backwards
6415 # compatibility for programs that used them before Unicode::UCD existed.
6416 # These are termed legacy properties. At some point they may be removed,
6417 # but for now mark them as legacy. If non empty, this is the name of the
6418 # property to use instead (i.e., the modern equivalent).
6419 main::set_access('replacement_property', \%replacement_property, 'r');
6422 # Enum as to whether or not to write out this map table, and how:
6424 # $EXTERNAL_MAP means its existence is noted in the documentation, and
6425 # it should not be removed nor its format changed. This
6426 # is done for those files that have traditionally been
6427 # output. Maps of legacy-only properties default to
6429 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
6431 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6432 # outputting the actual mappings as-is, we adjust things
6433 # to create a much more compact table. Only those few
6434 # tables where the mapping is convertible at least to an
6435 # integer and compacting makes a big difference should
6436 # have this. Hence, the default is to not do this
6437 # unless the table's default mapping is to $CODE_POINT,
6438 # and the range size is not 1.
6439 main::set_access('to_output_map', \%to_output_map, 's');
6447 # Optional initialization data for the table.
6448 my $initialize = delete $args{'Initialize'};
6450 my $default_map = delete $args{'Default_Map'};
6451 my $property = delete $args{'_Property'};
6452 my $full_name = delete $args{'Full_Name'};
6453 my $replacement_property = delete $args{'Replacement_Property'} // "";
6454 my $to_output_map = delete $args{'To_Output_Map'};
6456 # Rest of parameters passed on; legacy properties have several common
6458 if ($replacement_property) {
6459 $args{"Fate"} = $LEGACY_ONLY;
6460 $args{"Range_Size_1"} = 1;
6461 $args{"Perl_Extension"} = 1;
6465 my $range_list = Range_Map->new(Owner => $property);
6467 my $self = $class->SUPER::new(
6469 Complete_Name => $full_name,
6470 Full_Name => $full_name,
6471 _Property => $property,
6472 _Range_List => $range_list,
6473 Write_As_Invlist => 0,
6476 my $addr = do { no overloading; pack 'J', $self; };
6478 $anomalous_entries{$addr} = [];
6479 $default_map{$addr} = $default_map;
6480 $replacement_property{$addr} = $replacement_property;
6481 $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6482 && $replacement_property;
6483 $to_output_map{$addr} = $to_output_map;
6485 $self->initialize($initialize) if defined $initialize;
6492 qw("") => "_operator_stringify",
6495 sub _operator_stringify {
6498 my $name = $self->property->full_name;
6499 $name = '""' if $name eq "";
6500 return "Map table for Property '$name'";
6504 # Add a synonym for this table (which means the property itself)
6507 # Rest of parameters passed on.
6509 $self->SUPER::add_alias($name, $self->property, @_);
6514 # Add a range of code points to the list of specially-handled code
6515 # points. $MULTI_CP is assumed if the type of special is not passed
6524 my $type = delete $args{'Type'} || 0;
6525 # Rest of parameters passed on
6527 # Can't change the table if locked.
6528 return if $self->carp_if_locked;
6530 my $addr = do { no overloading; pack 'J', $self; };
6532 $self->_range_list->add_map($lower, $upper,
6539 sub append_to_body {
6540 # Adds to the written HERE document of the table's body any anomalous
6541 # entries in the table..
6544 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6546 my $addr = do { no overloading; pack 'J', $self; };
6548 return "" unless @{$anomalous_entries{$addr}};
6549 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6552 sub map_add_or_replace_non_nulls {
6553 # This adds the mappings in the table $other to $self. Non-null
6554 # mappings from $other override those in $self. It essentially merges
6555 # the two tables, with the second having priority except for null
6560 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6562 return if $self->carp_if_locked;
6564 if (! $other->isa(__PACKAGE__)) {
6565 Carp::my_carp_bug("$other should be a "
6573 my $addr = do { no overloading; pack 'J', $self; };
6574 my $other_addr = do { no overloading; pack 'J', $other; };
6576 local $to_trace = 0 if main::DEBUG;
6578 my $self_range_list = $self->_range_list;
6579 my $other_range_list = $other->_range_list;
6580 foreach my $range ($other_range_list->ranges) {
6581 my $value = $range->value;
6582 next if $value eq "";
6583 $self_range_list->_add_delete('+',
6587 Type => $range->type,
6588 Replace => $UNCONDITIONALLY);
6594 sub set_default_map {
6595 # Define what code points that are missing from the input files should
6600 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6602 my $addr = do { no overloading; pack 'J', $self; };
6604 # Convert the input to the standard equivalent, if any (won't have any
6605 # for $STRING properties)
6606 my $standard = $self->_find_table_from_alias->{$map};
6607 $map = $standard->name if defined $standard;
6609 # Warn if there already is a non-equivalent default map for this
6610 # property. Note that a default map can be a ref, which means that
6611 # what it actually means is delayed until later in the program, and it
6612 # IS permissible to override it here without a message.
6613 my $default_map = $default_map{$addr};
6614 if (defined $default_map
6615 && ! ref($default_map)
6616 && $default_map ne $map
6617 && main::Standardize($map) ne $default_map)
6619 my $property = $self->property;
6620 my $map_table = $property->table($map);
6621 my $default_table = $property->table($default_map);
6622 if (defined $map_table
6623 && defined $default_table
6624 && $map_table != $default_table)
6626 Carp::my_carp("Changing the default mapping for "
6628 . " from $default_map to $map'");
6632 $default_map{$addr} = $map;
6634 # Don't also create any missing table for this map at this point,
6635 # because if we did, it could get done before the main table add is
6636 # done for PropValueAliases.txt; instead the caller will have to make
6637 # sure it exists, if desired.
6642 # Returns boolean: should we write this map table?
6645 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6647 my $addr = do { no overloading; pack 'J', $self; };
6649 # If overridden, use that
6650 return $to_output_map{$addr} if defined $to_output_map{$addr};
6652 my $full_name = $self->full_name;
6653 return $global_to_output_map{$full_name}
6654 if defined $global_to_output_map{$full_name};
6656 # If table says to output, do so; if says to suppress it, do so.
6657 my $fate = $self->fate;
6658 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6659 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6660 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6662 my $type = $self->property->type;
6664 # Don't want to output binary map tables even for debugging.
6665 return 0 if $type == $BINARY;
6667 # But do want to output string ones. All the ones that remain to
6668 # be dealt with (i.e. which haven't explicitly been set to external)
6669 # are for internal Perl use only. The default for those that map to
6670 # $CODE_POINT and haven't been restricted to a single element range
6671 # is to use the adjusted form.
6672 if ($type == $STRING) {
6673 return $INTERNAL_MAP if $self->range_size_1
6674 || $default_map{$addr} ne $CODE_POINT;
6675 return $OUTPUT_ADJUSTED;
6678 # Otherwise is an $ENUM, do output it, for Perl's purposes
6679 return $INTERNAL_MAP;
6683 # Returns a Range_List that is gaps of the current table. That is,
6687 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6689 my $current = Range_List->new(Initialize => $self->_range_list,
6690 Owner => $self->property);
6696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6698 my $return = $self->SUPER::header();
6700 if ($self->to_output_map >= $INTERNAL_MAP) {
6701 $return .= $INTERNAL_ONLY_HEADER;
6704 my $property_name = $self->property->replacement_property;
6706 # The legacy-only properties were gotten above; but there are some
6707 # other properties whose files are in current use that have fixed
6709 $property_name = $self->property->full_name unless $property_name;
6713 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
6715 # This file is for internal use by core Perl only. It is retained for
6716 # backwards compatibility with applications that may have come to rely on it,
6717 # but its format and even its name or existence are subject to change without
6718 # notice in a future Perl version. Don't use it directly. Instead, its
6719 # contents are now retrievable through a stable API in the Unicode::UCD
6720 # module: Unicode::UCD::prop_invmap('$property_name').
6726 sub set_final_comment {
6727 # Just before output, create the comment that heads the file
6728 # containing this table.
6730 return unless $debugging_build;
6733 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6735 # No sense generating a comment if aren't going to write it out.
6736 return if ! $self->to_output_map;
6738 my $addr = do { no overloading; pack 'J', $self; };
6740 my $property = $self->property;
6742 # Get all the possible names for this property. Don't use any that
6743 # aren't ok for use in a file name, etc. This is perhaps causing that
6744 # flag to do double duty, and may have to be changed in the future to
6745 # have our own flag for just this purpose; but it works now to exclude
6746 # Perl generated synonyms from the lists for properties, where the
6747 # name is always the proper Unicode one.
6748 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6750 my $count = $self->count;
6751 my $default_map = $default_map{$addr};
6753 # The ranges that map to the default aren't output, so subtract that
6754 # to get those actually output. A property with matching tables
6755 # already has the information calculated.
6756 if ($property->type != $STRING) {
6757 $count -= $property->table($default_map)->count;
6759 elsif (defined $default_map) {
6761 # But for $STRING properties, must calculate now. Subtract the
6762 # count from each range that maps to the default.
6763 foreach my $range ($self->_range_list->ranges) {
6764 if ($range->value eq $default_map) {
6765 $count -= $range->end +1 - $range->start;
6771 # Get a string version of $count with underscores in large numbers,
6773 my $string_count = main::clarify_code_point_count($count);
6775 my $code_points = ($count == 1)
6776 ? 'single code point'
6777 : "$string_count code points";
6782 if (@property_aliases <= 1) {
6783 $mapping = 'mapping';
6784 $these_mappings = 'this mapping';
6788 $mapping = 'synonymous mappings';
6789 $these_mappings = 'these mappings';
6793 if ($count >= $MAX_UNICODE_CODEPOINTS) {
6794 $cp = "any code point in Unicode Version $string_version";
6798 if ($default_map eq "") {
6799 $map_to = 'the null string';
6801 elsif ($default_map eq $CODE_POINT) {
6805 $map_to = "'$default_map'";
6808 $cp = "the single code point";
6811 $cp = "one of the $code_points";
6813 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6818 my $status = $self->status;
6819 if ($status ne $NORMAL) {
6820 my $warn = uc $status_past_participles{$status};
6823 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6824 All property or property=value combinations contained in this file are $warn.
6825 See $unicode_reference_url for what this means.
6829 $comment .= "This file returns the $mapping:\n";
6831 my $ucd_accessible_name = "";
6832 my $full_name = $self->property->full_name;
6833 for my $i (0 .. @property_aliases - 1) {
6834 my $name = $property_aliases[$i]->name;
6835 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6836 if ($property_aliases[$i]->ucd) {
6837 if ($name eq $full_name) {
6838 $ucd_accessible_name = $full_name;
6840 elsif (! $ucd_accessible_name) {
6841 $ucd_accessible_name = $name;
6845 $comment .= "\nwhere 'cp' is $cp.";
6846 if ($ucd_accessible_name) {
6847 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6850 # And append any commentary already set from the actual property.
6851 $comment .= "\n\n" . $self->comment if $self->comment;
6852 if ($self->description) {
6853 $comment .= "\n\n" . join " ", $self->description;
6856 $comment .= "\n\n" . join " ", $self->note;
6860 if (! $self->perl_extension) {
6863 For information about what this property really means, see:
6864 $unicode_reference_url
6868 if ($count) { # Format differs for empty table
6869 $comment.= "\nThe format of the ";
6870 if ($self->range_size_1) {
6872 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6873 is in hex; MAPPING is what CODE_POINT maps to.
6878 # There are tables which end up only having one element per
6879 # range, but it is not worth keeping track of for making just
6880 # this comment a little better.
6882 non-comment portions of the main body of lines of this file is:
6883 START\\tSTOP\\tMAPPING where START is the starting code point of the
6884 range, in hex; STOP is the ending point, or if omitted, the range has just one
6885 code point; MAPPING is what each code point between START and STOP maps to.
6887 if ($self->output_range_counts) {
6889 Numbers in comments in [brackets] indicate how many code points are in the
6890 range (omitted when the range is a single code point or if the mapping is to
6896 $self->set_comment(main::join_lines($comment));
6900 my %swash_keys; # Makes sure don't duplicate swash names.
6902 # The remaining variables are temporaries used while writing each table,
6903 # to output special ranges.
6904 my @multi_code_point_maps; # Map is to more than one code point.
6906 sub handle_special_range {
6907 # Called in the middle of write when it finds a range it doesn't know
6912 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6914 my $addr = do { no overloading; pack 'J', $self; };
6916 my $type = $range->type;
6918 my $low = $range->start;
6919 my $high = $range->end;
6920 my $map = $range->value;
6922 # No need to output the range if it maps to the default.
6923 return if $map eq $default_map{$addr};
6925 my $property = $self->property;
6927 # Switch based on the map type...
6928 if ($type == $HANGUL_SYLLABLE) {
6930 # These are entirely algorithmically determinable based on
6931 # some constants furnished by Unicode; for now, just set a
6932 # flag to indicate that have them. After everything is figured
6933 # out, we will output the code that does the algorithm. (Don't
6934 # output them if not needed because we are suppressing this
6936 $has_hangul_syllables = 1 if $property->to_output_map;
6938 elsif ($type == $CP_IN_NAME) {
6940 # Code points whose name ends in their code point are also
6941 # algorithmically determinable, but need information about the map
6942 # to do so. Both the map and its inverse are stored in data
6943 # structures output in the file. They are stored in the mean time
6944 # in global lists The lists will be written out later into Name.pm,
6945 # which is created only if needed. In order to prevent duplicates
6946 # in the list, only add to them for one property, should multiple
6948 if ($needing_code_points_ending_in_code_point == 0) {
6949 $needing_code_points_ending_in_code_point = $property;
6951 if ($property == $needing_code_points_ending_in_code_point) {
6952 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6953 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6955 my $squeezed = $map =~ s/[-\s]+//gr;
6956 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6958 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6961 push @code_points_ending_in_code_point, { low => $low,
6967 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6969 # Multi-code point maps and null string maps have an entry
6970 # for each code point in the range. They use the same
6972 for my $code_point ($low .. $high) {
6974 # The pack() below can't cope with surrogates. XXX This may
6976 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6977 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
6981 # Generate the hash entries for these in the form that
6982 # utf8.c understands.
6986 foreach my $to (split " ", $map) {
6987 if ($to !~ /^$code_point_re$/) {
6988 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6991 $tostr .= sprintf "\\x{%s}", $to;
6992 $to = CORE::hex $to;
6994 $to_name .= " + " if $to_name;
6995 $to_chr .= main::display_chr($to);
6996 main::populate_char_info($to)
6997 if ! defined $viacode[$to];
6998 $to_name .= $viacode[$to];
7002 # The unpack yields a list of the bytes that comprise the
7003 # UTF-8 of $code_point, which are each placed in \xZZ format
7004 # and output in the %s to map to $tostr, so the result looks
7006 # "\xC4\xB0" => "\x{0069}\x{0307}",
7007 my $utf8 = sprintf(qq["%s" => "$tostr",],
7008 join("", map { sprintf "\\x%02X", $_ }
7009 unpack("U0C*", chr $code_point)));
7011 # Add a comment so that a human reader can more easily
7012 # see what's going on.
7013 push @multi_code_point_maps,
7014 sprintf("%-45s # U+%04X", $utf8, $code_point);
7016 $multi_code_point_maps[-1] .= " => $map";
7019 main::populate_char_info($code_point)
7020 if ! defined $viacode[$code_point];
7021 $multi_code_point_maps[-1] .= " '"
7022 . main::display_chr($code_point)
7023 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7028 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
7035 # Returns the string that should be output in the file before the main
7036 # body of this table. It isn't called until the main body is
7037 # calculated, saving a pass. The string includes some hash entries
7038 # identifying the format of the body, and what the single value should
7039 # be for all ranges missing from it. It also includes any code points
7040 # which have map_types that don't go in the main table.
7043 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7045 my $addr = do { no overloading; pack 'J', $self; };
7047 my $name = $self->property->swash_name;
7049 # Currently there is nothing in the pre_body unless a swash is being
7051 return unless defined $name;
7053 if (defined $swash_keys{$name}) {
7054 Carp::my_carp(main::join_lines(<<END
7055 Already created a swash name '$name' for $swash_keys{$name}. This means that
7056 the same name desired for $self shouldn't be used. Bad News. This must be
7057 fixed before production use, but proceeding anyway
7061 $swash_keys{$name} = "$self";
7065 # Here we assume we were called after have gone through the whole
7066 # file. If we actually generated anything for each map type, add its
7067 # respective header and trailer
7068 my $specials_name = "";
7069 if (@multi_code_point_maps) {
7070 $specials_name = "utf8::ToSpec$name";
7073 # Some code points require special handling because their mappings are each to
7074 # multiple code points. These do not appear in the main body, but are defined
7075 # in the hash below.
7077 # Each key is the string of N bytes that together make up the UTF-8 encoding
7078 # for the code point. (i.e. the same as looking at the code point's UTF-8
7079 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
7080 \%$specials_name = (
7082 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7085 my $format = $self->format;
7089 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7090 if ($output_adjusted) {
7091 if ($specials_name) {
7093 # The mappings in the non-hash portion of this file must be modified to get the
7094 # correct values by adding the code point ordinal number to each one that is
7100 # The mappings must be modified to get the correct values by adding the code
7101 # point ordinal number to each one that is numeric.
7108 # The name this swash is to be known by, with the format of the mappings in
7109 # the main body of the table, and what all code points missing from this file
7111 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7113 if ($specials_name) {
7115 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7118 my $default_map = $default_map{$addr};
7120 # For $CODE_POINT default maps and using adjustments, instead the default
7122 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
7123 . (($output_adjusted && $default_map eq $CODE_POINT)
7128 if ($default_map eq $CODE_POINT) {
7129 $return .= ' # code point maps to itself';
7131 elsif ($default_map eq "") {
7132 $return .= ' # code point maps to the null string';
7136 $return .= $pre_body;
7142 # Write the table to the file.
7145 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7147 my $addr = do { no overloading; pack 'J', $self; };
7149 # Clear the temporaries
7150 undef @multi_code_point_maps;
7152 # Calculate the format of the table if not already done.
7153 my $format = $self->format;
7154 my $type = $self->property->type;
7155 my $default_map = $self->default_map;
7156 if (! defined $format) {
7157 if ($type == $BINARY) {
7159 # Don't bother checking the values, because we elsewhere
7160 # verify that a binary table has only 2 values.
7161 $format = $BINARY_FORMAT;
7164 my @ranges = $self->_range_list->ranges;
7166 # default an empty table based on its type and default map
7169 # But it turns out that the only one we can say is a
7170 # non-string (besides binary, handled above) is when the
7171 # table is a string and the default map is to a code point
7172 if ($type == $STRING && $default_map eq $CODE_POINT) {
7173 $format = $HEX_FORMAT;
7176 $format = $STRING_FORMAT;
7181 # Start with the most restrictive format, and as we find
7182 # something that doesn't fit with that, change to the next
7183 # most restrictive, and so on.
7184 $format = $DECIMAL_FORMAT;
7185 foreach my $range (@ranges) {
7186 next if $range->type != 0; # Non-normal ranges don't
7187 # affect the main body
7188 my $map = $range->value;
7189 if ($map ne $default_map) {
7190 last if $format eq $STRING_FORMAT; # already at
7193 $format = $INTEGER_FORMAT
7194 if $format eq $DECIMAL_FORMAT
7195 && $map !~ / ^ [0-9] $ /x;
7196 $format = $FLOAT_FORMAT
7197 if $format eq $INTEGER_FORMAT
7198 && $map !~ / ^ -? [0-9]+ $ /x;
7199 $format = $RATIONAL_FORMAT
7200 if $format eq $FLOAT_FORMAT
7201 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7202 $format = $HEX_FORMAT
7203 if ($format eq $RATIONAL_FORMAT
7205 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7206 # Assume a leading zero means hex,
7207 # even if all digits are 0-9
7208 || ($format eq $INTEGER_FORMAT
7209 && $map =~ /^0[0-9A-F]/);
7210 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7211 && $map =~ /[^0-9A-F]/;
7216 } # end of calculating format
7218 if ($default_map eq $CODE_POINT
7219 && $format ne $HEX_FORMAT
7220 && ! defined $self->format) # manual settings are always
7223 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7226 # If the output is to be adjusted, the format of the table that gets
7227 # output is actually 'a' or 'ax' instead of whatever it is stored
7229 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7230 if ($output_adjusted) {
7231 if ($default_map eq $CODE_POINT) {
7232 $format = $HEX_ADJUST_FORMAT;
7235 $format = $ADJUST_FORMAT;
7239 $self->_set_format($format);
7241 return $self->SUPER::write(
7243 $default_map); # don't write defaulteds
7246 # Accessors for the underlying list that should fail if locked.
7256 return if $self->carp_if_locked;
7257 return $self->_range_list->$sub(@_);
7260 } # End closure for Map_Table
7262 package Match_Table;
7263 use parent '-norequire', '_Base_Table';
7265 # A Match table is one which is a list of all the code points that have
7266 # the same property and property value, for use in \p{property=value}
7267 # constructs in regular expressions. It adds very little data to the base
7268 # structure, but many methods, as these lists can be combined in many ways to
7270 # There are only a few concepts added:
7271 # 1) Equivalents and Relatedness.
7272 # Two tables can match the identical code points, but have different names.
7273 # This always happens when there is a perl single form extension
7274 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
7275 # tables are set to be related, with the Perl extension being a child, and
7276 # the Unicode property being the parent.
7278 # It may be that two tables match the identical code points and we don't
7279 # know if they are related or not. This happens most frequently when the
7280 # Block and Script properties have the exact range. But note that a
7281 # revision to Unicode could add new code points to the script, which would
7282 # now have to be in a different block (as the block was filled, or there
7283 # would have been 'Unknown' script code points in it and they wouldn't have
7284 # been identical). So we can't rely on any two properties from Unicode
7285 # always matching the same code points from release to release, and thus
7286 # these tables are considered coincidentally equivalent--not related. When
7287 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
7288 # 'leader', and the others are 'equivalents'. This concept is useful
7289 # to minimize the number of tables written out. Only one file is used for
7290 # any identical set of code points, with entries in Heavy.pl mapping all
7291 # the involved tables to it.
7293 # Related tables will always be identical; we set them up to be so. Thus
7294 # if the Unicode one is deprecated, the Perl one will be too. Not so for
7295 # unrelated tables. Relatedness makes generating the documentation easier.
7298 # Like equivalents, two tables may be the inverses of each other, the
7299 # intersection between them is null, and the union is every Unicode code
7300 # point. The two tables that occupy a binary property are necessarily like
7301 # this. By specifying one table as the complement of another, we can avoid
7302 # storing it on disk (using the other table and performing a fast
7303 # transform), and some memory and calculations.
7305 # 3) Conflicting. It may be that there will eventually be name clashes, with
7306 # the same name meaning different things. For a while, there actually were
7307 # conflicts, but they have so far been resolved by changing Perl's or
7308 # Unicode's definitions to match the other, but when this code was written,
7309 # it wasn't clear that that was what was going to happen. (Unicode changed
7310 # because of protests during their beta period.) Name clashes are warned
7311 # about during compilation, and the documentation. The generated tables
7312 # are sane, free of name clashes, because the code suppresses the Perl
7313 # version. But manual intervention to decide what the actual behavior
7314 # should be may be required should this happen. The introductory comments
7315 # have more to say about this.
7317 sub standardize { return main::standardize($_[0]); }
7318 sub trace { return main::trace(@_); }
7323 main::setup_package();
7326 # The leader table of this one; initially $self.
7327 main::set_access('leader', \%leader, 'r');
7330 # An array of any tables that have this one as their leader
7331 main::set_access('equivalents', \%equivalents, 'readable_array');
7334 # The parent table to this one, initially $self. This allows us to
7335 # distinguish between equivalent tables that are related (for which this
7336 # is set to), and those which may not be, but share the same output file
7337 # because they match the exact same set of code points in the current
7339 main::set_access('parent', \%parent, 'r');
7342 # An array of any tables that have this one as their parent
7343 main::set_access('children', \%children, 'readable_array');
7346 # Array of any tables that would have the same name as this one with
7347 # a different meaning. This is used for the generated documentation.
7348 main::set_access('conflicting', \%conflicting, 'readable_array');
7351 # Set in the constructor for tables that are expected to match all code
7353 main::set_access('matches_all', \%matches_all, 'r');
7356 # Points to the complement that this table is expressed in terms of; 0 if
7358 main::set_access('complement', \%complement, 'r');
7365 # The property for which this table is a listing of property values.
7366 my $property = delete $args{'_Property'};
7368 my $name = delete $args{'Name'};
7369 my $full_name = delete $args{'Full_Name'};
7370 $full_name = $name if ! defined $full_name;
7373 my $initialize = delete $args{'Initialize'};
7374 my $matches_all = delete $args{'Matches_All'} || 0;
7375 my $format = delete $args{'Format'};
7376 # Rest of parameters passed on.
7378 my $range_list = Range_List->new(Initialize => $initialize,
7379 Owner => $property);
7381 my $complete = $full_name;
7382 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
7383 # but this helps debug if it
7385 # The complete name for a match table includes it's property in a
7386 # compound form 'property=table', except if the property is the
7387 # pseudo-property, perl, in which case it is just the single form,
7388 # 'table' (If you change the '=' must also change the ':' in lots of
7389 # places in this program that assume an equal sign)
7390 $complete = $property->full_name . "=$complete" if $property != $perl;
7392 my $self = $class->SUPER::new(%args,
7394 Complete_Name => $complete,
7395 Full_Name => $full_name,
7396 _Property => $property,
7397 _Range_List => $range_list,
7398 Format => $EMPTY_FORMAT,
7399 Write_As_Invlist => 1,
7401 my $addr = do { no overloading; pack 'J', $self; };
7403 $conflicting{$addr} = [ ];
7404 $equivalents{$addr} = [ ];
7405 $children{$addr} = [ ];
7406 $matches_all{$addr} = $matches_all;
7407 $leader{$addr} = $self;
7408 $parent{$addr} = $self;
7409 $complement{$addr} = 0;
7411 if (defined $format && $format ne $EMPTY_FORMAT) {
7412 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
7418 # See this program's beginning comment block about overloading these.
7421 qw("") => "_operator_stringify",
7425 return if $self->carp_if_locked;
7433 return $self->_range_list + $other;
7439 return $self->_range_list & $other;
7444 my $reversed = shift;
7447 Carp::my_carp_bug("Bad news. Can't cope with '"
7451 . "'. undef returned.");
7455 return if $self->carp_if_locked;
7457 my $addr = do { no overloading; pack 'J', $self; };
7461 # Change the range list of this table to be the
7463 $self->_set_range_list($self->_range_list
7466 else { # $other is just a simple value
7467 $self->add_range($other, $other);
7474 my $reversed = shift;
7477 Carp::my_carp_bug("Bad news. Can't cope with '"
7481 . "'. undef returned.");
7485 return if $self->carp_if_locked;
7486 $self->_set_range_list($self->_range_list & $other);
7489 '-' => sub { my $self = shift;
7491 my $reversed = shift;
7493 Carp::my_carp_bug("Bad news. Can't cope with '"
7497 . "'. undef returned.");
7501 return $self->_range_list - $other;
7503 '~' => sub { my $self = shift;
7504 return ~ $self->_range_list;
7508 sub _operator_stringify {
7511 my $name = $self->complete_name;
7512 return "Table '$name'";
7516 # Returns the range list associated with this table, which will be the
7517 # complement's if it has one.
7521 if (($complement = $self->complement) != 0) {
7522 return ~ $complement->_range_list;
7525 return $self->SUPER::_range_list;
7530 # Add a synonym for this table. See the comments in the base class
7534 # Rest of parameters passed on.
7536 $self->SUPER::add_alias($name, $self, @_);
7540 sub add_conflicting {
7541 # Add the name of some other object to the list of ones that name
7542 # clash with this match table.
7545 my $conflicting_name = shift; # The name of the conflicting object
7546 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
7547 my $conflicting_object = shift; # Optional, the conflicting object
7548 # itself. This is used to
7549 # disambiguate the text if the input
7550 # name is identical to any of the
7551 # aliases $self is known by.
7552 # Sometimes the conflicting object is
7553 # merely hypothetical, so this has to
7554 # be an optional parameter.
7555 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7557 my $addr = do { no overloading; pack 'J', $self; };
7559 # Check if the conflicting name is exactly the same as any existing
7560 # alias in this table (as long as there is a real object there to
7561 # disambiguate with).
7562 if (defined $conflicting_object) {
7563 foreach my $alias ($self->aliases) {
7564 if ($alias->name eq $conflicting_name) {
7566 # Here, there is an exact match. This results in
7567 # ambiguous comments, so disambiguate by changing the
7568 # conflicting name to its object's complete equivalent.
7569 $conflicting_name = $conflicting_object->complete_name;
7575 # Convert to the \p{...} final name
7576 $conflicting_name = "\\$p" . "{$conflicting_name}";
7579 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7581 push @{$conflicting{$addr}}, $conflicting_name;
7586 sub is_set_equivalent_to {
7587 # Return boolean of whether or not the other object is a table of this
7588 # type and has been marked equivalent to this one.
7592 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7594 return 0 if ! defined $other; # Can happen for incomplete early
7596 unless ($other->isa(__PACKAGE__)) {
7597 my $ref_other = ref $other;
7598 my $ref_self = ref $self;
7599 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
7603 # Two tables are equivalent if they have the same leader.
7605 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7609 sub set_equivalent_to {
7610 # Set $self equivalent to the parameter table.
7611 # The required Related => 'x' parameter is a boolean indicating
7612 # whether these tables are related or not. If related, $other becomes
7613 # the 'parent' of $self; if unrelated it becomes the 'leader'
7615 # Related tables share all characteristics except names; equivalents
7616 # not quite so many.
7617 # If they are related, one must be a perl extension. This is because
7618 # we can't guarantee that Unicode won't change one or the other in a
7619 # later release even if they are identical now.
7625 my $related = delete $args{'Related'};
7627 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7629 return if ! defined $other; # Keep on going; happens in some early
7632 if (! defined $related) {
7633 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7637 # If already are equivalent, no need to re-do it; if subroutine
7638 # returns null, it found an error, also do nothing
7639 my $are_equivalent = $self->is_set_equivalent_to($other);
7640 return if ! defined $are_equivalent || $are_equivalent;
7642 my $addr = do { no overloading; pack 'J', $self; };
7643 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7646 if ($current_leader->perl_extension) {
7647 if ($other->perl_extension) {
7648 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7651 } elsif ($self->property != $other->property # Depending on
7657 && ! $other->perl_extension)
7659 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
7664 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7665 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
7669 my $leader = do { no overloading; pack 'J', $current_leader; };
7670 my $other_addr = do { no overloading; pack 'J', $other; };
7672 # Any tables that are equivalent to or children of this table must now
7673 # instead be equivalent to or (children) to the new leader (parent),
7674 # still equivalent. The equivalency includes their matches_all info,
7675 # and for related tables, their fate and status.
7676 # All related tables are of necessity equivalent, but the converse
7677 # isn't necessarily true
7678 my $status = $other->status;
7679 my $status_info = $other->status_info;
7680 my $fate = $other->fate;
7681 my $matches_all = $matches_all{other_addr};
7682 my $caseless_equivalent = $other->caseless_equivalent;
7683 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7684 next if $table == $other;
7685 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7687 my $table_addr = do { no overloading; pack 'J', $table; };
7688 $leader{$table_addr} = $other;
7689 $matches_all{$table_addr} = $matches_all;
7690 $self->_set_range_list($other->_range_list);
7691 push @{$equivalents{$other_addr}}, $table;
7693 $parent{$table_addr} = $other;
7694 push @{$children{$other_addr}}, $table;
7695 $table->set_status($status, $status_info);
7697 # This reason currently doesn't get exposed outside; otherwise
7698 # would have to look up the parent's reason and use it instead.
7699 $table->set_fate($fate, "Parent's fate");
7701 $self->set_caseless_equivalent($caseless_equivalent);
7705 # Now that we've declared these to be equivalent, any changes to one
7706 # of the tables would invalidate that equivalency.
7712 sub set_complement {
7713 # Set $self to be the complement of the parameter table. $self is
7714 # locked, as what it contains should all come from the other table.
7720 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7722 if ($other->complement != 0) {
7723 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7726 my $addr = do { no overloading; pack 'J', $self; };
7727 $complement{$addr} = $other;
7732 sub add_range { # Add a range to the list for this table.
7734 # Rest of parameters passed on
7736 return if $self->carp_if_locked;
7737 return $self->_range_list->add_range(@_);
7742 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7744 # All match tables are to be used only by the Perl core.
7745 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7748 sub pre_body { # Does nothing for match tables.
7752 sub append_to_body { # Does nothing for match tables.
7760 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7762 $self->SUPER::set_fate($fate, $reason);
7764 # All children share this fate
7765 foreach my $child ($self->children) {
7766 $child->set_fate($fate, $reason);
7773 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7775 return $self->SUPER::write(0); # No adjustments
7778 sub set_final_comment {
7779 # This creates a comment for the file that is to hold the match table
7780 # $self. It is somewhat convoluted to make the English read nicely,
7781 # but, heh, it's just a comment.
7782 # This should be called only with the leader match table of all the
7783 # ones that share the same file. It lists all such tables, ordered so
7784 # that related ones are together.
7786 return unless $debugging_build;
7788 my $leader = shift; # Should only be called on the leader table of
7789 # an equivalent group
7790 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7792 my $addr = do { no overloading; pack 'J', $leader; };
7794 if ($leader{$addr} != $leader) {
7795 Carp::my_carp_bug(<<END
7796 set_final_comment() must be called on a leader table, which $leader is not.
7797 It is equivalent to $leader{$addr}. No comment created
7803 # Get the number of code points matched by each of the tables in this
7804 # file, and add underscores for clarity.
7805 my $count = $leader->count;
7807 my $non_unicode_string;
7808 if ($count > $MAX_UNICODE_CODEPOINTS) {
7809 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
7810 - $MAX_UNICODE_CODEPOINT);
7811 $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
7814 $unicode_count = $count;
7815 $non_unicode_string = "";
7817 my $string_count = main::clarify_code_point_count($unicode_count);
7819 my $loose_count = 0; # how many aliases loosely matched
7820 my $compound_name = ""; # ? Are any names compound?, and if so, an
7822 my $properties_with_compound_names = 0; # count of these
7825 my %flags; # The status flags used in the file
7826 my $total_entries = 0; # number of entries written in the comment
7827 my $matches_comment = ""; # The portion of the comment about the
7829 my @global_comments; # List of all the tables' comments that are
7830 # there before this routine was called.
7831 my $has_ucd_alias = 0; # If there is an alias that is accessible via
7832 # Unicode::UCD. If not, then don't say it is
7835 # Get list of all the parent tables that are equivalent to this one
7836 # (including itself).
7837 my @parents = grep { $parent{main::objaddr $_} == $_ }
7838 main::uniques($leader, @{$equivalents{$addr}});
7839 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
7842 for my $parent (@parents) {
7844 my $property = $parent->property;
7846 # Special case 'N' tables in properties with two match tables when
7847 # the other is a 'Y' one. These are likely to be binary tables,
7848 # but not necessarily. In either case, \P{} will match the
7849 # complement of \p{}, and so if something is a synonym of \p, the
7850 # complement of that something will be the synonym of \P. This
7851 # would be true of any property with just two match tables, not
7852 # just those whose values are Y and N; but that would require a
7853 # little extra work, and there are none such so far in Unicode.
7854 my $perl_p = 'p'; # which is it? \p{} or \P{}
7855 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
7857 if (scalar $property->tables == 2
7858 && $parent == $property->table('N')
7859 && defined (my $yes = $property->table('Y')))
7861 my $yes_addr = do { no overloading; pack 'J', $yes; };
7863 = grep { $_->property == $perl }
7866 $parent{$yes_addr}->children);
7868 # But these synonyms are \P{} ,not \p{}
7872 my @description; # Will hold the table description
7873 my @note; # Will hold the table notes.
7874 my @conflicting; # Will hold the table conflicts.
7876 # Look at the parent, any yes synonyms, and all the children
7877 my $parent_addr = do { no overloading; pack 'J', $parent; };
7878 for my $table ($parent,
7880 @{$children{$parent_addr}})
7882 my $table_addr = do { no overloading; pack 'J', $table; };
7883 my $table_property = $table->property;
7885 # Tables are separated by a blank line to create a grouping.
7886 $matches_comment .= "\n" if $matches_comment;
7888 # The table is named based on the property and value
7889 # combination it is for, like script=greek. But there may be
7890 # a number of synonyms for each side, like 'sc' for 'script',
7891 # and 'grek' for 'greek'. Any combination of these is a valid
7892 # name for this table. In this case, there are three more,
7893 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
7894 # listing all possible combinations in the comment, we make
7895 # sure that each synonym occurs at least once, and add
7896 # commentary that the other combinations are possible.
7897 # Because regular expressions don't recognize things like
7898 # \p{jsn=}, only look at non-null right-hand-sides
7899 my @property_aliases = $table_property->aliases;
7900 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7902 # The alias lists above are already ordered in the order we
7903 # want to output them. To ensure that each synonym is listed,
7904 # we must use the max of the two numbers. But if there are no
7905 # legal synonyms (nothing in @table_aliases), then we don't
7907 my $listed_combos = (@table_aliases)
7908 ? main::max(scalar @table_aliases,
7909 scalar @property_aliases)
7911 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7914 my $property_had_compound_name = 0;
7916 for my $i (0 .. $listed_combos - 1) {
7919 # The current alias for the property is the next one on
7920 # the list, or if beyond the end, start over. Similarly
7921 # for the table (\p{prop=table})
7922 my $property_alias = $property_aliases
7923 [$i % @property_aliases]->name;
7924 my $table_alias_object = $table_aliases
7925 [$i % @table_aliases];
7926 my $table_alias = $table_alias_object->name;
7927 my $loose_match = $table_alias_object->loose_match;
7928 $has_ucd_alias |= $table_alias_object->ucd;
7930 if ($table_alias !~ /\D/) { # Clarify large numbers.
7931 $table_alias = main::clarify_number($table_alias)
7934 # Add a comment for this alias combination
7935 my $current_match_comment;
7936 if ($table_property == $perl) {
7937 $current_match_comment = "\\$perl_p"
7941 $current_match_comment
7942 = "\\p{$property_alias=$table_alias}";
7943 $property_had_compound_name = 1;
7946 # Flag any abnormal status for this table.
7947 my $flag = $property->status
7949 || $table_alias_object->status;
7950 if ($flag && $flag ne $PLACEHOLDER) {
7951 $flags{$flag} = $status_past_participles{$flag};
7956 # Pretty up the comment. Note the \b; it says don't make
7957 # this line a continuation.
7958 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7961 $current_match_comment);
7962 } # End of generating the entries for this table.
7964 # Save these for output after this group of related tables.
7965 push @description, $table->description;
7966 push @note, $table->note;
7967 push @conflicting, $table->conflicting;
7969 # And this for output after all the tables.
7970 push @global_comments, $table->comment;
7972 # Compute an alternate compound name using the final property
7973 # synonym and the first table synonym with a colon instead of
7974 # the equal sign used elsewhere.
7975 if ($property_had_compound_name) {
7976 $properties_with_compound_names ++;
7977 if (! $compound_name || @property_aliases > 1) {
7978 $compound_name = $property_aliases[-1]->name
7980 . $table_aliases[0]->name;
7983 } # End of looping through all children of this table
7985 # Here have assembled in $matches_comment all the related tables
7986 # to the current parent (preceded by the same info for all the
7987 # previous parents). Put out information that applies to all of
7988 # the current family.
7991 # But output the conflicting information now, as it applies to
7993 my $conflicting = join ", ", @conflicting;
7995 $matches_comment .= <<END;
7997 Note that contrary to what you might expect, the above is NOT the same as
7999 $matches_comment .= "any of: " if @conflicting > 1;
8000 $matches_comment .= "$conflicting\n";
8004 $matches_comment .= "\n Meaning: "
8005 . join('; ', @description)
8009 $matches_comment .= "\n Note: "
8010 . join("\n ", @note)
8013 } # End of looping through all tables
8015 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8021 if ($unicode_count == 1) {
8023 $code_points = 'single code point';
8027 $code_points = "$string_count code points";
8032 if ($total_entries == 1) {
8035 $any_of_these = 'this'
8038 $synonyms = " any of the following regular expression constructs";
8039 $entries = 'entries';
8040 $any_of_these = 'any of these'
8044 if ($has_ucd_alias) {
8045 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8047 if ($has_unrelated) {
8049 This file is for tables that are not necessarily related: To conserve
8050 resources, every table that matches the identical set of code points in this
8051 version of Unicode uses this file. Each one is listed in a separate group
8052 below. It could be that the tables will match the same set of code points in
8053 other Unicode releases, or it could be purely coincidence that they happen to
8054 be the same in Unicode $string_version, and hence may not in other versions.
8060 foreach my $flag (sort keys %flags) {
8062 '$flag' below means that this form is $flags{$flag}.
8063 Consult $pod_file.pod
8069 if ($total_entries == 0) {
8070 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
8072 This file returns the $code_points in Unicode Version
8074 $leader, but it is inaccessible through Perl regular expressions, as
8075 "\\p{prop=}" is not recognized.
8080 This file returns the $code_points in Unicode Version
8081 $string_version that
8085 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8086 including if adding or subtracting white space, underscore, and hyphen
8087 characters matters or doesn't matter, and other permissible syntactic
8088 variants. Upper/lower case distinctions never matter.
8092 if ($compound_name) {
8095 A colon can be substituted for the equals sign, and
8097 if ($properties_with_compound_names > 1) {
8099 within each group above,
8102 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8104 # Note the \b below, it says don't make that line a continuation.
8106 anything to the left of the equals (or colon) can be combined with anything to
8107 the right. Thus, for example,
8113 # And append any comment(s) from the actual tables. They are all
8114 # gathered here, so may not read all that well.
8115 if (@global_comments) {
8116 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8119 if ($count) { # The format differs if no code points, and needs no
8120 # explanation in that case
8121 if ($leader->write_as_invlist) {
8124 The first data line of this file begins with the letter V to indicate it is in
8125 inversion list format. The number following the V gives the number of lines
8126 remaining. Each of those remaining lines is a single number representing the
8127 starting code point of a range which goes up to but not including the number
8128 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8129 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8130 the property. The final line's range extends to the platform's infinity.
8135 The format of the lines of this file is:
8136 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8137 STOP is the ending point, or if omitted, the range has just one code point.
8140 if ($leader->output_range_counts) {
8142 Numbers in comments in [brackets] indicate how many code points are in the
8148 $leader->set_comment(main::join_lines($comment));
8152 # Accessors for the underlying list
8154 get_valid_code_point
8155 get_invalid_code_point
8163 return $self->_range_list->$sub(@_);
8166 } # End closure for Match_Table
8170 # The Property class represents a Unicode property, or the $perl
8171 # pseudo-property. It contains a map table initialized empty at construction
8172 # time, and for properties accessible through regular expressions, various
8173 # match tables, created through the add_match_table() method, and referenced
8174 # by the table('NAME') or tables() methods, the latter returning a list of all
8175 # of the match tables. Otherwise table operations implicitly are for the map
8178 # Most of the data in the property is actually about its map table, so it
8179 # mostly just uses that table's accessors for most methods. The two could
8180 # have been combined into one object, but for clarity because of their
8181 # differing semantics, they have been kept separate. It could be argued that
8182 # the 'file' and 'directory' fields should be kept with the map table.
8184 # Each property has a type. This can be set in the constructor, or in the
8185 # set_type accessor, but mostly it is figured out by the data. Every property
8186 # starts with unknown type, overridden by a parameter to the constructor, or
8187 # as match tables are added, or ranges added to the map table, the data is
8188 # inspected, and the type changed. After the table is mostly or entirely
8189 # filled, compute_type() should be called to finalize they analysis.
8191 # There are very few operations defined. One can safely remove a range from
8192 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8193 # table to this one, replacing any in the intersection of the two.
8195 sub standardize { return main::standardize($_[0]); }
8196 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8200 # This hash will contain as keys, all the aliases of all properties, and
8201 # as values, pointers to their respective property objects. This allows
8202 # quick look-up of a property from any of its names.
8203 my %alias_to_property_of;
8205 sub dump_alias_to_property_of {
8208 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8213 # This is a package subroutine, not called as a method.
8214 # If the single parameter is a literal '*' it returns a list of all
8215 # defined properties.
8216 # Otherwise, the single parameter is a name, and it returns a pointer
8217 # to the corresponding property object, or undef if none.
8219 # Properties can have several different names. The 'standard' form of
8220 # each of them is stored in %alias_to_property_of as they are defined.
8221 # But it's possible that this subroutine will be called with some
8222 # variant, so if the initial lookup fails, it is repeated with the
8223 # standardized form of the input name. If found, besides returning the
8224 # result, the input name is added to the list so future calls won't
8225 # have to do the conversion again.
8229 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8231 if (! defined $name) {
8232 Carp::my_carp_bug("Undefined input property. No action taken.");
8236 return main::uniques(values %alias_to_property_of) if $name eq '*';
8238 # Return cached result if have it.
8239 my $result = $alias_to_property_of{$name};
8240 return $result if defined $result;
8242 # Convert the input to standard form.
8243 my $standard_name = standardize($name);
8245 $result = $alias_to_property_of{$standard_name};
8246 return unless defined $result; # Don't cache undefs
8248 # Cache the result before returning it.
8249 $alias_to_property_of{$name} = $result;
8254 main::setup_package();
8257 # A pointer to the map table object for this property
8258 main::set_access('map', \%map);
8261 # The property's full name. This is a duplicate of the copy kept in the
8262 # map table, but is needed because stringify needs it during
8263 # construction of the map table, and then would have a chicken before egg
8265 main::set_access('full_name', \%full_name, 'r');
8268 # This hash will contain as keys, all the aliases of any match tables
8269 # attached to this property, and as values, the pointers to their
8270 # respective tables. This allows quick look-up of a table from any of its
8272 main::set_access('table_ref', \%table_ref);
8275 # The type of the property, $ENUM, $BINARY, etc
8276 main::set_access('type', \%type, 'r');
8279 # The filename where the map table will go (if actually written).
8280 # Normally defaulted, but can be overridden.
8281 main::set_access('file', \%file, 'r', 's');
8284 # The directory where the map table will go (if actually written).
8285 # Normally defaulted, but can be overridden.
8286 main::set_access('directory', \%directory, 's');
8288 my %pseudo_map_type;
8289 # This is used to affect the calculation of the map types for all the
8290 # ranges in the table. It should be set to one of the values that signify
8291 # to alter the calculation.
8292 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8294 my %has_only_code_point_maps;
8295 # A boolean used to help in computing the type of data in the map table.
8296 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8299 # A list of the first few distinct mappings this property has. This is
8300 # used to disambiguate between binary and enum property types, so don't
8301 # have to keep more than three.
8302 main::set_access('unique_maps', \%unique_maps);
8304 my %pre_declared_maps;
8305 # A boolean that gives whether the input data should declare all the
8306 # tables used, or not. If the former, unknown ones raise a warning.
8307 main::set_access('pre_declared_maps',
8308 \%pre_declared_maps, 'r', 's');
8311 # The only required parameter is the positionally first, name. All
8312 # other parameters are key => value pairs. See the documentation just
8313 # above for the meanings of the ones not passed directly on to the map
8314 # table constructor.
8317 my $name = shift || "";
8319 my $self = property_ref($name);
8320 if (defined $self) {
8321 my $options_string = join ", ", @_;
8322 $options_string = ". Ignoring options $options_string" if $options_string;
8323 Carp::my_carp("$self is already in use. Using existing one$options_string;");
8329 $self = bless \do { my $anonymous_scalar }, $class;
8330 my $addr = do { no overloading; pack 'J', $self; };
8332 $directory{$addr} = delete $args{'Directory'};
8333 $file{$addr} = delete $args{'File'};
8334 $full_name{$addr} = delete $args{'Full_Name'} || $name;
8335 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8336 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8337 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8338 # Starting in this release, property
8339 # values should be defined for all
8340 # properties, except those overriding this
8341 // $v_version ge v5.1.0;
8343 # Rest of parameters passed on.
8345 $has_only_code_point_maps{$addr} = 1;
8346 $table_ref{$addr} = { };
8347 $unique_maps{$addr} = { };
8349 $map{$addr} = Map_Table->new($name,
8350 Full_Name => $full_name{$addr},
8351 _Alias_Hash => \%alias_to_property_of,
8357 # See this program's beginning comment block about overloading the copy
8358 # constructor. Few operations are defined on properties, but a couple are
8359 # useful. It is safe to take the inverse of a property, and to remove a
8360 # single code point from it.
8363 qw("") => "_operator_stringify",
8364 "." => \&main::_operator_dot,
8365 ".=" => \&main::_operator_dot_equal,
8366 '==' => \&main::_operator_equal,
8367 '!=' => \&main::_operator_not_equal,
8368 '=' => sub { return shift },
8369 '-=' => "_minus_and_equal",
8372 sub _operator_stringify {
8373 return "Property '" . shift->full_name . "'";
8376 sub _minus_and_equal {
8377 # Remove a single code point from the map table of a property.
8381 my $reversed = shift;
8382 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8385 Carp::my_carp_bug("Bad news. Can't cope with a "
8387 . " argument to '-='. Subtraction ignored.");
8390 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
8391 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
8393 . " from a non-object. undef returned.");
8398 $map{pack 'J', $self}->delete_range($other, $other);
8403 sub add_match_table {
8404 # Add a new match table for this property, with name given by the
8405 # parameter. It returns a pointer to the table.
8411 my $addr = do { no overloading; pack 'J', $self; };
8413 my $table = $table_ref{$addr}{$name};
8414 my $standard_name = main::standardize($name);
8416 || (defined ($table = $table_ref{$addr}{$standard_name})))
8418 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
8419 $table_ref{$addr}{$name} = $table;
8424 # See if this is a perl extension, if not passed in.
8425 my $perl_extension = delete $args{'Perl_Extension'};
8427 = $self->perl_extension if ! defined $perl_extension;
8429 $table = Match_Table->new(
8431 Perl_Extension => $perl_extension,
8432 _Alias_Hash => $table_ref{$addr},
8435 # gets property's fate and status by default,
8436 # except if the name begind with an
8437 # underscore, default it to internal
8438 Fate => ($name =~ /^_/)
8441 Status => $self->status,
8442 _Status_Info => $self->status_info,
8444 return unless defined $table;
8447 # Save the names for quick look up
8448 $table_ref{$addr}{$standard_name} = $table;
8449 $table_ref{$addr}{$name} = $table;
8451 # Perhaps we can figure out the type of this property based on the
8452 # fact of adding this match table. First, string properties don't
8453 # have match tables; second, a binary property can't have 3 match
8455 if ($type{$addr} == $UNKNOWN) {
8456 $type{$addr} = $NON_STRING;
8458 elsif ($type{$addr} == $STRING) {
8459 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
8460 $type{$addr} = $NON_STRING;
8462 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8463 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8464 if ($type{$addr} == $BINARY) {
8465 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
8467 $type{$addr} = $ENUM;
8474 sub delete_match_table {
8475 # Delete the table referred to by $2 from the property $1.
8478 my $table_to_remove = shift;
8479 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8481 my $addr = do { no overloading; pack 'J', $self; };
8483 # Remove all names that refer to it.
8484 foreach my $key (keys %{$table_ref{$addr}}) {
8485 delete $table_ref{$addr}{$key}
8486 if $table_ref{$addr}{$key} == $table_to_remove;
8489 $table_to_remove->DESTROY;
8494 # Return a pointer to the match table (with name given by the
8495 # parameter) associated with this property; undef if none.
8499 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8501 my $addr = do { no overloading; pack 'J', $self; };
8503 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
8505 # If quick look-up failed, try again using the standard form of the
8506 # input name. If that succeeds, cache the result before returning so
8507 # won't have to standardize this input name again.
8508 my $standard_name = main::standardize($name);
8509 return unless defined $table_ref{$addr}{$standard_name};
8511 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
8512 return $table_ref{$addr}{$name};
8516 # Return a list of pointers to all the match tables attached to this
8520 return main::uniques(values %{$table_ref{pack 'J', shift}});
8524 # Returns the directory the map table for this property should be
8525 # output in. If a specific directory has been specified, that has
8526 # priority; 'undef' is returned if the type isn't defined;
8527 # or $map_directory for everything else.
8529 my $addr = do { no overloading; pack 'J', shift; };
8531 return $directory{$addr} if defined $directory{$addr};
8532 return undef if $type{$addr} == $UNKNOWN;
8533 return $map_directory;
8537 # Return the name that is used to both:
8538 # 1) Name the file that the map table is written to.
8539 # 2) The name of swash related stuff inside that file.
8540 # The reason for this is that the Perl core historically has used
8541 # certain names that aren't the same as the Unicode property names.
8542 # To continue using these, $file is hard-coded in this file for those,
8543 # but otherwise the standard name is used. This is different from the
8544 # external_name, so that the rest of the files, like in lib can use
8545 # the standard name always, without regard to historical precedent.
8548 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8550 my $addr = do { no overloading; pack 'J', $self; };
8552 # Swash names are used only on either
8553 # 1) legacy-only properties, because the formats for these are
8554 # unchangeable, and they have had these lines in them; or
8555 # 2) regular map tables; otherwise there should be no access to the
8556 # property map table from other parts of Perl.
8557 return if $map{$addr}->fate != $ORDINARY
8558 && $map{$addr}->fate != $LEGACY_ONLY;
8560 return $file{$addr} if defined $file{$addr};
8561 return $map{$addr}->external_name;
8564 sub to_create_match_tables {
8565 # Returns a boolean as to whether or not match tables should be
8566 # created for this property.
8569 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8571 # The whole point of this pseudo property is match tables.
8572 return 1 if $self == $perl;
8574 my $addr = do { no overloading; pack 'J', $self; };
8576 # Don't generate tables of code points that match the property values
8577 # of a string property. Such a list would most likely have many
8578 # property values, each with just one or very few code points mapping
8580 return 0 if $type{$addr} == $STRING;
8582 # Don't generate anything for unimplemented properties.
8583 return 0 if grep { $self->complete_name eq $_ }
8584 @unimplemented_properties;
8589 sub property_add_or_replace_non_nulls {
8590 # This adds the mappings in the property $other to $self. Non-null
8591 # mappings from $other override those in $self. It essentially merges
8592 # the two properties, with the second having priority except for null
8597 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8599 if (! $other->isa(__PACKAGE__)) {
8600 Carp::my_carp_bug("$other should be a "
8609 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8613 # Certain tables are not generally written out to files, but
8614 # Unicode::UCD has the intelligence to know that the file for $self
8615 # can be used to reconstruct those tables. This routine just changes
8616 # things so that UCD pod entries for those suppressed tables are
8617 # generated, so the fact that a proxy is used is invisible to the
8622 foreach my $property_name (@_) {
8623 my $ref = property_ref($property_name);
8624 next if $ref->to_output_map;
8625 $ref->set_fate($MAP_PROXIED);
8630 # Set the type of the property. Mostly this is figured out by the
8631 # data in the table. But this is used to set it explicitly. The
8632 # reason it is not a standard accessor is that when setting a binary
8633 # property, we need to make sure that all the true/false aliases are
8634 # present, as they were omitted in early Unicode releases.
8638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8642 && $type != $FORCED_BINARY
8643 && $type != $STRING)
8645 Carp::my_carp("Unrecognized type '$type'. Type not set");
8649 { no overloading; $type{pack 'J', $self} = $type; }
8650 return if $type != $BINARY && $type != $FORCED_BINARY;
8652 my $yes = $self->table('Y');
8653 $yes = $self->table('Yes') if ! defined $yes;
8654 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8657 # Add aliases in order wanted, duplicates will be ignored. We use a
8658 # binary property present in all releases for its ordered lists of
8659 # true/false aliases. Note, that could run into problems in
8660 # outputting things in that we don't distinguish between the name and
8661 # full name of these. Hopefully, if the table was already created
8662 # before this code is executed, it was done with these set properly.
8663 my $bm = property_ref("Bidi_Mirrored");
8664 foreach my $alias ($bm->table("Y")->aliases) {
8665 $yes->add_alias($alias->name);
8667 my $no = $self->table('N');
8668 $no = $self->table('No') if ! defined $no;
8669 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8670 foreach my $alias ($bm->table("N")->aliases) {
8671 $no->add_alias($alias->name);
8678 # Add a map to the property's map table. This also keeps
8679 # track of the maps so that the property type can be determined from
8683 my $start = shift; # First code point in range
8684 my $end = shift; # Final code point in range
8685 my $map = shift; # What the range maps to.
8686 # Rest of parameters passed on.
8688 my $addr = do { no overloading; pack 'J', $self; };
8690 # If haven't the type of the property, gather information to figure it
8692 if ($type{$addr} == $UNKNOWN) {
8694 # If the map contains an interior blank or dash, or most other
8695 # nonword characters, it will be a string property. This
8696 # heuristic may actually miss some string properties. If so, they
8697 # may need to have explicit set_types called for them. This
8698 # happens in the Unihan properties.
8699 if ($map =~ / (?<= . ) [ -] (?= . ) /x
8700 || $map =~ / [^\w.\/\ -] /x)
8702 $self->set_type($STRING);
8704 # $unique_maps is used for disambiguating between ENUM and
8705 # BINARY later; since we know the property is not going to be
8706 # one of those, no point in keeping the data around
8707 undef $unique_maps{$addr};
8711 # Not necessarily a string. The final decision has to be
8712 # deferred until all the data are in. We keep track of if all
8713 # the values are code points for that eventual decision.
8714 $has_only_code_point_maps{$addr} &=
8715 $map =~ / ^ $code_point_re $/x;
8717 # For the purposes of disambiguating between binary and other
8718 # enumerations at the end, we keep track of the first three
8719 # distinct property values. Once we get to three, we know
8720 # it's not going to be binary, so no need to track more.
8721 if (scalar keys %{$unique_maps{$addr}} < 3) {
8722 $unique_maps{$addr}{main::standardize($map)} = 1;
8727 # Add the mapping by calling our map table's method
8728 return $map{$addr}->add_map($start, $end, $map, @_);
8732 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
8733 # should be called after the property is mostly filled with its maps.
8734 # We have been keeping track of what the property values have been,
8735 # and now have the necessary information to figure out the type.
8738 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8740 my $addr = do { no overloading; pack 'J', $self; };
8742 my $type = $type{$addr};
8744 # If already have figured these out, no need to do so again, but we do
8745 # a double check on ENUMS to make sure that a string property hasn't
8746 # improperly been classified as an ENUM, so continue on with those.
8747 return if $type == $STRING
8749 || $type == $FORCED_BINARY;
8751 # If every map is to a code point, is a string property.
8752 if ($type == $UNKNOWN
8753 && ($has_only_code_point_maps{$addr}
8754 || (defined $map{$addr}->default_map
8755 && $map{$addr}->default_map eq "")))
8757 $self->set_type($STRING);
8761 # Otherwise, it is to some sort of enumeration. (The case where
8762 # it is a Unicode miscellaneous property, and treated like a
8763 # string in this program is handled in add_map()). Distinguish
8764 # between binary and some other enumeration type. Of course, if
8765 # there are more than two values, it's not binary. But more
8766 # subtle is the test that the default mapping is defined means it
8767 # isn't binary. This in fact may change in the future if Unicode
8768 # changes the way its data is structured. But so far, no binary
8769 # properties ever have @missing lines for them, so the default map
8770 # isn't defined for them. The few properties that are two-valued
8771 # and aren't considered binary have the default map defined
8772 # starting in Unicode 5.0, when the @missing lines appeared; and
8773 # this program has special code to put in a default map for them
8774 # for earlier than 5.0 releases.
8776 || scalar keys %{$unique_maps{$addr}} > 2
8777 || defined $self->default_map)
8779 my $tables = $self->tables;
8780 my $count = $self->count;
8781 if ($verbosity && $tables > 500 && $tables/$count > .1) {
8782 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
8784 $self->set_type($ENUM);
8787 $self->set_type($BINARY);
8790 undef $unique_maps{$addr}; # Garbage collect
8797 my $reason = shift; # Ignored unless suppressing
8798 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8800 my $addr = do { no overloading; pack 'J', $self; };
8801 if ($fate == $SUPPRESSED) {
8802 $why_suppressed{$self->complete_name} = $reason;
8805 # Each table shares the property's fate, except that MAP_PROXIED
8806 # doesn't affect match tables
8807 $map{$addr}->set_fate($fate, $reason);
8808 if ($fate != $MAP_PROXIED) {
8809 foreach my $table ($map{$addr}, $self->tables) {
8810 $table->set_fate($fate, $reason);
8817 # Most of the accessors for a property actually apply to its map table.
8818 # Setup up accessor functions for those, referring to %map
8843 replacement_property
8868 # 'property' above is for symmetry, so that one can take
8869 # the property of a property and get itself, and so don't
8870 # have to distinguish between properties and tables in
8878 return $map{pack 'J', $self}->$sub(@_);
8888 # Converts an ordinal character value to a displayable string, using a
8889 # NBSP to hold combining characters.
8893 return $chr if $ccc->table(0)->contains($ord);
8894 return chr(utf8::unicode_to_native(0xA0)) . $chr;
8898 # Returns lines of the input joined together, so that they can be folded
8900 # This causes continuation lines to be joined together into one long line
8901 # for folding. A continuation line is any line that doesn't begin with a
8902 # space or "\b" (the latter is stripped from the output). This is so
8903 # lines can be be in a HERE document so as to fit nicely in the terminal
8904 # width, but be joined together in one long line, and then folded with
8905 # indents, '#' prefixes, etc, properly handled.
8906 # A blank separates the joined lines except if there is a break; an extra
8907 # blank is inserted after a period ending a line.
8909 # Initialize the return with the first line.
8910 my ($return, @lines) = split "\n", shift;
8912 # If the first line is null, it was an empty line, add the \n back in
8913 $return = "\n" if $return eq "";
8915 # Now join the remainder of the physical lines.
8916 for my $line (@lines) {
8918 # An empty line means wanted a blank line, so add two \n's to get that
8919 # effect, and go to the next line.
8920 if (length $line == 0) {
8925 # Look at the last character of what we have so far.
8926 my $previous_char = substr($return, -1, 1);
8928 # And at the next char to be output.
8929 my $next_char = substr($line, 0, 1);
8931 if ($previous_char ne "\n") {
8933 # Here didn't end wth a nl. If the next char a blank or \b, it
8934 # means that here there is a break anyway. So add a nl to the
8936 if ($next_char eq " " || $next_char eq "\b") {
8937 $previous_char = "\n";
8938 $return .= $previous_char;
8941 # Add an extra space after periods.
8942 $return .= " " if $previous_char eq '.';
8945 # Here $previous_char is still the latest character to be output. If
8946 # it isn't a nl, it means that the next line is to be a continuation
8947 # line, with a blank inserted between them.
8948 $return .= " " if $previous_char ne "\n";
8951 substr($line, 0, 1) = "" if $next_char eq "\b";
8953 # And append this next line.
8960 sub simple_fold($;$$$) {
8961 # Returns a string of the input (string or an array of strings) folded
8962 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8964 # This is tailored for the kind of text written by this program,
8965 # especially the pod file, which can have very long names with
8966 # underscores in the middle, or words like AbcDefgHij.... We allow
8967 # breaking in the middle of such constructs if the line won't fit
8968 # otherwise. The break in such cases will come either just after an
8969 # underscore, or just before one of the Capital letters.
8971 local $to_trace = 0 if main::DEBUG;
8974 my $prefix = shift; # Optional string to prepend to each output
8976 $prefix = "" unless defined $prefix;
8978 my $hanging_indent = shift; # Optional number of spaces to indent
8979 # continuation lines
8980 $hanging_indent = 0 unless $hanging_indent;
8982 my $right_margin = shift; # Optional number of spaces to narrow the
8984 $right_margin = 0 unless defined $right_margin;
8986 # Call carp with the 'nofold' option to avoid it from trying to call us
8988 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8990 # The space available doesn't include what's automatically prepended
8991 # to each line, or what's reserved on the right.
8992 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8993 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8995 if (DEBUG && $hanging_indent >= $max) {
8996 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8997 $hanging_indent = 0;
9000 # First, split into the current physical lines.
9002 if (ref $line) { # Better be an array, because not bothering to
9004 foreach my $line (@{$line}) {
9005 push @line, split /\n/, $line;
9009 @line = split /\n/, $line;
9012 #local $to_trace = 1 if main::DEBUG;
9013 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9015 # Look at each current physical line.
9016 for (my $i = 0; $i < @line; $i++) {
9017 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9018 #local $to_trace = 1 if main::DEBUG;
9019 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9021 # Remove prefix, because will be added back anyway, don't want
9023 $line[$i] =~ s/^$prefix//;
9025 # Remove trailing space
9026 $line[$i] =~ s/\s+\Z//;
9028 # If the line is too long, fold it.
9029 if (length $line[$i] > $max) {
9032 # Here needs to fold. Save the leading space in the line for
9034 $line[$i] =~ /^ ( \s* )/x;
9035 my $leading_space = $1;
9036 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9038 # If character at final permissible position is white space,
9039 # fold there, which will delete that white space
9040 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9041 $remainder = substr($line[$i], $max);
9042 $line[$i] = substr($line[$i], 0, $max - 1);
9046 # Otherwise fold at an acceptable break char closest to
9047 # the max length. Look at just the maximal initial
9048 # segment of the line
9049 my $segment = substr($line[$i], 0, $max - 1);
9051 /^ ( .{$hanging_indent} # Don't look before the
9053 \ * # Don't look in leading
9054 # blanks past the indent
9055 [^ ] .* # Find the right-most
9056 (?: # acceptable break:
9057 [ \s = ] # space or equal
9058 | - (?! [.0-9] ) # or non-unary minus.
9059 ) # $1 includes the character
9062 # Split into the initial part that fits, and remaining
9064 $remainder = substr($line[$i], length $1);
9066 trace $line[$i] if DEBUG && $to_trace;
9067 trace $remainder if DEBUG && $to_trace;
9070 # If didn't find a good breaking spot, see if there is a
9071 # not-so-good breaking spot. These are just after
9072 # underscores or where the case changes from lower to
9073 # upper. Use \a as a soft hyphen, but give up
9074 # and don't break the line if there is actually a \a
9075 # already in the input. We use an ascii character for the
9076 # soft-hyphen to avoid any attempt by miniperl to try to
9077 # access the files that this program is creating.
9078 elsif ($segment !~ /\a/
9079 && ($segment =~ s/_/_\a/g
9080 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9082 # Here were able to find at least one place to insert
9083 # our substitute soft hyphen. Find the right-most one
9084 # and replace it by a real hyphen.
9085 trace $segment if DEBUG && $to_trace;
9087 rindex($segment, "\a"),
9090 # Then remove the soft hyphen substitutes.
9091 $segment =~ s/\a//g;
9092 trace $segment if DEBUG && $to_trace;
9094 # And split into the initial part that fits, and
9095 # remainder of the line
9096 my $pos = rindex($segment, '-');
9097 $remainder = substr($line[$i], $pos);
9098 trace $remainder if DEBUG && $to_trace;
9099 $line[$i] = substr($segment, 0, $pos + 1);
9103 # Here we know if we can fold or not. If we can, $remainder
9104 # is what remains to be processed in the next iteration.
9105 if (defined $remainder) {
9106 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9108 # Insert the folded remainder of the line as a new element
9109 # of the array. (It may still be too long, but we will
9110 # deal with that next time through the loop.) Omit any
9111 # leading space in the remainder.
9112 $remainder =~ s/^\s+//;
9113 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9115 # But then indent by whichever is larger of:
9116 # 1) the leading space on the input line;
9117 # 2) the hanging indent.
9118 # This preserves indentation in the original line.
9119 my $lead = ($leading_space)
9120 ? length $leading_space
9122 $lead = max($lead, $hanging_indent);
9123 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9127 # Ready to output the line. Get rid of any trailing space
9128 # And prefix by the required $prefix passed in.
9129 $line[$i] =~ s/\s+$//;
9130 $line[$i] = "$prefix$line[$i]\n";
9131 } # End of looping through all the lines.
9133 return join "", @line;
9136 sub property_ref { # Returns a reference to a property object.
9137 return Property::property_ref(@_);
9140 sub force_unlink ($) {
9141 my $filename = shift;
9142 return unless file_exists($filename);
9143 return if CORE::unlink($filename);
9145 # We might need write permission
9146 chmod 0777, $filename;
9147 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
9152 # Given a filename and references to arrays of lines, write the lines of
9153 # each array to the file
9154 # Filename can be given as an arrayref of directory names
9156 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9159 my $use_utf8 = shift;
9161 # Get into a single string if an array, and get rid of, in Unix terms, any
9163 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9164 $file = File::Spec->canonpath($file);
9166 # If has directories, make sure that they all exist
9167 (undef, my $directories, undef) = File::Spec->splitpath($file);
9168 File::Path::mkpath($directories) if $directories && ! -d $directories;
9170 push @files_actually_output, $file;
9172 force_unlink ($file);
9175 if (not open $OUT, ">", $file) {
9176 Carp::my_carp("can't open $file for output. Skipping this file: $!");
9180 binmode $OUT, ":utf8" if $use_utf8;
9182 while (defined (my $lines_ref = shift)) {
9183 unless (@$lines_ref) {
9184 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9187 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9189 close $OUT or die Carp::my_carp("close '$file' failed: $!");
9191 print "$file written.\n" if $verbosity >= $VERBOSE;
9197 sub Standardize($) {
9198 # This converts the input name string into a standardized equivalent to
9202 unless (defined $name) {
9203 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
9207 # Remove any leading or trailing white space
9211 # Convert interior white space and hyphens into underscores.
9212 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9214 # Capitalize the letter following an underscore, and convert a sequence of
9215 # multiple underscores to a single one
9216 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9218 # And capitalize the first letter, but not for the special cjk ones.
9219 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9223 sub standardize ($) {
9224 # Returns a lower-cased standardized name, without underscores. This form
9225 # is chosen so that it can distinguish between any real versus superficial
9226 # Unicode name differences. It relies on the fact that Unicode doesn't
9227 # have interior underscores, white space, nor dashes in any
9228 # stricter-matched name. It should not be used on Unicode code point
9229 # names (the Name property), as they mostly, but not always follow these
9232 my $name = Standardize(shift);
9233 return if !defined $name;
9235 $name =~ s/ (?<= .) _ (?= . ) //xg;
9239 sub utf8_heavy_name ($$) {
9240 # Returns the name that utf8_heavy.pl will use to find a table. XXX
9241 # perhaps this function should be placed somewhere, like Heavy.pl so that
9242 # utf8_heavy can use it directly without duplicating code that can get
9247 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9249 my $property = $table->property;
9250 $property = ($property == $perl)
9251 ? "" # 'perl' is never explicitly stated
9252 : standardize($property->name) . '=';
9253 if ($alias->loose_match) {
9254 return $property . standardize($alias->name);
9257 return lc ($property . $alias->name);
9265 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
9268 $main::simple_dumper_nesting = 0;
9271 # Like Simple Data::Dumper. Good enough for our needs. We can't use
9272 # the real thing as we have to run under miniperl.
9274 # It is designed so that on input it is at the beginning of a line,
9275 # and the final thing output in any call is a trailing ",\n".
9279 $indent = "" if ! $debugging_build || ! defined $indent;
9281 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9283 # nesting level is localized, so that as the call stack pops, it goes
9284 # back to the prior value.
9285 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9286 undef %already_output if $main::simple_dumper_nesting == 0;
9287 $main::simple_dumper_nesting++;
9288 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9290 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9292 # Determine the indent for recursive calls.
9293 my $next_indent = $indent . $indent_increment;
9298 # Dump of scalar: just output it in quotes if not a number. To do
9299 # so we must escape certain characters, and therefore need to
9300 # operate on a copy to avoid changing the original
9302 $copy = $UNDEF unless defined $copy;
9304 # Quote non-integers (integers also have optional leading '-')
9305 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9307 # Escape apostrophe and backslash
9308 $copy =~ s/ ( ['\\] ) /\\$1/xg;
9311 $output = "$indent$copy,\n";
9315 # Keep track of cycles in the input, and refuse to infinitely loop
9316 my $addr = do { no overloading; pack 'J', $item; };
9317 if (defined $already_output{$addr}) {
9318 return "${indent}ALREADY OUTPUT: $item\n";
9320 $already_output{$addr} = $item;
9322 if (ref $item eq 'ARRAY') {
9325 if ($main::simple_dumper_nesting > 1) {
9327 $using_brackets = 1;
9330 $using_brackets = 0;
9333 # If the array is empty, put the closing bracket on the same
9334 # line. Otherwise, recursively add each array element
9340 for (my $i = 0; $i < @$item; $i++) {
9342 # Indent array elements one level
9343 $output .= &simple_dumper($item->[$i], $next_indent);
9344 next if ! $debugging_build;
9345 $output =~ s/\n$//; # Remove any trailing nl so
9346 $output .= " # [$i]\n"; # as to add a comment giving
9349 $output .= $indent; # Indent closing ']' to orig level
9351 $output .= ']' if $using_brackets;
9354 elsif (ref $item eq 'HASH') {
9359 # No surrounding braces at top level
9361 if ($main::simple_dumper_nesting > 1) {
9364 $body_indent = $next_indent;
9365 $next_indent .= $indent_increment;
9370 $body_indent = $indent;
9374 # Output hashes sorted alphabetically instead of apparently
9375 # random. Use caseless alphabetic sort
9376 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9378 if ($is_first_line) {
9382 $output .= "$body_indent";
9385 # The key must be a scalar, but this recursive call quotes
9387 $output .= &simple_dumper($key);
9389 # And change the trailing comma and nl to the hash fat
9390 # comma for clarity, and so the value can be on the same
9392 $output =~ s/,\n$/ => /;
9394 # Recursively call to get the value's dump.
9395 my $next = &simple_dumper($item->{$key}, $next_indent);
9397 # If the value is all on one line, remove its indent, so
9398 # will follow the => immediately. If it takes more than
9399 # one line, start it on a new line.
9400 if ($next !~ /\n.*\n/) {
9409 $output .= "$indent},\n" if $using_braces;
9411 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9412 $output = $indent . ref($item) . "\n";
9413 # XXX see if blessed
9415 elsif ($item->can('dump')) {
9417 # By convention in this program, objects furnish a 'dump'
9418 # method. Since not doing any output at this level, just pass
9419 # on the input indent
9420 $output = $item->dump($indent);
9423 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
9430 sub dump_inside_out {
9431 # Dump inside-out hashes in an object's state by converting them to a
9432 # regular hash and then calling simple_dumper on that.
9435 my $fields_ref = shift;
9436 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9438 my $addr = do { no overloading; pack 'J', $object; };
9441 foreach my $key (keys %$fields_ref) {
9442 $hash{$key} = $fields_ref->{$key}{$addr};
9445 return simple_dumper(\%hash, @_);
9449 # Overloaded '.' method that is common to all packages. It uses the
9450 # package's stringify method.
9454 my $reversed = shift;
9455 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9457 $other = "" unless defined $other;
9459 foreach my $which (\$self, \$other) {
9460 next unless ref $$which;
9461 if ($$which->can('_operator_stringify')) {
9462 $$which = $$which->_operator_stringify;
9465 my $ref = ref $$which;
9466 my $addr = do { no overloading; pack 'J', $$which; };
9467 $$which = "$ref ($addr)";
9475 sub _operator_dot_equal {
9476 # Overloaded '.=' method that is common to all packages.
9480 my $reversed = shift;
9481 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9483 $other = "" unless defined $other;
9486 return $other .= "$self";
9489 return "$self" . "$other";
9493 sub _operator_equal {
9494 # Generic overloaded '==' routine. To be equal, they must be the exact
9500 return 0 unless defined $other;
9501 return 0 unless ref $other;
9503 return $self == $other;
9506 sub _operator_not_equal {
9510 return ! _operator_equal($self, $other);
9513 sub process_PropertyAliases($) {
9514 # This reads in the PropertyAliases.txt file, which contains almost all
9515 # the character properties in Unicode and their equivalent aliases:
9516 # scf ; Simple_Case_Folding ; sfc
9518 # Field 0 is the preferred short name for the property.
9519 # Field 1 is the full name.
9520 # Any succeeding ones are other accepted names.
9523 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9525 # This whole file was non-existent in early releases, so use our own
9527 $file->insert_lines(get_old_property_aliases())
9528 if ! -e 'PropertyAliases.txt';
9530 # Add any cjk properties that may have been defined.
9531 $file->insert_lines(@cjk_properties);
9533 while ($file->next_line) {
9535 my @data = split /\s*;\s*/;
9537 my $full = $data[1];
9539 my $this = Property->new($data[0], Full_Name => $full);
9541 # Start looking for more aliases after these two.
9542 for my $i (2 .. @data - 1) {
9543 $this->add_alias($data[$i]);
9548 my $scf = property_ref("Simple_Case_Folding");
9549 $scf->add_alias("scf");
9550 $scf->add_alias("sfc");
9555 sub finish_property_setup {
9556 # Finishes setting up after PropertyAliases.
9559 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9561 # This entry was missing from this file in earlier Unicode versions
9562 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9563 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9566 # These two properties must be defined in all releases so we can generate
9567 # the tables from them to make regex \X work, but suppress their output so
9568 # aren't application visible prior to releases where they should be
9569 if (! defined property_ref('GCB')) {
9570 Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9571 Fate => $PLACEHOLDER);
9573 if (! defined property_ref('hst')) {
9574 Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9575 Fate => $PLACEHOLDER);
9578 # These are used so much, that we set globals for them.
9579 $gc = property_ref('General_Category');
9580 $block = property_ref('Block');
9581 $script = property_ref('Script');
9583 # Perl adds this alias.
9584 $gc->add_alias('Category');
9586 # Unicode::Normalize expects this file with this name and directory.
9587 $ccc = property_ref('Canonical_Combining_Class');
9589 $ccc->set_file('CombiningClass');
9590 $ccc->set_directory(File::Spec->curdir());
9593 # These two properties aren't actually used in the core, but unfortunately
9594 # the names just above that are in the core interfere with these, so
9595 # choose different names. These aren't a problem unless the map tables
9596 # for these files get written out.
9597 my $lowercase = property_ref('Lowercase');
9598 $lowercase->set_file('IsLower') if defined $lowercase;
9599 my $uppercase = property_ref('Uppercase');
9600 $uppercase->set_file('IsUpper') if defined $uppercase;
9602 # Set up the hard-coded default mappings, but only on properties defined
9604 foreach my $property (keys %default_mapping) {
9605 my $property_object = property_ref($property);
9606 next if ! defined $property_object;
9607 my $default_map = $default_mapping{$property};
9608 $property_object->set_default_map($default_map);
9610 # A map of <code point> implies the property is string.
9611 if ($property_object->type == $UNKNOWN
9612 && $default_map eq $CODE_POINT)
9614 $property_object->set_type($STRING);
9618 # The following use the Multi_Default class to create objects for
9621 # Bidi class has a complicated default, but the derived file takes care of
9622 # the complications, leaving just 'L'.
9623 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9624 property_ref('Bidi_Class')->set_default_map('L');
9629 # The derived file was introduced in 3.1.1. The values below are
9630 # taken from table 3-8, TUS 3.0
9632 'my $default = Range_List->new;
9633 $default->add_range(0x0590, 0x05FF);
9634 $default->add_range(0xFB1D, 0xFB4F);'
9637 # The defaults apply only to unassigned characters
9638 $default_R .= '$gc->table("Unassigned") & $default;';
9640 if ($v_version lt v3.0.0) {
9641 $default = Multi_Default->new(R => $default_R, 'L');
9645 # AL apparently not introduced until 3.0: TUS 2.x references are
9646 # not on-line to check it out
9648 'my $default = Range_List->new;
9649 $default->add_range(0x0600, 0x07BF);
9650 $default->add_range(0xFB50, 0xFDFF);
9651 $default->add_range(0xFE70, 0xFEFF);'
9654 # Non-character code points introduced in this release; aren't AL
9655 if ($v_version ge 3.1.0) {
9656 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9658 $default_AL .= '$gc->table("Unassigned") & $default';
9659 $default = Multi_Default->new(AL => $default_AL,
9663 property_ref('Bidi_Class')->set_default_map($default);
9666 # Joining type has a complicated default, but the derived file takes care
9667 # of the complications, leaving just 'U' (or Non_Joining), except the file
9669 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9670 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9671 property_ref('Joining_Type')->set_default_map('Non_Joining');
9675 # Otherwise, there are not one, but two possibilities for the
9676 # missing defaults: T and U.
9677 # The missing defaults that evaluate to T are given by:
9678 # T = Mn + Cf - ZWNJ - ZWJ
9679 # where Mn and Cf are the general category values. In other words,
9680 # any non-spacing mark or any format control character, except
9681 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9682 # WIDTH JOINER (joining type C).
9683 my $default = Multi_Default->new(
9684 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9686 property_ref('Joining_Type')->set_default_map($default);
9690 # Line break has a complicated default in early releases. It is 'Unknown'
9691 # for non-assigned code points; 'AL' for assigned.
9692 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9693 my $lb = property_ref('Line_Break');
9694 if ($v_version gt 3.2.0) {
9695 $lb->set_default_map('Unknown');
9698 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9700 $lb->set_default_map($default);
9703 # If has the URS property, make sure that the standard aliases are in
9704 # it, since not in the input tables in some versions.
9705 my $urs = property_ref('Unicode_Radical_Stroke');
9707 $urs->add_alias('cjkRSUnicode');
9708 $urs->add_alias('kRSUnicode');
9712 # For backwards compatibility with applications that may read the mapping
9713 # file directly (it was documented in 5.12 and 5.14 as being thusly
9714 # usable), keep it from being adjusted. (range_size_1 is
9715 # used to force the traditional format.)
9716 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9717 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9718 $nfkc_cf->set_range_size_1(1);
9720 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9721 $bmg->set_to_output_map($EXTERNAL_MAP);
9722 $bmg->set_range_size_1(1);
9725 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9730 sub get_old_property_aliases() {
9731 # Returns what would be in PropertyAliases.txt if it existed in very old
9732 # versions of Unicode. It was derived from the one in 3.2, and pared
9733 # down based on the data that was actually in the older releases.
9734 # An attempt was made to use the existence of files to mean inclusion or
9735 # not of various aliases, but if this was not sufficient, using version
9736 # numbers was resorted to.
9740 # These are to be used in all versions (though some are constructed by
9741 # this program if missing)
9742 push @return, split /\n/, <<'END';
9744 Bidi_M ; Bidi_Mirrored
9746 ccc ; Canonical_Combining_Class
9747 dm ; Decomposition_Mapping
9748 dt ; Decomposition_Type
9749 gc ; General_Category
9751 lc ; Lowercase_Mapping
9753 na1 ; Unicode_1_Name
9756 scf ; Simple_Case_Folding
9757 slc ; Simple_Lowercase_Mapping
9758 stc ; Simple_Titlecase_Mapping
9759 suc ; Simple_Uppercase_Mapping
9760 tc ; Titlecase_Mapping
9761 uc ; Uppercase_Mapping
9764 if (-e 'Blocks.txt') {
9765 push @return, "blk ; Block\n";
9767 if (-e 'ArabicShaping.txt') {
9768 push @return, split /\n/, <<'END';
9773 if (-e 'PropList.txt') {
9775 # This first set is in the original old-style proplist.
9776 push @return, split /\n/, <<'END';
9777 Bidi_C ; Bidi_Control
9785 Join_C ; Join_Control
9787 QMark ; Quotation_Mark
9788 Term ; Terminal_Punctuation
9789 WSpace ; White_Space
9791 # The next sets were added later
9792 if ($v_version ge v3.0.0) {
9793 push @return, split /\n/, <<'END';
9798 if ($v_version ge v3.0.1) {
9799 push @return, split /\n/, <<'END';
9800 NChar ; Noncharacter_Code_Point
9803 # The next sets were added in the new-style
9804 if ($v_version ge v3.1.0) {
9805 push @return, split /\n/, <<'END';
9806 OAlpha ; Other_Alphabetic
9807 OLower ; Other_Lowercase
9809 OUpper ; Other_Uppercase
9812 if ($v_version ge v3.1.1) {
9813 push @return, "AHex ; ASCII_Hex_Digit\n";
9816 if (-e 'EastAsianWidth.txt') {
9817 push @return, "ea ; East_Asian_Width\n";
9819 if (-e 'CompositionExclusions.txt') {
9820 push @return, "CE ; Composition_Exclusion\n";
9822 if (-e 'LineBreak.txt') {
9823 push @return, "lb ; Line_Break\n";
9825 if (-e 'BidiMirroring.txt') {
9826 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
9828 if (-e 'Scripts.txt') {
9829 push @return, "sc ; Script\n";
9831 if (-e 'DNormalizationProps.txt') {
9832 push @return, split /\n/, <<'END';
9833 Comp_Ex ; Full_Composition_Exclusion
9834 FC_NFKC ; FC_NFKC_Closure
9835 NFC_QC ; NFC_Quick_Check
9836 NFD_QC ; NFD_Quick_Check
9837 NFKC_QC ; NFKC_Quick_Check
9838 NFKD_QC ; NFKD_Quick_Check
9839 XO_NFC ; Expands_On_NFC
9840 XO_NFD ; Expands_On_NFD
9841 XO_NFKC ; Expands_On_NFKC
9842 XO_NFKD ; Expands_On_NFKD
9845 if (-e 'DCoreProperties.txt') {
9846 push @return, split /\n/, <<'END';
9852 # These can also appear in some versions of PropList.txt
9853 push @return, "Lower ; Lowercase\n"
9854 unless grep { $_ =~ /^Lower\b/} @return;
9855 push @return, "Upper ; Uppercase\n"
9856 unless grep { $_ =~ /^Upper\b/} @return;
9859 # This flag requires the DAge.txt file to be copied into the directory.
9860 if (DEBUG && $compare_versions) {
9861 push @return, 'age ; Age';
9867 sub process_PropValueAliases {
9868 # This file contains values that properties look like:
9869 # bc ; AL ; Arabic_Letter
9870 # blk; n/a ; Greek_And_Coptic ; Greek
9872 # Field 0 is the property.
9873 # Field 1 is the short name of a property value or 'n/a' if no
9874 # short name exists;
9875 # Field 2 is the full property value name;
9876 # Any other fields are more synonyms for the property value.
9877 # Purely numeric property values are omitted from the file; as are some
9878 # others, fewer and fewer in later releases
9880 # Entries for the ccc property have an extra field before the
9882 # ccc; 0; NR ; Not_Reordered
9883 # It is the numeric value that the names are synonyms for.
9885 # There are comment entries for values missing from this file:
9886 # # @missing: 0000..10FFFF; ISO_Comment; <none>
9887 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9890 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9892 # This whole file was non-existent in early releases, so use our own
9893 # internal one if necessary.
9894 if (! -e 'PropValueAliases.txt') {
9895 $file->insert_lines(get_old_property_value_aliases());
9898 if ($v_version lt 4.0.0) {
9899 $file->insert_lines(split /\n/, <<'END'
9900 hst; L ; Leading_Jamo
9901 hst; LV ; LV_Syllable
9902 hst; LVT ; LVT_Syllable
9903 hst; NA ; Not_Applicable
9904 hst; T ; Trailing_Jamo
9909 if ($v_version lt 4.1.0) {
9910 $file->insert_lines(split /\n/, <<'END'
9926 # Add any explicit cjk values
9927 $file->insert_lines(@cjk_property_values);
9929 # This line is used only for testing the code that checks for name
9930 # conflicts. There is a script Inherited, and when this line is executed
9931 # it causes there to be a name conflict with the 'Inherited' that this
9932 # program generates for this block property value
9933 #$file->insert_lines('blk; n/a; Herited');
9936 # Process each line of the file ...
9937 while ($file->next_line) {
9939 # Fix typo in input file
9940 s/CCC133/CCC132/g if $v_version eq v6.1.0;
9942 my ($property, @data) = split /\s*;\s*/;
9944 # The ccc property has an extra field at the beginning, which is the
9945 # numeric value. Move it to be after the other two, mnemonic, fields,
9946 # so that those will be used as the property value's names, and the
9947 # number will be an extra alias. (Rightmost splice removes field 1-2,
9948 # returning them in a slice; left splice inserts that before anything,
9949 # thus shifting the former field 0 to after them.)
9950 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9952 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
9953 # there is no short name, use the full one in element 1
9954 if ($data[0] eq "n/a") {
9955 $data[0] = $data[1];
9957 elsif ($data[0] ne $data[1]
9958 && standardize($data[0]) eq standardize($data[1])
9959 && $data[1] !~ /[[:upper:]]/)
9961 # Also, there is a bug in the file in which "n/a" is omitted, and
9962 # the two fields are identical except for case, and the full name
9963 # is all lower case. Copy the "short" name unto the full one to
9964 # give it some upper case.
9966 $data[1] = $data[0];
9969 # Earlier releases had the pseudo property 'qc' that should expand to
9970 # the ones that replace it below.
9971 if ($property eq 'qc') {
9972 if (lc $data[0] eq 'y') {
9973 $file->insert_lines('NFC_QC; Y ; Yes',
9979 elsif (lc $data[0] eq 'n') {
9980 $file->insert_lines('NFC_QC; N ; No',
9986 elsif (lc $data[0] eq 'm') {
9987 $file->insert_lines('NFC_QC; M ; Maybe',
9988 'NFKC_QC; M ; Maybe',
9992 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9997 # The first field is the short name, 2nd is the full one.
9998 my $property_object = property_ref($property);
9999 my $table = $property_object->add_match_table($data[0],
10000 Full_Name => $data[1]);
10002 # Start looking for more aliases after these two.
10003 for my $i (2 .. @data - 1) {
10004 $table->add_alias($data[$i]);
10006 } # End of looping through the file
10008 # As noted in the comments early in the program, it generates tables for
10009 # the default values for all releases, even those for which the concept
10010 # didn't exist at the time. Here we add those if missing.
10011 my $age = property_ref('age');
10012 if (defined $age && ! defined $age->table('Unassigned')) {
10013 $age->add_match_table('Unassigned');
10015 $block->add_match_table('No_Block') if -e 'Blocks.txt'
10016 && ! defined $block->table('No_Block');
10019 # Now set the default mappings of the properties from the file. This is
10020 # done after the loop because a number of properties have only @missings
10021 # entries in the file, and may not show up until the end.
10022 my @defaults = $file->get_missings;
10023 foreach my $default_ref (@defaults) {
10024 my $default = $default_ref->[0];
10025 my $property = property_ref($default_ref->[1]);
10026 $property->set_default_map($default);
10031 sub get_old_property_value_aliases () {
10032 # Returns what would be in PropValueAliases.txt if it existed in very old
10033 # versions of Unicode. It was derived from the one in 3.2, and pared
10034 # down. An attempt was made to use the existence of files to mean
10035 # inclusion or not of various aliases, but if this was not sufficient,
10036 # using version numbers was resorted to.
10038 my @return = split /\n/, <<'END';
10039 bc ; AN ; Arabic_Number
10040 bc ; B ; Paragraph_Separator
10041 bc ; CS ; Common_Separator
10042 bc ; EN ; European_Number
10043 bc ; ES ; European_Separator
10044 bc ; ET ; European_Terminator
10045 bc ; L ; Left_To_Right
10046 bc ; ON ; Other_Neutral
10047 bc ; R ; Right_To_Left
10048 bc ; WS ; White_Space
10050 Bidi_M; N; No; F; False
10051 Bidi_M; Y; Yes; T; True
10053 # The standard combining classes are very much different in v1, so only use
10054 # ones that look right (not checked thoroughly)
10055 ccc; 0; NR ; Not_Reordered
10056 ccc; 1; OV ; Overlay
10058 ccc; 8; KV ; Kana_Voicing
10059 ccc; 9; VR ; Virama
10060 ccc; 202; ATBL ; Attached_Below_Left
10061 ccc; 216; ATAR ; Attached_Above_Right
10062 ccc; 218; BL ; Below_Left
10063 ccc; 220; B ; Below
10064 ccc; 222; BR ; Below_Right
10066 ccc; 228; AL ; Above_Left
10067 ccc; 230; A ; Above
10068 ccc; 232; AR ; Above_Right
10069 ccc; 234; DA ; Double_Above
10071 dt ; can ; canonical
10075 dt ; fra ; fraction
10076 dt ; init ; initial
10077 dt ; iso ; isolated
10085 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
10087 gc ; Cn ; Unassigned
10088 gc ; Co ; Private_Use
10089 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
10090 gc ; LC ; Cased_Letter # Ll | Lt | Lu
10091 gc ; Ll ; Lowercase_Letter
10092 gc ; Lm ; Modifier_Letter
10093 gc ; Lo ; Other_Letter
10094 gc ; Lu ; Uppercase_Letter
10095 gc ; M ; Mark # Mc | Me | Mn
10096 gc ; Mc ; Spacing_Mark
10097 gc ; Mn ; Nonspacing_Mark
10098 gc ; N ; Number # Nd | Nl | No
10099 gc ; Nd ; Decimal_Number
10100 gc ; No ; Other_Number
10101 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
10102 gc ; Pd ; Dash_Punctuation
10103 gc ; Pe ; Close_Punctuation
10104 gc ; Po ; Other_Punctuation
10105 gc ; Ps ; Open_Punctuation
10106 gc ; S ; Symbol # Sc | Sk | Sm | So
10107 gc ; Sc ; Currency_Symbol
10108 gc ; Sm ; Math_Symbol
10109 gc ; So ; Other_Symbol
10110 gc ; Z ; Separator # Zl | Zp | Zs
10111 gc ; Zl ; Line_Separator
10112 gc ; Zp ; Paragraph_Separator
10113 gc ; Zs ; Space_Separator
10121 if (-e 'ArabicShaping.txt') {
10122 push @return, split /\n/, <<'END';
10129 jg ; n/a ; NO_JOINING_GROUP
10137 jt ; C ; Join_Causing
10138 jt ; D ; Dual_Joining
10139 jt ; L ; Left_Joining
10140 jt ; R ; Right_Joining
10141 jt ; U ; Non_Joining
10142 jt ; T ; Transparent
10144 if ($v_version ge v3.0.0) {
10145 push @return, split /\n/, <<'END';
10149 jg ; n/a ; DALATH_RISH
10152 jg ; n/a ; FINAL_SEMKATH
10155 jg ; n/a ; HAMZA_ON_HEH_GOAL
10158 jg ; n/a ; HEH_GOAL
10162 jg ; n/a ; KNOTTED_HEH
10169 jg ; n/a ; REVERSED_PE
10173 jg ; n/a ; SWASH_KAF
10175 jg ; n/a ; TEH_MARBUTA
10178 jg ; n/a ; YEH_BARREE
10179 jg ; n/a ; YEH_WITH_TAIL
10188 if (-e 'EastAsianWidth.txt') {
10189 push @return, split /\n/, <<'END';
10199 if (-e 'LineBreak.txt') {
10200 push @return, split /\n/, <<'END';
10201 lb ; AI ; Ambiguous
10202 lb ; AL ; Alphabetic
10203 lb ; B2 ; Break_Both
10204 lb ; BA ; Break_After
10205 lb ; BB ; Break_Before
10206 lb ; BK ; Mandatory_Break
10207 lb ; CB ; Contingent_Break
10208 lb ; CL ; Close_Punctuation
10209 lb ; CM ; Combining_Mark
10210 lb ; CR ; Carriage_Return
10211 lb ; EX ; Exclamation
10214 lb ; ID ; Ideographic
10215 lb ; IN ; Inseperable
10216 lb ; IS ; Infix_Numeric
10217 lb ; LF ; Line_Feed
10218 lb ; NS ; Nonstarter
10220 lb ; OP ; Open_Punctuation
10221 lb ; PO ; Postfix_Numeric
10222 lb ; PR ; Prefix_Numeric
10223 lb ; QU ; Quotation
10224 lb ; SA ; Complex_Context
10225 lb ; SG ; Surrogate
10227 lb ; SY ; Break_Symbols
10233 if (-e 'DNormalizationProps.txt') {
10234 push @return, split /\n/, <<'END';
10241 if (-e 'Scripts.txt') {
10242 push @return, split /\n/, <<'END';
10244 sc ; Armn ; Armenian
10245 sc ; Beng ; Bengali
10246 sc ; Bopo ; Bopomofo
10247 sc ; Cans ; Canadian_Aboriginal
10248 sc ; Cher ; Cherokee
10249 sc ; Cyrl ; Cyrillic
10250 sc ; Deva ; Devanagari
10251 sc ; Dsrt ; Deseret
10252 sc ; Ethi ; Ethiopic
10253 sc ; Geor ; Georgian
10256 sc ; Gujr ; Gujarati
10257 sc ; Guru ; Gurmukhi
10261 sc ; Hira ; Hiragana
10262 sc ; Ital ; Old_Italic
10263 sc ; Kana ; Katakana
10265 sc ; Knda ; Kannada
10268 sc ; Mlym ; Malayalam
10269 sc ; Mong ; Mongolian
10270 sc ; Mymr ; Myanmar
10273 sc ; Qaai ; Inherited
10275 sc ; Sinh ; Sinhala
10281 sc ; Tibt ; Tibetan
10287 if ($v_version ge v2.0.0) {
10288 push @return, split /\n/, <<'END';
10292 dt ; vert ; vertical
10296 gc ; Cs ; Surrogate
10297 gc ; Lt ; Titlecase_Letter
10298 gc ; Me ; Enclosing_Mark
10299 gc ; Nl ; Letter_Number
10300 gc ; Pc ; Connector_Punctuation
10301 gc ; Sk ; Modifier_Symbol
10304 if ($v_version ge v2.1.2) {
10305 push @return, "bc ; S ; Segment_Separator\n";
10307 if ($v_version ge v2.1.5) {
10308 push @return, split /\n/, <<'END';
10309 gc ; Pf ; Final_Punctuation
10310 gc ; Pi ; Initial_Punctuation
10313 if ($v_version ge v2.1.8) {
10314 push @return, "ccc; 240; IS ; Iota_Subscript\n";
10317 if ($v_version ge v3.0.0) {
10318 push @return, split /\n/, <<'END';
10319 bc ; AL ; Arabic_Letter
10320 bc ; BN ; Boundary_Neutral
10321 bc ; LRE ; Left_To_Right_Embedding
10322 bc ; LRO ; Left_To_Right_Override
10323 bc ; NSM ; Nonspacing_Mark
10324 bc ; PDF ; Pop_Directional_Format
10325 bc ; RLE ; Right_To_Left_Embedding
10326 bc ; RLO ; Right_To_Left_Override
10328 ccc; 233; DB ; Double_Below
10332 if ($v_version ge v3.1.0) {
10333 push @return, "ccc; 226; R ; Right\n";
10339 sub process_NormalizationsTest {
10341 # Each line looks like:
10342 # source code point; NFC; NFD; NFKC; NFKD
10344 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
10347 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10349 # Process each line of the file ...
10350 while ($file->next_line) {
10354 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10356 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10357 $$var = pack "U0U*", map { hex } split " ", $$var;
10358 $$var =~ s/(\\)/$1$1/g;
10361 push @normalization_tests,
10362 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
10363 } # End of looping through the file
10366 sub output_perl_charnames_line ($$) {
10368 # Output the entries in Perl_charnames specially, using 5 digits instead
10369 # of four. This makes the entries a constant length, and simplifies
10370 # charnames.pm which this table is for. Unicode can have 6 digit
10371 # ordinals, but they are all private use or noncharacters which do not
10372 # have names, so won't be in this table.
10374 return sprintf "%05X\t%s\n", $_[0], $_[1];
10378 # This is used to store the range list of all the code points usable when
10379 # the little used $compare_versions feature is enabled.
10380 my $compare_versions_range_list;
10382 # These are constants to the $property_info hash in this subroutine, to
10383 # avoid using a quoted-string which might have a typo.
10385 my $DEFAULT_MAP = 'default_map';
10386 my $DEFAULT_TABLE = 'default_table';
10387 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10388 my $MISSINGS = 'missings';
10390 sub process_generic_property_file {
10391 # This processes a file containing property mappings and puts them
10392 # into internal map tables. It should be used to handle any property
10393 # files that have mappings from a code point or range thereof to
10394 # something else. This means almost all the UCD .txt files.
10395 # each_line_handlers() should be set to adjust the lines of these
10396 # files, if necessary, to what this routine understands:
10399 # 003C..003E ; Math
10401 # the fields are: "codepoint-range ; property; map"
10403 # meaning the codepoints in the range all have the value 'map' under
10405 # Beginning and trailing white space in each field are not significant.
10406 # Note there is not a trailing semi-colon in the above. A trailing
10407 # semi-colon means the map is a null-string. An omitted map, as
10408 # opposed to a null-string, is assumed to be 'Y', based on Unicode
10409 # table syntax. (This could have been hidden from this routine by
10410 # doing it in the $file object, but that would require parsing of the
10411 # line there, so would have to parse it twice, or change the interface
10412 # to pass this an array. So not done.)
10414 # The map field may begin with a sequence of commands that apply to
10415 # this range. Each such command begins and ends with $CMD_DELIM.
10416 # These are used to indicate, for example, that the mapping for a
10417 # range has a non-default type.
10419 # This loops through the file, calling its next_line() method, and
10420 # then taking the map and adding it to the property's table.
10421 # Complications arise because any number of properties can be in the
10422 # file, in any order, interspersed in any way. The first time a
10423 # property is seen, it gets information about that property and
10424 # caches it for quick retrieval later. It also normalizes the maps
10425 # so that only one of many synonyms is stored. The Unicode input
10426 # files do use some multiple synonyms.
10429 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10431 my %property_info; # To keep track of what properties
10432 # have already had entries in the
10433 # current file, and info about each,
10434 # so don't have to recompute.
10435 my $property_name; # property currently being worked on
10436 my $property_type; # and its type
10437 my $previous_property_name = ""; # name from last time through loop
10438 my $property_object; # pointer to the current property's
10440 my $property_addr; # the address of that object
10441 my $default_map; # the string that code points missing
10442 # from the file map to
10443 my $default_table; # For non-string properties, a
10444 # reference to the match table that
10445 # will contain the list of code
10446 # points that map to $default_map.
10448 # Get the next real non-comment line
10450 while ($file->next_line) {
10452 # Default replacement type; means that if parts of the range have
10453 # already been stored in our tables, the new map overrides them if
10454 # they differ more than cosmetically
10455 my $replace = $IF_NOT_EQUIVALENT;
10456 my $map_type; # Default type for the map of this range
10458 #local $to_trace = 1 if main::DEBUG;
10459 trace $_ if main::DEBUG && $to_trace;
10461 # Split the line into components
10462 my ($range, $property_name, $map, @remainder)
10463 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10465 # If more or less on the line than we are expecting, warn and skip
10468 $file->carp_bad_line('Extra fields');
10471 elsif ( ! defined $property_name) {
10472 $file->carp_bad_line('Missing property');
10476 # Examine the range.
10477 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10479 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10483 my $high = (defined $2) ? hex $2 : $low;
10485 # For the very specialized case of comparing two Unicode
10487 if (DEBUG && $compare_versions) {
10488 if ($property_name eq 'Age') {
10490 # Only allow code points at least as old as the version
10492 my $age = pack "C*", split(/\./, $map); # v string
10493 next LINE if $age gt $compare_versions;
10497 # Again, we throw out code points younger than those of
10498 # the specified version. By now, the Age property is
10499 # populated. We use the intersection of each input range
10500 # with this property to find what code points in it are
10501 # valid. To do the intersection, we have to convert the
10502 # Age property map to a Range_list. We only have to do
10504 if (! defined $compare_versions_range_list) {
10505 my $age = property_ref('Age');
10506 if (! -e 'DAge.txt') {
10507 croak "Need to have 'DAge.txt' file to do version comparison";
10509 elsif ($age->count == 0) {
10510 croak "The 'Age' table is empty, but its file exists";
10512 $compare_versions_range_list
10513 = Range_List->new(Initialize => $age);
10516 # An undefined map is always 'Y'
10517 $map = 'Y' if ! defined $map;
10519 # Calculate the intersection of the input range with the
10520 # code points that are known in the specified version
10521 my @ranges = ($compare_versions_range_list
10522 & Range->new($low, $high))->ranges;
10524 # If the intersection is empty, throw away this range
10525 next LINE unless @ranges;
10527 # Only examine the first range this time through the loop.
10528 my $this_range = shift @ranges;
10530 # Put any remaining ranges in the queue to be processed
10531 # later. Note that there is unnecessary work here, as we
10532 # will do the intersection again for each of these ranges
10533 # during some future iteration of the LINE loop, but this
10534 # code is not used in production. The later intersections
10535 # are guaranteed to not splinter, so this will not become
10536 # an infinite loop.
10537 my $line = join ';', $property_name, $map;
10538 foreach my $range (@ranges) {
10539 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
10545 # And process the first range, like any other.
10546 $low = $this_range->start;
10547 $high = $this_range->end;
10549 } # End of $compare_versions
10551 # If changing to a new property, get the things constant per
10553 if ($previous_property_name ne $property_name) {
10555 $property_object = property_ref($property_name);
10556 if (! defined $property_object) {
10557 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
10560 { no overloading; $property_addr = pack 'J', $property_object; }
10562 # Defer changing names until have a line that is acceptable
10563 # (the 'next' statement above means is unacceptable)
10564 $previous_property_name = $property_name;
10566 # If not the first time for this property, retrieve info about
10567 # it from the cache
10568 if (defined ($property_info{$property_addr}{$TYPE})) {
10569 $property_type = $property_info{$property_addr}{$TYPE};
10570 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10572 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10574 = $property_info{$property_addr}{$DEFAULT_TABLE};
10578 # Here, is the first time for this property. Set up the
10580 $property_type = $property_info{$property_addr}{$TYPE}
10581 = $property_object->type;
10583 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10584 = $property_object->pseudo_map_type;
10586 # The Unicode files are set up so that if the map is not
10587 # defined, it is a binary property
10588 if (! defined $map && $property_type != $BINARY) {
10589 if ($property_type != $UNKNOWN
10590 && $property_type != $NON_STRING)
10592 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
10595 $property_object->set_type($BINARY);
10597 = $property_info{$property_addr}{$TYPE}
10602 # Get any @missings default for this property. This
10603 # should precede the first entry for the property in the
10604 # input file, and is located in a comment that has been
10605 # stored by the Input_file class until we access it here.
10606 # It's possible that there is more than one such line
10607 # waiting for us; collect them all, and parse
10608 my @missings_list = $file->get_missings
10609 if $file->has_missings_defaults;
10610 foreach my $default_ref (@missings_list) {
10611 my $default = $default_ref->[0];
10612 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10614 # For string properties, the default is just what the
10615 # file says, but non-string properties should already
10616 # have set up a table for the default property value;
10617 # use the table for these, so can resolve synonyms
10618 # later to a single standard one.
10619 if ($property_type == $STRING
10620 || $property_type == $UNKNOWN)
10622 $property_info{$addr}{$MISSINGS} = $default;
10625 $property_info{$addr}{$MISSINGS}
10626 = $property_object->table($default);
10630 # Finished storing all the @missings defaults in the input
10631 # file so far. Get the one for the current property.
10632 my $missings = $property_info{$property_addr}{$MISSINGS};
10634 # But we likely have separately stored what the default
10635 # should be. (This is to accommodate versions of the
10636 # standard where the @missings lines are absent or
10637 # incomplete.) Hopefully the two will match. But check
10639 $default_map = $property_object->default_map;
10641 # If the map is a ref, it means that the default won't be
10642 # processed until later, so undef it, so next few lines
10643 # will redefine it to something that nothing will match
10644 undef $default_map if ref $default_map;
10646 # Create a $default_map if don't have one; maybe a dummy
10647 # that won't match anything.
10648 if (! defined $default_map) {
10650 # Use any @missings line in the file.
10651 if (defined $missings) {
10652 if (ref $missings) {
10653 $default_map = $missings->full_name;
10654 $default_table = $missings;
10657 $default_map = $missings;
10660 # And store it with the property for outside use.
10661 $property_object->set_default_map($default_map);
10665 # Neither an @missings nor a default map. Create
10666 # a dummy one, so won't have to test definedness
10667 # in the main loop.
10668 $default_map = '_Perl This will never be in a file
10673 # Here, we have $default_map defined, possibly in terms of
10674 # $missings, but maybe not, and possibly is a dummy one.
10675 if (defined $missings) {
10677 # Make sure there is no conflict between the two.
10678 # $missings has priority.
10679 if (ref $missings) {
10681 = $property_object->table($default_map);
10682 if (! defined $default_table
10683 || $default_table != $missings)
10685 if (! defined $default_table) {
10686 $default_table = $UNDEF;
10688 $file->carp_bad_line(<<END
10689 The \@missings line for $property_name in $file says that missings default to
10690 $missings, but we expect it to be $default_table. $missings used.
10693 $default_table = $missings;
10694 $default_map = $missings->full_name;
10696 $property_info{$property_addr}{$DEFAULT_TABLE}
10699 elsif ($default_map ne $missings) {
10700 $file->carp_bad_line(<<END
10701 The \@missings line for $property_name in $file says that missings default to
10702 $missings, but we expect it to be $default_map. $missings used.
10705 $default_map = $missings;
10709 $property_info{$property_addr}{$DEFAULT_MAP}
10712 # If haven't done so already, find the table corresponding
10713 # to this map for non-string properties.
10714 if (! defined $default_table
10715 && $property_type != $STRING
10716 && $property_type != $UNKNOWN)
10718 $default_table = $property_info{$property_addr}
10720 = $property_object->table($default_map);
10722 } # End of is first time for this property
10723 } # End of switching properties.
10725 # Ready to process the line.
10726 # The Unicode files are set up so that if the map is not defined,
10727 # it is a binary property with value 'Y'
10728 if (! defined $map) {
10733 # If the map begins with a special command to us (enclosed in
10734 # delimiters), extract the command(s).
10735 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10737 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
10740 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
10744 $file->carp_bad_line("Unknown command line: '$1'");
10750 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10753 # Here, we have a map to a particular code point, and the
10754 # default map is to a code point itself. If the range
10755 # includes the particular code point, change that portion of
10756 # the range to the default. This makes sure that in the final
10757 # table only the non-defaults are listed.
10758 my $decimal_map = hex $map;
10759 if ($low <= $decimal_map && $decimal_map <= $high) {
10761 # If the range includes stuff before or after the map
10762 # we're changing, split it and process the split-off parts
10764 if ($low < $decimal_map) {
10765 $file->insert_adjusted_lines(
10766 sprintf("%04X..%04X; %s; %s",
10772 if ($high > $decimal_map) {
10773 $file->insert_adjusted_lines(
10774 sprintf("%04X..%04X; %s; %s",
10780 $low = $high = $decimal_map;
10781 $map = $CODE_POINT;
10785 # If we can tell that this is a synonym for the default map, use
10786 # the default one instead.
10787 if ($property_type != $STRING
10788 && $property_type != $UNKNOWN)
10790 my $table = $property_object->table($map);
10791 if (defined $table && $table == $default_table) {
10792 $map = $default_map;
10796 # And figure out the map type if not known.
10797 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10798 if ($map eq "") { # Nulls are always $NULL map type
10800 } # Otherwise, non-strings, and those that don't allow
10801 # $MULTI_CP, and those that aren't multiple code points are
10804 (($property_type != $STRING && $property_type != $UNKNOWN)
10805 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10806 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
10811 $map_type = $MULTI_CP;
10815 $property_object->add_map($low, $high,
10818 Replace => $replace);
10819 } # End of loop through file's lines
10825 { # Closure for UnicodeData.txt handling
10827 # This file was the first one in the UCD; its design leads to some
10828 # awkwardness in processing. Here is a sample line:
10829 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10830 # The fields in order are:
10831 my $i = 0; # The code point is in field 0, and is shifted off.
10832 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
10833 my $CATEGORY = $i++; # category (e.g. "Lu")
10834 my $CCC = $i++; # Canonical combining class (e.g. "230")
10835 my $BIDI = $i++; # directional class (e.g. "L")
10836 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
10837 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
10838 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10839 # Dual-use in this program; see below
10840 my $NUMERIC = $i++; # numeric value
10841 my $MIRRORED = $i++; # ? mirrored
10842 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10843 my $COMMENT = $i++; # iso comment
10844 my $UPPER = $i++; # simple uppercase mapping
10845 my $LOWER = $i++; # simple lowercase mapping
10846 my $TITLE = $i++; # simple titlecase mapping
10847 my $input_field_count = $i;
10849 # This routine in addition outputs these extra fields:
10851 my $DECOMP_TYPE = $i++; # Decomposition type
10853 # These fields are modifications of ones above, and are usually
10854 # suppressed; they must come last, as for speed, the loop upper bound is
10855 # normally set to ignore them
10856 my $NAME = $i++; # This is the strict name field, not the one that
10858 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
10859 # by Unicode::Normalize
10860 my $last_field = $i - 1;
10862 # All these are read into an array for each line, with the indices defined
10863 # above. The empty fields in the example line above indicate that the
10864 # value is defaulted. The handler called for each line of the input
10865 # changes these to their defaults.
10867 # Here are the official names of the properties, in a parallel array:
10869 $field_names[$BIDI] = 'Bidi_Class';
10870 $field_names[$CATEGORY] = 'General_Category';
10871 $field_names[$CCC] = 'Canonical_Combining_Class';
10872 $field_names[$CHARNAME] = 'Perl_Charnames';
10873 $field_names[$COMMENT] = 'ISO_Comment';
10874 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10875 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10876 $field_names[$LOWER] = 'Lowercase_Mapping';
10877 $field_names[$MIRRORED] = 'Bidi_Mirrored';
10878 $field_names[$NAME] = 'Name';
10879 $field_names[$NUMERIC] = 'Numeric_Value';
10880 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10881 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10882 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10883 $field_names[$TITLE] = 'Titlecase_Mapping';
10884 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10885 $field_names[$UPPER] = 'Uppercase_Mapping';
10887 # Some of these need a little more explanation:
10888 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10889 # property, but is used in calculating the Numeric_Type. Perl however,
10890 # creates a file from this field, so a Perl property is created from it.
10891 # Similarly, the Other_Digit field is used only for calculating the
10892 # Numeric_Type, and so it can be safely re-used as the place to store
10893 # the value for Numeric_Type; hence it is referred to as
10894 # $NUMERIC_TYPE_OTHER_DIGIT.
10895 # The input field named $PERL_DECOMPOSITION is a combination of both the
10896 # decomposition mapping and its type. Perl creates a file containing
10897 # exactly this field, so it is used for that. The two properties are
10898 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10899 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
10900 # output it), as Perl doesn't use it directly.
10901 # The input field named here $CHARNAME is used to construct the
10902 # Perl_Charnames property, which is a combination of the Name property
10903 # (which the input field contains), and the Unicode_1_Name property, and
10904 # others from other files. Since, the strict Name property is not used
10905 # by Perl, this field is used for the table that Perl does use. The
10906 # strict Name property table is usually suppressed (unless the lists are
10907 # changed to output it), so it is accumulated in a separate field,
10908 # $NAME, which to save time is discarded unless the table is actually to
10911 # This file is processed like most in this program. Control is passed to
10912 # process_generic_property_file() which calls filter_UnicodeData_line()
10913 # for each input line. This filter converts the input into line(s) that
10914 # process_generic_property_file() understands. There is also a setup
10915 # routine called before any of the file is processed, and a handler for
10916 # EOF processing, all in this closure.
10918 # A huge speed-up occurred at the cost of some added complexity when these
10919 # routines were altered to buffer the outputs into ranges. Almost all the
10920 # lines of the input file apply to just one code point, and for most
10921 # properties, the map for the next code point up is the same as the
10922 # current one. So instead of creating a line for each property for each
10923 # input line, filter_UnicodeData_line() remembers what the previous map
10924 # of a property was, and doesn't generate a line to pass on until it has
10925 # to, as when the map changes; and that passed-on line encompasses the
10926 # whole contiguous range of code points that have the same map for that
10927 # property. This means a slight amount of extra setup, and having to
10928 # flush these buffers on EOF, testing if the maps have changed, plus
10929 # remembering state information in the closure. But it means a lot less
10930 # real time in not having to change the data base for each property on
10933 # Another complication is that there are already a few ranges designated
10934 # in the input. There are two lines for each, with the same maps except
10935 # the code point and name on each line. This was actually the hardest
10936 # thing to design around. The code points in those ranges may actually
10937 # have real maps not given by these two lines. These maps will either
10938 # be algorithmically determinable, or be in the extracted files furnished
10939 # with the UCD. In the event of conflicts between these extracted files,
10940 # and this one, Unicode says that this one prevails. But it shouldn't
10941 # prevail for conflicts that occur in these ranges. The data from the
10942 # extracted files prevails in those cases. So, this program is structured
10943 # so that those files are processed first, storing maps. Then the other
10944 # files are processed, generally overwriting what the extracted files
10945 # stored. But just the range lines in this input file are processed
10946 # without overwriting. This is accomplished by adding a special string to
10947 # the lines output to tell process_generic_property_file() to turn off the
10948 # overwriting for just this one line.
10949 # A similar mechanism is used to tell it that the map is of a non-default
10952 sub setup_UnicodeData { # Called before any lines of the input are read
10954 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10956 # Create a new property specially located that is a combination of the
10957 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10958 # Name_Alias properties. (The final duplicates elements of the
10959 # first.) A comment for it will later be constructed based on the
10960 # actual properties present and used
10961 $perl_charname = Property->new('Perl_Charnames',
10963 Directory => File::Spec->curdir(),
10965 Fate => $INTERNAL_ONLY,
10966 Perl_Extension => 1,
10967 Range_Size_1 => \&output_perl_charnames_line,
10970 $perl_charname->set_proxy_for('Name');
10972 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10973 Directory => File::Spec->curdir(),
10974 File => 'Decomposition',
10975 Format => $DECOMP_STRING_FORMAT,
10976 Fate => $INTERNAL_ONLY,
10977 Perl_Extension => 1,
10978 Default_Map => $CODE_POINT,
10980 # normalize.pm can't cope with these
10981 Output_Range_Counts => 0,
10983 # This is a specially formatted table
10984 # explicitly for normalize.pm, which
10985 # is expecting a particular format,
10986 # which means that mappings containing
10987 # multiple code points are in the main
10988 # body of the table
10989 Map_Type => $COMPUTE_NO_MULTI_CP,
10991 To_Output_Map => $INTERNAL_MAP,
10993 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10994 $Perl_decomp->add_comment(join_lines(<<END
10995 This mapping is a combination of the Unicode 'Decomposition_Type' and
10996 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
10997 identical to the official Unicode 'Decomposition_Mapping' property except for
10999 1) It omits the algorithmically determinable Hangul syllable decompositions,
11000 which normalize.pm handles algorithmically.
11001 2) It contains the decomposition type as well. Non-canonical decompositions
11002 begin with a word in angle brackets, like <super>, which denotes the
11003 compatible decomposition type. If the map does not begin with the <angle
11004 brackets>, the decomposition is canonical.
11008 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11010 Perl_Extension => 1,
11011 Directory => $map_directory,
11013 To_Output_Map => $OUTPUT_ADJUSTED,
11015 $Decimal_Digit->add_comment(join_lines(<<END
11016 This file gives the mapping of all code points which represent a single
11017 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11018 points, and the mapping of each non-initial element of each range is actually
11019 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11020 These code points are those that have Numeric_Type=Decimal; not special
11021 things, like subscripts nor Roman numerals.
11025 # These properties are not used for generating anything else, and are
11026 # usually not output. By making them last in the list, we can just
11027 # change the high end of the loop downwards to avoid the work of
11028 # generating a table(s) that is/are just going to get thrown away.
11029 if (! property_ref('Decomposition_Mapping')->to_output_map
11030 && ! property_ref('Name')->to_output_map)
11032 $last_field = min($NAME, $DECOMP_MAP) - 1;
11033 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11034 $last_field = $DECOMP_MAP;
11035 } elsif (property_ref('Name')->to_output_map) {
11036 $last_field = $NAME;
11041 my $first_time = 1; # ? Is this the first line of the file
11042 my $in_range = 0; # ? Are we in one of the file's ranges
11043 my $previous_cp; # hex code point of previous line
11044 my $decimal_previous_cp = -1; # And its decimal equivalent
11045 my @start; # For each field, the current starting
11046 # code point in hex for the range
11047 # being accumulated.
11048 my @fields; # The input fields;
11049 my @previous_fields; # And those from the previous call
11051 sub filter_UnicodeData_line {
11052 # Handle a single input line from UnicodeData.txt; see comments above
11053 # Conceptually this takes a single line from the file containing N
11054 # properties, and converts it into N lines with one property per line,
11055 # which is what the final handler expects. But there are
11056 # complications due to the quirkiness of the input file, and to save
11057 # time, it accumulates ranges where the property values don't change
11058 # and only emits lines when necessary. This is about an order of
11059 # magnitude fewer lines emitted.
11062 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11064 # $_ contains the input line.
11065 # -1 in split means retain trailing null fields
11066 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11068 #local $to_trace = 1 if main::DEBUG;
11069 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11070 if (@fields > $input_field_count) {
11071 $file->carp_bad_line('Extra fields');
11076 my $decimal_cp = hex $cp;
11078 # We have to output all the buffered ranges when the next code point
11079 # is not exactly one after the previous one, which means there is a
11080 # gap in the ranges.
11081 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11083 # The decomposition mapping field requires special handling. It looks
11086 # <compat> 0032 0020
11089 # The decomposition type is enclosed in <brackets>; if missing, it
11090 # means the type is canonical. There are two decomposition mapping
11091 # tables: the one for use by Perl's normalize.pm has a special format
11092 # which is this field intact; the other, for general use is of
11093 # standard format. In either case we have to find the decomposition
11094 # type. Empty fields have None as their type, and map to the code
11096 if ($fields[$PERL_DECOMPOSITION] eq "") {
11097 $fields[$DECOMP_TYPE] = 'None';
11098 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11101 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11102 =~ / < ( .+? ) > \s* ( .+ ) /x;
11103 if (! defined $fields[$DECOMP_TYPE]) {
11104 $fields[$DECOMP_TYPE] = 'Canonical';
11105 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11108 $fields[$DECOMP_MAP] = $map;
11112 # The 3 numeric fields also require special handling. The 2 digit
11113 # fields must be either empty or match the number field. This means
11114 # that if it is empty, they must be as well, and the numeric type is
11115 # None, and the numeric value is 'Nan'.
11116 # The decimal digit field must be empty or match the other digit
11117 # field. If the decimal digit field is non-empty, the code point is
11118 # a decimal digit, and the other two fields will have the same value.
11119 # If it is empty, but the other digit field is non-empty, the code
11120 # point is an 'other digit', and the number field will have the same
11121 # value as the other digit field. If the other digit field is empty,
11122 # but the number field is non-empty, the code point is a generic
11124 if ($fields[$NUMERIC] eq "") {
11125 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11126 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11128 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
11130 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11131 $fields[$NUMERIC] = 'NaN';
11134 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11135 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11136 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11137 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11138 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11140 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11141 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11142 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11145 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11147 # Rationals require extra effort.
11148 register_fraction($fields[$NUMERIC])
11149 if $fields[$NUMERIC] =~ qr{/};
11153 # For the properties that have empty fields in the file, and which
11154 # mean something different from empty, change them to that default.
11155 # Certain fields just haven't been empty so far in any Unicode
11156 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11157 # $CATEGORY. This leaves just the two fields, and so we hard-code in
11158 # the defaults; which are very unlikely to ever change.
11159 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11160 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11162 # UAX44 says that if title is empty, it is the same as whatever upper
11164 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11166 # There are a few pairs of lines like:
11167 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11168 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11169 # that define ranges. These should be processed after the fields are
11170 # adjusted above, as they may override some of them; but mostly what
11171 # is left is to possibly adjust the $CHARNAME field. The names of all the
11172 # paired lines start with a '<', but this is also true of '<control>,
11173 # which isn't one of these special ones.
11174 if ($fields[$CHARNAME] eq '<control>') {
11176 # Some code points in this file have the pseudo-name
11177 # '<control>', but the official name for such ones is the null
11179 $fields[$NAME] = $fields[$CHARNAME] = "";
11181 # We had better not be in between range lines.
11183 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11187 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11189 # Here is a non-range line. We had better not be in between range
11192 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11195 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11197 # These are code points whose names end in their code points,
11198 # which means the names are algorithmically derivable from the
11199 # code points. To shorten the output Name file, the algorithm
11200 # for deriving these is placed in the file instead of each
11201 # code point, so they have map type $CP_IN_NAME
11202 $fields[$CHARNAME] = $CMD_DELIM
11207 . $fields[$CHARNAME];
11209 $fields[$NAME] = $fields[$CHARNAME];
11211 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11212 $fields[$CHARNAME] = $fields[$NAME] = $1;
11214 # Here we are at the beginning of a range pair.
11216 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
11220 # Because the properties in the range do not overwrite any already
11221 # in the db, we must flush the buffers of what's already there, so
11222 # they get handled in the normal scheme.
11226 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11227 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
11231 else { # Here, we are at the last line of a range pair.
11234 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
11240 $fields[$NAME] = $fields[$CHARNAME];
11242 # Check that the input is valid: that the closing of the range is
11243 # the same as the beginning.
11244 foreach my $i (0 .. $last_field) {
11245 next if $fields[$i] eq $previous_fields[$i];
11246 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
11249 # The processing differs depending on the type of range,
11250 # determined by its $CHARNAME
11251 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11253 # Check that the data looks right.
11254 if ($decimal_previous_cp != $SBase) {
11255 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
11257 if ($decimal_cp != $SBase + $SCount - 1) {
11258 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
11261 # The Hangul syllable range has a somewhat complicated name
11262 # generation algorithm. Each code point in it has a canonical
11263 # decomposition also computable by an algorithm. The
11264 # perl decomposition map table built from these is used only
11265 # by normalize.pm, which has the algorithm built in it, so the
11266 # decomposition maps are not needed, and are large, so are
11267 # omitted from it. If the full decomposition map table is to
11268 # be output, the decompositions are generated for it, in the
11269 # EOF handling code for this input file.
11271 $previous_fields[$DECOMP_TYPE] = 'Canonical';
11273 # This range is stored in our internal structure with its
11274 # own map type, different from all others.
11275 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11281 . $fields[$CHARNAME];
11283 elsif ($fields[$CHARNAME] =~ /^CJK/) {
11285 # The name for these contains the code point itself, and all
11286 # are defined to have the same base name, regardless of what
11287 # is in the file. They are stored in our internal structure
11288 # with a map type of $CP_IN_NAME
11289 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11295 . 'CJK UNIFIED IDEOGRAPH';
11298 elsif ($fields[$CATEGORY] eq 'Co'
11299 || $fields[$CATEGORY] eq 'Cs')
11301 # The names of all the code points in these ranges are set to
11302 # null, as there are no names for the private use and
11303 # surrogate code points.
11305 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11308 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
11311 # The first line of the range caused everything else to be output,
11312 # and then its values were stored as the beginning values for the
11313 # next set of ranges, which this one ends. Now, for each value,
11314 # add a command to tell the handler that these values should not
11315 # replace any existing ones in our database.
11316 foreach my $i (0 .. $last_field) {
11317 $previous_fields[$i] = $CMD_DELIM
11322 . $previous_fields[$i];
11325 # And change things so it looks like the entire range has been
11326 # gone through with this being the final part of it. Adding the
11327 # command above to each field will cause this range to be flushed
11328 # during the next iteration, as it guaranteed that the stored
11329 # field won't match whatever value the next one has.
11330 $previous_cp = $cp;
11331 $decimal_previous_cp = $decimal_cp;
11333 # We are now set up for the next iteration; so skip the remaining
11334 # code in this subroutine that does the same thing, but doesn't
11335 # know about these ranges.
11341 # On the very first line, we fake it so the code below thinks there is
11342 # nothing to output, and initialize so that when it does get output it
11343 # uses the first line's values for the lowest part of the range.
11344 # (One could avoid this by using peek(), but then one would need to
11345 # know the adjustments done above and do the same ones in the setup
11346 # routine; not worth it)
11349 @previous_fields = @fields;
11350 @start = ($cp) x scalar @fields;
11351 $decimal_previous_cp = $decimal_cp - 1;
11354 # For each field, output the stored up ranges that this code point
11355 # doesn't fit in. Earlier we figured out if all ranges should be
11356 # terminated because of changing the replace or map type styles, or if
11357 # there is a gap between this new code point and the previous one, and
11358 # that is stored in $force_output. But even if those aren't true, we
11359 # need to output the range if this new code point's value for the
11360 # given property doesn't match the stored range's.
11361 #local $to_trace = 1 if main::DEBUG;
11362 foreach my $i (0 .. $last_field) {
11363 my $field = $fields[$i];
11364 if ($force_output || $field ne $previous_fields[$i]) {
11366 # Flush the buffer of stored values.
11367 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11369 # Start a new range with this code point and its value
11371 $previous_fields[$i] = $field;
11375 # Set the values for the next time.
11376 $previous_cp = $cp;
11377 $decimal_previous_cp = $decimal_cp;
11379 # The input line has generated whatever adjusted lines are needed, and
11380 # should not be looked at further.
11385 sub EOF_UnicodeData {
11386 # Called upon EOF to flush the buffers, and create the Hangul
11387 # decomposition mappings if needed.
11390 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11392 # Flush the buffers.
11393 foreach my $i (0 .. $last_field) {
11394 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11397 if (-e 'Jamo.txt') {
11399 # The algorithm is published by Unicode, based on values in
11400 # Jamo.txt, (which should have been processed before this
11401 # subroutine), and the results left in %Jamo
11403 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
11407 # If the full decomposition map table is being output, insert
11408 # into it the Hangul syllable mappings. This is to avoid having
11409 # to publish a subroutine in it to compute them. (which would
11410 # essentially be this code.) This uses the algorithm published by
11411 # Unicode. (No hangul syllables in version 1)
11412 if ($v_version ge v2.0.0
11413 && property_ref('Decomposition_Mapping')->to_output_map) {
11414 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11416 my $SIndex = $S - $SBase;
11417 my $L = $LBase + $SIndex / $NCount;
11418 my $V = $VBase + ($SIndex % $NCount) / $TCount;
11419 my $T = $TBase + $SIndex % $TCount;
11421 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11422 my $decomposition = sprintf("%04X %04X", $L, $V);
11423 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11424 $file->insert_adjusted_lines(
11425 sprintf("%04X; Decomposition_Mapping; %s",
11435 sub filter_v1_ucd {
11436 # Fix UCD lines in version 1. This is probably overkill, but this
11437 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
11438 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
11439 # removed. This program retains them
11440 # 2) didn't include ranges, which it should have, and which are now
11441 # added in @corrected_lines below. It was hand populated by
11442 # taking the data from Version 2, verified by analyzing
11444 # 3) There is a syntax error in the entry for U+09F8 which could
11445 # cause problems for utf8_heavy, and so is changed. It's
11446 # numeric value was simply a minus sign, without any number.
11447 # (Eventually Unicode changed the code point to non-numeric.)
11448 # 4) The decomposition types often don't match later versions
11449 # exactly, and the whole syntax of that field is different; so
11450 # the syntax is changed as well as the types to their later
11451 # terminology. Otherwise normalize.pm would be very unhappy
11452 # 5) Many ccc classes are different. These are left intact.
11453 # 6) U+FF10..U+FF19 are missing their numeric values in all three
11454 # fields. These are unchanged because it doesn't really cause
11455 # problems for Perl.
11456 # 7) A number of code points, such as controls, don't have their
11457 # Unicode Version 1 Names in this file. These are added.
11458 # 8) A number of Symbols were marked as Lm. This changes those in
11459 # the Latin1 range, so that regexes work.
11460 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
11461 # referred to by their lc equivalents. Not fixed.
11463 my @corrected_lines = split /\n/, <<'END';
11464 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11465 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11466 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11467 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11468 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11469 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11473 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11475 #local $to_trace = 1 if main::DEBUG;
11476 trace $_ if main::DEBUG && $to_trace;
11478 # -1 => retain trailing null fields
11479 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11481 # At the first place that is wrong in the input, insert all the
11482 # corrections, replacing the wrong line.
11483 if ($code_point eq '4E00') {
11484 my @copy = @corrected_lines;
11486 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11488 $file->insert_lines(@copy);
11490 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11492 # There are no Lm characters in Latin1; these should be 'Sk', but
11493 # there isn't that in V1.
11494 $fields[$CATEGORY] = 'So';
11497 if ($fields[$NUMERIC] eq '-') {
11498 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
11501 if ($fields[$PERL_DECOMPOSITION] ne "") {
11503 # Several entries have this change to superscript 2 or 3 in the
11504 # middle. Convert these to the modern version, which is to use
11505 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11506 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11507 # 'HHHH HHHH 00B3 HHHH'.
11508 # It turns out that all of these that don't have another
11509 # decomposition defined at the beginning of the line have the
11510 # <square> decomposition in later releases.
11511 if ($code_point ne '00B2' && $code_point ne '00B3') {
11512 if ($fields[$PERL_DECOMPOSITION]
11513 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11515 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11516 $fields[$PERL_DECOMPOSITION] = '<square> '
11517 . $fields[$PERL_DECOMPOSITION];
11522 # If is like '<+circled> 0052 <-circled>', convert to
11524 $fields[$PERL_DECOMPOSITION] =~
11525 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11527 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11528 $fields[$PERL_DECOMPOSITION] =~
11529 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11530 or $fields[$PERL_DECOMPOSITION] =~
11531 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11532 or $fields[$PERL_DECOMPOSITION] =~
11533 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11534 or $fields[$PERL_DECOMPOSITION] =~
11535 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11537 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11538 $fields[$PERL_DECOMPOSITION] =~
11539 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11541 # Change names to modern form.
11542 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11543 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11544 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11545 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11547 # One entry has weird braces
11548 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11550 # One entry at U+2116 has an extra <sup>
11551 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
11554 $_ = join ';', $code_point, @fields;
11555 trace $_ if main::DEBUG && $to_trace;
11559 sub filter_bad_Nd_ucd {
11560 # Early versions specified a value in the decimal digit field even
11561 # though the code point wasn't a decimal digit. Clear the field in
11562 # that situation, so that the main code doesn't think it is a decimal
11565 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11566 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11567 $fields[$PERL_DECIMAL_DIGIT] = "";
11568 $_ = join ';', $code_point, @fields;
11573 my @U1_control_names = split /\n/, <<'END';
11578 END OF TRANSMISSION
11583 HORIZONTAL TABULATION
11585 VERTICAL TABULATION
11593 DEVICE CONTROL THREE
11594 DEVICE CONTROL FOUR
11595 NEGATIVE ACKNOWLEDGE
11597 END OF TRANSMISSION BLOCK
11607 BREAK PERMITTED HERE
11611 START OF SELECTED AREA
11612 END OF SELECTED AREA
11613 CHARACTER TABULATION SET
11614 CHARACTER TABULATION WITH JUSTIFICATION
11615 LINE TABULATION SET
11621 DEVICE CONTROL STRING
11627 START OF GUARDED AREA
11628 END OF GUARDED AREA
11630 SINGLE CHARACTER INTRODUCER
11631 CONTROL SEQUENCE INTRODUCER
11633 OPERATING SYSTEM COMMAND
11635 APPLICATION PROGRAM COMMAND
11638 sub filter_early_U1_names {
11639 # Very early versions did not have the Unicode_1_name field specified.
11640 # They differed in which ones were present; make sure a U1 name
11641 # exists, so that Unicode::UCD::charinfo will work
11643 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11646 # @U1_control names above are entirely positional, so we pull them out
11647 # in the exact order required, with gaps for the ones that don't have
11649 if ($code_point =~ /^00[01]/
11650 || $code_point eq '007F'
11651 || $code_point =~ /^008[2-9A-F]/
11652 || $code_point =~ /^009[0-8A-F]/)
11654 my $u1_name = shift @U1_control_names;
11655 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11656 $_ = join ';', $code_point, @fields;
11661 sub filter_v2_1_5_ucd {
11662 # A dozen entries in this 2.1.5 file had the mirrored and numeric
11663 # columns swapped; These all had mirrored be 'N'. So if the numeric
11664 # column appears to be N, swap it back.
11666 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11667 if ($fields[$NUMERIC] eq 'N') {
11668 $fields[$NUMERIC] = $fields[$MIRRORED];
11669 $fields[$MIRRORED] = 'N';
11670 $_ = join ';', $code_point, @fields;
11675 sub filter_v6_ucd {
11677 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11678 # it wasn't accepted, to allow for some deprecation cycles. This
11679 # function is not called after 5.16
11681 return if $_ !~ /^(?:0007|1F514|070F);/;
11683 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11684 if ($code_point eq '0007') {
11685 $fields[$CHARNAME] = "";
11687 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11688 # http://www.unicode.org/versions/corrigendum8.html
11689 $fields[$BIDI] = "AL";
11691 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11692 $fields[$CHARNAME] = "";
11695 $_ = join ';', $code_point, @fields;
11699 } # End closure for UnicodeData
11701 sub process_GCB_test {
11704 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11706 while ($file->next_line) {
11707 push @backslash_X_tests, $_;
11713 sub process_NamedSequences {
11714 # NamedSequences.txt entries are just added to an array. Because these
11715 # don't look like the other tables, they have their own handler.
11717 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11719 # This just adds the sequence to an array for later handling
11722 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11724 while ($file->next_line) {
11725 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11727 $file->carp_bad_line(
11728 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11732 # Note single \t in keeping with special output format of
11733 # Perl_charnames. But it turns out that the code points don't have to
11734 # be 5 digits long, like the rest, based on the internal workings of
11735 # charnames.pm. This could be easily changed for consistency.
11736 push @named_sequences, "$sequence\t$name";
11745 sub filter_early_ea_lb {
11746 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
11747 # third field be the name of the code point, which can be ignored in
11748 # most cases. But it can be meaningful if it marks a range:
11749 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11750 # 3400;W;<CJK Ideograph Extension A, First>
11752 # We need to see the First in the example above to know it's a range.
11753 # They did not use the later range syntaxes. This routine changes it
11754 # to use the modern syntax.
11755 # $1 is the Input_file object.
11757 my @fields = split /\s*;\s*/;
11758 if ($fields[2] =~ /^<.*, First>/) {
11759 $first_range = $fields[0];
11762 elsif ($fields[2] =~ /^<.*, Last>/) {
11763 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11766 undef $first_range;
11767 $_ = "$fields[0]; $fields[1]";
11774 sub filter_old_style_arabic_shaping {
11775 # Early versions used a different term for the later one.
11777 my @fields = split /\s*;\s*/;
11778 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11779 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
11780 $_ = join ';', @fields;
11785 my $lc; # Table for lowercase mapping
11788 my %special_casing_code_points;
11790 sub setup_special_casing {
11791 # SpecialCasing.txt contains the non-simple case change mappings. The
11792 # simple ones are in UnicodeData.txt, which should already have been
11793 # read in to the full property data structures, so as to initialize
11794 # these with the simple ones. Then the SpecialCasing.txt entries
11795 # add or overwrite the ones which have different full mappings.
11797 # This routine sees if the simple mappings are to be output, and if
11798 # so, copies what has already been put into the full mapping tables,
11799 # while they still contain only the simple mappings.
11801 # The reason it is done this way is that the simple mappings are
11802 # probably not going to be output, so it saves work to initialize the
11803 # full tables with the simple mappings, and then overwrite those
11804 # relatively few entries in them that have different full mappings,
11805 # and thus skip the simple mapping tables altogether.
11808 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11810 $lc = property_ref('lc');
11811 $tc = property_ref('tc');
11812 $uc = property_ref('uc');
11814 # For each of the case change mappings...
11815 foreach my $full_casing_table ($lc, $tc, $uc) {
11816 my $full_casing_name = $full_casing_table->name;
11817 my $full_casing_full_name = $full_casing_table->full_name;
11818 unless (defined $full_casing_table
11819 && ! $full_casing_table->is_empty)
11821 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
11824 # Create a table in the old-style format and with the original
11825 # file name for backwards compatibility with applications that
11826 # read it directly. The new tables contain both the simple and
11827 # full maps, and the old are missing simple maps when there is a
11828 # conflicting full one. Probably it would have been ok to add
11829 # those to the legacy version, as was already done in 5.14 to the
11830 # case folding one, but this was not done, out of an abundance of
11831 # caution. The tables are set up here before we deal with the
11832 # full maps so that as we handle those, we can override the simple
11833 # maps for them in the legacy table, and merely add them in the
11835 my $legacy = Property->new("Legacy_" . $full_casing_full_name,
11836 File => $full_casing_full_name
11837 =~ s/case_Mapping//r,
11838 Format => $HEX_FORMAT,
11839 Default_Map => $CODE_POINT,
11840 Initialize => $full_casing_table,
11841 Replacement_Property => $full_casing_full_name,
11844 $full_casing_table->add_comment(join_lines( <<END
11845 This file includes both the simple and full case changing maps. The simple
11846 ones are in the main body of the table below, and the full ones adding to or
11847 overriding them are in the hash.
11851 # The simple version's name in each mapping merely has an 's' in
11852 # front of the full one's
11853 my $simple_name = 's' . $full_casing_name;
11854 my $simple = property_ref($simple_name);
11855 $simple->initialize($full_casing_table) if $simple->to_output_map();
11861 sub filter_2_1_8_special_casing_line {
11863 # This version had duplicate entries in this file. Delete all but the
11865 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11867 if (exists $special_casing_code_points{$fields[0]}) {
11872 $special_casing_code_points{$fields[0]} = 1;
11873 filter_special_casing_line(@_);
11876 sub filter_special_casing_line {
11877 # Change the format of $_ from SpecialCasing.txt into something that
11878 # the generic handler understands. Each input line contains three
11879 # case mappings. This will generate three lines to pass to the
11880 # generic handler for each of those.
11882 # The input syntax (after stripping comments and trailing white space
11883 # is like one of the following (with the final two being entries that
11885 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11886 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11887 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11888 # Note the trailing semi-colon, unlike many of the input files. That
11889 # means that there will be an extra null field generated by the split
11892 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11894 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11897 # field #4 is when this mapping is conditional. If any of these get
11898 # implemented, it would be by hard-coding in the casing functions in
11899 # the Perl core, not through tables. But if there is a new condition
11900 # we don't know about, output a warning. We know about all the
11901 # conditions through 6.0
11902 if ($fields[4] ne "") {
11903 my @conditions = split ' ', $fields[4];
11904 if ($conditions[0] ne 'tr' # We know that these languages have
11905 # conditions, and some are multiple
11906 && $conditions[0] ne 'az'
11907 && $conditions[0] ne 'lt'
11909 # And, we know about a single condition Final_Sigma, but
11911 && ($v_version gt v5.2.0
11912 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11914 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11916 elsif ($conditions[0] ne 'Final_Sigma') {
11918 # Don't print out a message for Final_Sigma, because we
11919 # have hard-coded handling for it. (But the standard
11920 # could change what the rule should be, but it wouldn't
11921 # show up here anyway.
11923 print "# SKIPPING Special Casing: $_\n"
11924 if $verbosity >= $VERBOSE;
11929 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11930 $file->carp_bad_line('Extra fields');
11935 my $decimal_code_point = hex $fields[0];
11937 # Loop to handle each of the three mappings in the input line, in
11938 # order, with $i indicating the current field number.
11940 for my $object ($lc, $tc, $uc) {
11941 $i++; # First time through, $i = 0 ... 3rd time = 3
11943 my $value = $object->value_of($decimal_code_point);
11944 $value = ($value eq $CODE_POINT)
11945 ? $decimal_code_point
11948 # If this isn't a multi-character mapping, it should already have
11950 if ($fields[$i] !~ / /) {
11951 if ($value != hex $fields[$i]) {
11952 Carp::my_carp("Bad news. UnicodeData.txt thinks "
11954 . "(0x$fields[0]) is $value"
11955 . " and SpecialCasing.txt thinks it is "
11957 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
11962 # The mapping goes into both the legacy table, in which it
11963 # replaces the simple one...
11964 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11965 . $object->full_name
11966 . "; $fields[$i]");
11968 # ... and the regular table, in which it is additional,
11969 # beyond the simple mapping.
11970 $file->insert_adjusted_lines("$fields[0]; "
11974 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11980 # Everything has been handled by the insert_adjusted_lines()
11987 sub filter_old_style_case_folding {
11988 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11989 # and later style. Different letters were used in the earlier.
11992 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11994 my @fields = split /\s*;\s*/;
11995 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11998 elsif ($fields[1] eq 'L') {
11999 $fields[1] = 'C'; # L => C always
12001 elsif ($fields[1] eq 'E') {
12002 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
12010 $file->carp_bad_line("Expecting L or E in second field");
12014 $_ = join("; ", @fields) . ';';
12018 { # Closure for case folding
12020 # Create the map for simple only if are going to output it, for otherwise
12021 # it takes no part in anything we do.
12022 my $to_output_simple;
12024 sub setup_case_folding($) {
12025 # Read in the case foldings in CaseFolding.txt. This handles both
12026 # simple and full case folding.
12029 = property_ref('Simple_Case_Folding')->to_output_map;
12031 if (! $to_output_simple) {
12032 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12035 # If we ever wanted to show that these tables were combined, a new
12036 # property method could be created, like set_combined_props()
12037 property_ref('Case_Folding')->add_comment(join_lines( <<END
12038 This file includes both the simple and full case folding maps. The simple
12039 ones are in the main body of the table below, and the full ones adding to or
12040 overriding them are in the hash.
12046 sub filter_case_folding_line {
12047 # Called for each line in CaseFolding.txt
12048 # Input lines look like:
12049 # 0041; C; 0061; # LATIN CAPITAL LETTER A
12050 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12051 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12053 # 'C' means that folding is the same for both simple and full
12054 # 'F' that it is only for full folding
12055 # 'S' that it is only for simple folding
12056 # 'T' is locale-dependent, and ignored
12057 # 'I' is a type of 'F' used in some early releases.
12058 # Note the trailing semi-colon, unlike many of the input files. That
12059 # means that there will be an extra null field generated by the split
12060 # below, which we ignore and hence is not an error.
12063 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12065 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12066 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12067 $file->carp_bad_line('Extra fields');
12072 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
12077 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12078 # I are all full foldings; S is single-char. For S, there is always
12079 # an F entry, so we must allow multiple values for the same code
12080 # point. Fortunately this table doesn't need further manipulation
12081 # which would preclude using multiple-values. The S is now included
12082 # so that _swash_inversion_hash() is able to construct closures
12083 # without having to worry about F mappings.
12084 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12085 $_ = "$range; Case_Folding; "
12086 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12090 $file->carp_bad_line('Expecting C F I S or T in second field');
12093 # C and S are simple foldings, but simple case folding is not needed
12094 # unless we explicitly want its map table output.
12095 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12096 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12102 } # End case fold closure
12104 sub filter_jamo_line {
12105 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
12106 # from this file that is used in generating the Name property for Jamo
12107 # code points. But, it also is used to convert early versions' syntax
12108 # into the modern form. Here are two examples:
12109 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
12110 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
12112 # The input is $_, the output is $_ filtered.
12114 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12116 # Let the caller handle unexpected input. In earlier versions, there was
12117 # a third field which is supposed to be a comment, but did not have a '#'
12119 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12121 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
12124 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
12125 $fields[1] = 'R' if $fields[0] eq '1105';
12127 # Add to structure so can generate Names from it.
12128 my $cp = hex $fields[0];
12129 my $short_name = $fields[1];
12130 $Jamo{$cp} = $short_name;
12131 if ($cp <= $LBase + $LCount) {
12132 $Jamo_L{$short_name} = $cp - $LBase;
12134 elsif ($cp <= $VBase + $VCount) {
12135 $Jamo_V{$short_name} = $cp - $VBase;
12137 elsif ($cp <= $TBase + $TCount) {
12138 $Jamo_T{$short_name} = $cp - $TBase;
12141 Carp::my_carp_bug("Unexpected Jamo code point in $_");
12145 # Reassemble using just the first two fields to look like a typical
12146 # property file line
12147 $_ = "$fields[0]; $fields[1]";
12152 sub register_fraction($) {
12153 # This registers the input rational number so that it can be passed on to
12154 # utf8_heavy.pl, both in rational and floating forms.
12156 my $rational = shift;
12158 my $float = eval $rational;
12159 $nv_floating_to_rational{$float} = $rational;
12163 sub filter_numeric_value_line {
12164 # DNumValues contains lines of a different syntax than the typical
12166 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
12168 # This routine transforms $_ containing the anomalous syntax to the
12169 # typical, by filtering out the extra columns, and convert early version
12170 # decimal numbers to strings that look like rational numbers.
12173 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12175 # Starting in 5.1, there is a rational field. Just use that, omitting the
12176 # extra columns. Otherwise convert the decimal number in the second field
12177 # to a rational, and omit extraneous columns.
12178 my @fields = split /\s*;\s*/, $_, -1;
12181 if ($v_version ge v5.1.0) {
12182 if (@fields != 4) {
12183 $file->carp_bad_line('Not 4 semi-colon separated fields');
12187 $rational = $fields[3];
12188 $_ = join '; ', @fields[ 0, 3 ];
12192 # Here, is an older Unicode file, which has decimal numbers instead of
12193 # rationals in it. Use the fraction to calculate the denominator and
12194 # convert to rational.
12196 if (@fields != 2 && @fields != 3) {
12197 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12202 my $codepoints = $fields[0];
12203 my $decimal = $fields[1];
12204 if ($decimal =~ s/\.0+$//) {
12206 # Anything ending with a decimal followed by nothing but 0's is an
12208 $_ = "$codepoints; $decimal";
12209 $rational = $decimal;
12214 if ($decimal =~ /\.50*$/) {
12218 # Here have the hardcoded repeating decimals in the fraction, and
12219 # the denominator they imply. There were only a few denominators
12220 # in the older Unicode versions of this file which this code
12221 # handles, so it is easy to convert them.
12223 # The 4 is because of a round-off error in the Unicode 3.2 files
12224 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12227 elsif ($decimal =~ /\.[27]50*$/) {
12230 elsif ($decimal =~ /\.[2468]0*$/) {
12233 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12236 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12239 if ($denominator) {
12240 my $sign = ($decimal < 0) ? "-" : "";
12241 my $numerator = int((abs($decimal) * $denominator) + .5);
12242 $rational = "$sign$numerator/$denominator";
12243 $_ = "$codepoints; $rational";
12246 $file->carp_bad_line("Can't cope with number '$decimal'.");
12253 register_fraction($rational) if $rational =~ qr{/};
12258 my %unihan_properties;
12261 # Do any special setup for Unihan properties.
12263 # This property gives the wrong computed type, so override.
12264 my $usource = property_ref('kIRG_USource');
12265 $usource->set_type($STRING) if defined $usource;
12267 # This property is to be considered binary (it says so in
12268 # http://www.unicode.org/reports/tr38/)
12269 my $iicore = property_ref('kIICore');
12270 if (defined $iicore) {
12271 $iicore->set_type($FORCED_BINARY);
12272 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
12274 # Unicode doesn't include the maps for this property, so don't
12275 # warn that they are missing.
12276 $iicore->set_pre_declared_maps(0);
12277 $iicore->add_comment(join_lines( <<END
12278 This property contains enum values, but Unicode UAX #38 says it should be
12279 interpreted as binary, so Perl creates tables for both 1) its enum values,
12280 plus 2) true/false tables in which it is considered true for all code points
12281 that have a non-null value
12289 sub filter_unihan_line {
12290 # Change unihan db lines to look like the others in the db. Here is
12292 # U+341C kCangjie IEKN
12294 # Tabs are used instead of semi-colons to separate fields; therefore
12295 # they may have semi-colons embedded in them. Change these to periods
12296 # so won't screw up the rest of the code.
12299 # Remove lines that don't look like ones we accept.
12300 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12305 # Extract the property, and save a reference to its object.
12307 if (! exists $unihan_properties{$property}) {
12308 $unihan_properties{$property} = property_ref($property);
12311 # Don't do anything unless the property is one we're handling, which
12312 # we determine by seeing if there is an object defined for it or not
12313 if (! defined $unihan_properties{$property}) {
12318 # Convert the tab separators to our standard semi-colons, and convert
12319 # the U+HHHH notation to the rest of the standard's HHHH
12321 s/\b U \+ (?= $code_point_re )//xg;
12323 #local $to_trace = 1 if main::DEBUG;
12324 trace $_ if main::DEBUG && $to_trace;
12330 sub filter_blocks_lines {
12331 # In the Blocks.txt file, the names of the blocks don't quite match the
12332 # names given in PropertyValueAliases.txt, so this changes them so they
12333 # do match: Blanks and hyphens are changed into underscores. Also makes
12334 # early release versions look like later ones
12336 # $_ is transformed to the correct value.
12339 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12341 if ($v_version lt v3.2.0) {
12342 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12347 # Old versions used a different syntax to mark the range.
12348 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12351 my @fields = split /\s*;\s*/, $_, -1;
12352 if (@fields != 2) {
12353 $file->carp_bad_line("Expecting exactly two fields");
12358 # Change hyphens and blanks in the block name field only
12359 $fields[1] =~ s/[ -]/_/g;
12360 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
12362 $_ = join("; ", @fields);
12367 my $current_property;
12369 sub filter_old_style_proplist {
12370 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
12371 # was in a completely different syntax. Ken Whistler of Unicode says
12372 # that it was something he used as an aid for his own purposes, but
12373 # was never an official part of the standard. Many of the properties
12374 # in it were incorporated into the later PropList.txt, but some were
12375 # not. This program uses this early file to generate property tables
12376 # that are otherwise not accessible in the early UCD's. It does this
12377 # for the ones that eventually became official, and don't appear to be
12378 # too different in their contents from the later official version, and
12379 # throws away the rest. It could be argued that the ones it generates
12380 # were probably not really official at that time, so should be
12381 # ignored. You can easily modify things to skip all of them by
12382 # changing this function to just set $_ to "", and return; and to skip
12383 # certain of them by by simply removing their declarations from
12384 # get_old_property_aliases().
12386 # Here is a list of all the ones that are thrown away:
12387 # Alphabetic The definitions for this are very
12388 # defective, so better to not mislead
12389 # people into thinking it works.
12390 # Instead the Perl extension of the
12391 # same name is constructed from first
12393 # Bidi=* duplicates UnicodeData.txt
12394 # Combining never made into official property;
12396 # Composite never made into official property.
12397 # Currency Symbol duplicates UnicodeData.txt: gc=sc
12398 # Decimal Digit duplicates UnicodeData.txt: gc=nd
12399 # Delimiter never made into official property;
12401 # Format Control never made into official property;
12403 # High Surrogate duplicates Blocks.txt
12404 # Ignorable Control never made into official property;
12406 # ISO Control duplicates UnicodeData.txt: gc=cc
12407 # Left of Pair never made into official property;
12408 # Line Separator duplicates UnicodeData.txt: gc=zl
12409 # Low Surrogate duplicates Blocks.txt
12410 # Non-break was actually listed as a property
12411 # in 3.2, but without any code
12412 # points. Unicode denies that this
12413 # was ever an official property
12414 # Non-spacing duplicate UnicodeData.txt: gc=mn
12415 # Numeric duplicates UnicodeData.txt: gc=cc
12416 # Paired Punctuation never made into official property;
12417 # appears to be gc=ps + gc=pe
12418 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
12419 # Private Use duplicates UnicodeData.txt: gc=co
12420 # Private Use High Surrogate duplicates Blocks.txt
12421 # Punctuation duplicates UnicodeData.txt: gc=p
12422 # Space different definition than eventual
12424 # Titlecase duplicates UnicodeData.txt: gc=lt
12425 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
12426 # Zero-width never made into official property;
12428 # Most of the properties have the same names in this file as in later
12429 # versions, but a couple do not.
12431 # This subroutine filters $_, converting it from the old style into
12432 # the new style. Here's a sample of the old-style
12434 # *******************************************
12436 # Property dump for: 0x100000A0 (Join Control)
12438 # 200C..200D (2 chars)
12440 # In the example, the property is "Join Control". It is kept in this
12441 # closure between calls to the subroutine. The numbers beginning with
12442 # 0x were internal to Ken's program that generated this file.
12444 # If this line contains the property name, extract it.
12445 if (/^Property dump for: [^(]*\((.*)\)/) {
12448 # Convert white space to underscores.
12451 # Convert the few properties that don't have the same name as
12452 # their modern counterparts
12453 s/Identifier_Part/ID_Continue/
12454 or s/Not_a_Character/NChar/;
12456 # If the name matches an existing property, use it.
12457 if (defined property_ref($_)) {
12458 trace "new property=", $_ if main::DEBUG && $to_trace;
12459 $current_property = $_;
12461 else { # Otherwise discard it
12462 trace "rejected property=", $_ if main::DEBUG && $to_trace;
12463 undef $current_property;
12465 $_ = ""; # The property is saved for the next lines of the
12466 # file, but this defining line is of no further use,
12467 # so clear it so that the caller won't process it
12470 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
12472 # Here, the input line isn't a header defining a property for the
12473 # following section, and either we aren't in such a section, or
12474 # the line doesn't look like one that defines the code points in
12475 # such a section. Ignore this line.
12480 # Here, we have a line defining the code points for the current
12481 # stashed property. Anything starting with the first blank is
12482 # extraneous. Otherwise, it should look like a normal range to
12483 # the caller. Append the property name so that it looks just like
12484 # a modern PropList entry.
12487 $_ .= "; $current_property";
12489 trace $_ if main::DEBUG && $to_trace;
12492 } # End closure for old style proplist
12494 sub filter_old_style_normalization_lines {
12495 # For early releases of Unicode, the lines were like:
12496 # 74..2A76 ; NFKD_NO
12497 # For later releases this became:
12498 # 74..2A76 ; NFKD_QC; N
12499 # Filter $_ to look like those in later releases.
12500 # Similarly for MAYBEs
12502 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
12504 # Also, the property FC_NFKC was abbreviated to FNC
12509 sub setup_script_extensions {
12510 # The Script_Extensions property starts out with a clone of the Script
12513 my $scx = property_ref("Script_Extensions");
12514 $scx = Property->new("scx", Full_Name => "Script_Extensions")
12516 $scx->_set_format($STRING_WHITE_SPACE_LIST);
12517 $scx->initialize($script);
12518 $scx->set_default_map($script->default_map);
12519 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
12520 $scx->add_comment(join_lines( <<END
12521 The values for code points that appear in one script are just the same as for
12522 the 'Script' property. Likewise the values for those that appear in many
12523 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
12524 values of code points that appear in a few scripts are a space separated list
12529 # Initialize scx's tables and the aliases for them to be the same as sc's
12530 foreach my $table ($script->tables) {
12531 my $scx_table = $scx->add_match_table($table->name,
12532 Full_Name => $table->full_name);
12533 foreach my $alias ($table->aliases) {
12534 $scx_table->add_alias($alias->name);
12539 sub filter_script_extensions_line {
12540 # The Scripts file comes with the full name for the scripts; the
12541 # ScriptExtensions, with the short name. The final mapping file is a
12542 # combination of these, and without adjustment, would have inconsistent
12543 # entries. This filters the latter file to convert to full names.
12544 # Entries look like this:
12545 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12547 my @fields = split /\s*;\s*/;
12549 # This script was erroneously omitted in this Unicode version.
12550 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12553 foreach my $short_name (split " ", $fields[1]) {
12554 push @full_names, $script->table($short_name)->full_name;
12556 $fields[1] = join " ", @full_names;
12557 $_ = join "; ", @fields;
12564 # Populates the Hangul Syllable Type property from first principles
12567 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12569 # These few ranges are hard-coded in.
12570 $file->insert_lines(split /\n/, <<'END'
12578 # The Hangul syllables in version 1 are completely different than what came
12579 # after, so just ignore them there.
12580 if ($v_version lt v2.0.0) {
12581 my $property = property_ref($file->property);
12582 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12583 push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12587 # The algorithmically derived syllables are almost all LVT ones, so
12588 # initialize the whole range with that.
12589 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12590 $SBase, $SBase + $SCount -1);
12592 # Those ones that aren't LVT are LV, and they occur at intervals of
12593 # $TCount code points, starting with the first code point, at $SBase.
12594 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12595 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12603 # Populates the Grapheme Cluster Break property from first principles
12606 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12608 # All these definitions are from
12609 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12610 # from http://www.unicode.org/reports/tr29/tr29-4.html
12612 foreach my $range ($gc->ranges) {
12614 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12616 if ($range->value =~ / ^ M [en] $ /x) {
12617 $file->insert_lines(sprintf "%04X..%04X; Extend",
12618 $range->start, $range->end);
12620 elsif ($range->value =~ / ^ C [cf] $ /x) {
12621 $file->insert_lines(sprintf "%04X..%04X; Control",
12622 $range->start, $range->end);
12625 $file->insert_lines("2028; Control"); # Line Separator
12626 $file->insert_lines("2029; Control"); # Paragraph Separator
12628 $file->insert_lines("000D; CR");
12629 $file->insert_lines("000A; LF");
12631 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12632 foreach my $code_point ( qw{
12634 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12635 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12638 my $category = $gc->value_of(hex $code_point);
12639 next if ! defined $category || $category eq 'Cn'; # But not if
12640 # unassigned in this
12642 $file->insert_lines("$code_point; Extend");
12645 my $hst = property_ref('Hangul_Syllable_Type');
12646 if ($hst->count > 0) {
12647 foreach my $range ($hst->ranges) {
12648 $file->insert_lines(sprintf "%04X..%04X; %s",
12649 $range->start, $range->end, $range->value);
12653 generate_hst($file);
12659 sub setup_early_name_alias {
12661 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12663 # This has the effect of pretending that the Name_Alias property was
12664 # available in all Unicode releases. Strictly speaking, this property
12665 # should not be availabe in early releases, but doing this allows
12666 # charnames.pm to work on older releases without change. Prior to v5.16
12667 # it had these names hard-coded inside it. Unicode 6.1 came along and
12668 # created these names, and so they were removed from charnames.
12670 my $aliases = property_ref('Name_Alias');
12671 if (! defined $aliases) {
12672 $aliases = Property->new('Name_Alias', Default_Map => "");
12675 $file->insert_lines(get_old_name_aliases());
12680 sub get_old_name_aliases () {
12682 # The Unicode_1_Name field, contains most of these names. One would
12683 # expect, given the field's name, that its values would be fixed across
12684 # versions, giving the true Unicode version 1 name for the character.
12685 # Sadly, this is not the case. Actually Version 1.1.5 had no names for
12686 # any of the controls; Version 2.0 introduced names for the C0 controls,
12687 # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
12688 # changed some names: it
12689 # changed to parenthesized versions like "NEXT LINE" to
12690 # "NEXT LINE (NEL)";
12691 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12692 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12693 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12694 # This list contains all the names that were defined so that
12695 # charnames::vianame(), etc. understand them all EVEN if this version of
12696 # Unicode didn't specify them (this could be construed as a bug).
12697 # mktables elsewhere gives preference to the Unicode_1_Name field over
12698 # these names, so that viacode() will return the correct value for that
12699 # version of Unicode, except when that version doesn't define a name,
12700 # viacode() will return one anyway (this also could be construed as a
12701 # bug). But these potential "bugs" allow for the smooth working of code
12702 # on earlier Unicode releases.
12704 my @return = split /\n/, <<'END';
12706 0000;NUL;abbreviation
12707 0001;START OF HEADING;control
12708 0001;SOH;abbreviation
12709 0002;START OF TEXT;control
12710 0002;STX;abbreviation
12711 0003;END OF TEXT;control
12712 0003;ETX;abbreviation
12713 0004;END OF TRANSMISSION;control
12714 0004;EOT;abbreviation
12715 0005;ENQUIRY;control
12716 0005;ENQ;abbreviation
12717 0006;ACKNOWLEDGE;control
12718 0006;ACK;abbreviation
12720 0007;BEL;abbreviation
12721 0008;BACKSPACE;control
12722 0008;BS;abbreviation
12723 0009;CHARACTER TABULATION;control
12724 0009;HORIZONTAL TABULATION;control
12725 0009;HT;abbreviation
12726 0009;TAB;abbreviation
12727 000A;LINE FEED;control
12728 000A;LINE FEED (LF);control
12729 000A;NEW LINE;control
12730 000A;END OF LINE;control
12731 000A;LF;abbreviation
12732 000A;NL;abbreviation
12733 000A;EOL;abbreviation
12734 000B;LINE TABULATION;control
12735 000B;VERTICAL TABULATION;control
12736 000B;VT;abbreviation
12737 000C;FORM FEED;control
12738 000C;FORM FEED (FF);control
12739 000C;FF;abbreviation
12740 000D;CARRIAGE RETURN;control
12741 000D;CARRIAGE RETURN (CR);control
12742 000D;CR;abbreviation
12743 000E;SHIFT OUT;control
12744 000E;LOCKING-SHIFT ONE;control
12745 000E;SO;abbreviation
12746 000F;SHIFT IN;control
12747 000F;LOCKING-SHIFT ZERO;control
12748 000F;SI;abbreviation
12749 0010;DATA LINK ESCAPE;control
12750 0010;DLE;abbreviation
12751 0011;DEVICE CONTROL ONE;control
12752 0011;DC1;abbreviation
12753 0012;DEVICE CONTROL TWO;control
12754 0012;DC2;abbreviation
12755 0013;DEVICE CONTROL THREE;control
12756 0013;DC3;abbreviation
12757 0014;DEVICE CONTROL FOUR;control
12758 0014;DC4;abbreviation
12759 0015;NEGATIVE ACKNOWLEDGE;control
12760 0015;NAK;abbreviation
12761 0016;SYNCHRONOUS IDLE;control
12762 0016;SYN;abbreviation
12763 0017;END OF TRANSMISSION BLOCK;control
12764 0017;ETB;abbreviation
12765 0018;CANCEL;control
12766 0018;CAN;abbreviation
12767 0019;END OF MEDIUM;control
12768 0019;EOM;abbreviation
12769 001A;SUBSTITUTE;control
12770 001A;SUB;abbreviation
12771 001B;ESCAPE;control
12772 001B;ESC;abbreviation
12773 001C;INFORMATION SEPARATOR FOUR;control
12774 001C;FILE SEPARATOR;control
12775 001C;FS;abbreviation
12776 001D;INFORMATION SEPARATOR THREE;control
12777 001D;GROUP SEPARATOR;control
12778 001D;GS;abbreviation
12779 001E;INFORMATION SEPARATOR TWO;control
12780 001E;RECORD SEPARATOR;control
12781 001E;RS;abbreviation
12782 001F;INFORMATION SEPARATOR ONE;control
12783 001F;UNIT SEPARATOR;control
12784 001F;US;abbreviation
12785 0020;SP;abbreviation
12786 007F;DELETE;control
12787 007F;DEL;abbreviation
12788 0080;PADDING CHARACTER;figment
12789 0080;PAD;abbreviation
12790 0081;HIGH OCTET PRESET;figment
12791 0081;HOP;abbreviation
12792 0082;BREAK PERMITTED HERE;control
12793 0082;BPH;abbreviation
12794 0083;NO BREAK HERE;control
12795 0083;NBH;abbreviation
12797 0084;IND;abbreviation
12798 0085;NEXT LINE;control
12799 0085;NEXT LINE (NEL);control
12800 0085;NEL;abbreviation
12801 0086;START OF SELECTED AREA;control
12802 0086;SSA;abbreviation
12803 0087;END OF SELECTED AREA;control
12804 0087;ESA;abbreviation
12805 0088;CHARACTER TABULATION SET;control
12806 0088;HORIZONTAL TABULATION SET;control
12807 0088;HTS;abbreviation
12808 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12809 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12810 0089;HTJ;abbreviation
12811 008A;LINE TABULATION SET;control
12812 008A;VERTICAL TABULATION SET;control
12813 008A;VTS;abbreviation
12814 008B;PARTIAL LINE FORWARD;control
12815 008B;PARTIAL LINE DOWN;control
12816 008B;PLD;abbreviation
12817 008C;PARTIAL LINE BACKWARD;control
12818 008C;PARTIAL LINE UP;control
12819 008C;PLU;abbreviation
12820 008D;REVERSE LINE FEED;control
12821 008D;REVERSE INDEX;control
12822 008D;RI;abbreviation
12823 008E;SINGLE SHIFT TWO;control
12824 008E;SINGLE-SHIFT-2;control
12825 008E;SS2;abbreviation
12826 008F;SINGLE SHIFT THREE;control
12827 008F;SINGLE-SHIFT-3;control
12828 008F;SS3;abbreviation
12829 0090;DEVICE CONTROL STRING;control
12830 0090;DCS;abbreviation
12831 0091;PRIVATE USE ONE;control
12832 0091;PRIVATE USE-1;control
12833 0091;PU1;abbreviation
12834 0092;PRIVATE USE TWO;control
12835 0092;PRIVATE USE-2;control
12836 0092;PU2;abbreviation
12837 0093;SET TRANSMIT STATE;control
12838 0093;STS;abbreviation
12839 0094;CANCEL CHARACTER;control
12840 0094;CCH;abbreviation
12841 0095;MESSAGE WAITING;control
12842 0095;MW;abbreviation
12843 0096;START OF GUARDED AREA;control
12844 0096;START OF PROTECTED AREA;control
12845 0096;SPA;abbreviation
12846 0097;END OF GUARDED AREA;control
12847 0097;END OF PROTECTED AREA;control
12848 0097;EPA;abbreviation
12849 0098;START OF STRING;control
12850 0098;SOS;abbreviation
12851 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12852 0099;SGC;abbreviation
12853 009A;SINGLE CHARACTER INTRODUCER;control
12854 009A;SCI;abbreviation
12855 009B;CONTROL SEQUENCE INTRODUCER;control
12856 009B;CSI;abbreviation
12857 009C;STRING TERMINATOR;control
12858 009C;ST;abbreviation
12859 009D;OPERATING SYSTEM COMMAND;control
12860 009D;OSC;abbreviation
12861 009E;PRIVACY MESSAGE;control
12862 009E;PM;abbreviation
12863 009F;APPLICATION PROGRAM COMMAND;control
12864 009F;APC;abbreviation
12865 00A0;NBSP;abbreviation
12866 00AD;SHY;abbreviation
12867 200B;ZWSP;abbreviation
12868 200C;ZWNJ;abbreviation
12869 200D;ZWJ;abbreviation
12870 200E;LRM;abbreviation
12871 200F;RLM;abbreviation
12872 202A;LRE;abbreviation
12873 202B;RLE;abbreviation
12874 202C;PDF;abbreviation
12875 202D;LRO;abbreviation
12876 202E;RLO;abbreviation
12877 FEFF;BYTE ORDER MARK;alternate
12878 FEFF;BOM;abbreviation
12879 FEFF;ZWNBSP;abbreviation
12882 if ($v_version ge v3.0.0) {
12883 push @return, split /\n/, <<'END';
12884 180B; FVS1; abbreviation
12885 180C; FVS2; abbreviation
12886 180D; FVS3; abbreviation
12887 180E; MVS; abbreviation
12888 202F; NNBSP; abbreviation
12892 if ($v_version ge v3.2.0) {
12893 push @return, split /\n/, <<'END';
12894 034F; CGJ; abbreviation
12895 205F; MMSP; abbreviation
12896 2060; WJ; abbreviation
12899 my $cp = 0xFE00 - 1;
12900 for my $i (1..16) {
12901 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12904 if ($v_version ge v4.0.0) { # Add in VS17..VS256
12905 my $cp = 0xE0100 - 17;
12906 for my $i (17..256) {
12907 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12911 # ALERT did not come along until 6.0, at which point it became preferred
12912 # over BELL, and was never in the Unicode_1_Name field. For the same
12913 # reasons, that the other names are made known to all releases by this
12914 # function, we make ALERT known too. By inserting it
12915 # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12916 my $alert = '0007; ALERT; control';
12917 if ($v_version lt v6.0.0) {
12918 push @return, $alert;
12921 unshift @return, $alert;
12927 sub filter_later_version_name_alias_line {
12929 # This file has an extra entry per line for the alias type. This is
12930 # handled by creating a compound entry: "$alias: $type"; First, split
12931 # the line into components.
12932 my ($range, $alias, $type, @remainder)
12933 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12935 # This file contains multiple entries for some components, so tell the
12936 # downstream code to allow this in our internal tables; the
12937 # $MULTIPLE_AFTER preserves the input ordering.
12938 $_ = join ";", $range, $CMD_DELIM
12948 sub filter_early_version_name_alias_line {
12950 # Early versions did not have the trailing alias type field; implicitly it
12951 # was 'correction'. But our synthetic lines we add in this program do
12952 # have it, so test for the type field.
12953 $_ .= "; correction" if $_ !~ /;.*;/;
12955 filter_later_version_name_alias_line;
12959 sub finish_Unicode() {
12960 # This routine should be called after all the Unicode files have been read
12962 # 1) Creates properties that are missing from the version of Unicode being
12963 # compiled, and which, for whatever reason, are needed for the Perl
12964 # core to function properly. These are minimally populated as
12966 # 2) Adds the mappings for code points missing from the files which have
12967 # defaults specified for them.
12968 # 3) At this this point all mappings are known, so it computes the type of
12969 # each property whose type hasn't been determined yet.
12970 # 4) Calculates all the regular expression match tables based on the
12972 # 5) Calculates and adds the tables which are defined by Unicode, but
12973 # which aren't derived by them, and certain derived tables that Perl
12976 # Folding information was introduced later into Unicode data. To get
12977 # Perl's case ignore (/i) to work at all in releases that don't have
12978 # folding, use the best available alternative, which is lower casing.
12979 my $fold = property_ref('Case_Folding');
12980 if ($fold->is_empty) {
12981 $fold->initialize(property_ref('Lowercase_Mapping'));
12982 $fold->add_note(join_lines(<<END
12983 WARNING: This table uses lower case as a substitute for missing fold
12989 # Multiple-character mapping was introduced later into Unicode data, so it
12990 # is by default the simple version. If to output the simple versions and
12991 # not present, just use the regular (which in these Unicode versions is
12992 # the simple as well).
12993 foreach my $map (qw { Uppercase_Mapping
12999 my $comment = <<END;
13001 Note that although the Perl core uses this file, it has the standard values
13002 for code points from U+0000 to U+00FF compiled in, so changing this table will
13003 not change the core's behavior with respect to these code points. Use
13004 Unicode::Casing to override this table.
13006 if ($map eq 'Case_Folding') {
13008 (/i regex matching is not overridable except by using a custom regex engine)
13011 property_ref($map)->add_comment(join_lines($comment));
13012 my $simple = property_ref("Simple_$map");
13013 next if ! $simple->is_empty;
13014 if ($simple->to_output_map) {
13015 $simple->initialize(property_ref($map));
13018 property_ref($map)->set_proxy_for($simple->name);
13022 # For each property, fill in any missing mappings, and calculate the re
13023 # match tables. If a property has more than one missing mapping, the
13024 # default is a reference to a data structure, and requires data from other
13025 # properties to resolve. The sort is used to cause these to be processed
13026 # last, after all the other properties have been calculated.
13027 # (Fortunately, the missing properties so far don't depend on each other.)
13028 foreach my $property
13029 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13032 # $perl has been defined, but isn't one of the Unicode properties that
13033 # need to be finished up.
13034 next if $property == $perl;
13036 # Nor do we need to do anything with properties that aren't going to
13038 next if $property->fate == $SUPPRESSED;
13040 # Handle the properties that have more than one possible default
13041 if (ref $property->default_map) {
13042 my $default_map = $property->default_map;
13044 # These properties have stored in the default_map:
13046 # 1) A default map which applies to all code points in a
13048 # 2) an expression which will evaluate to the list of code
13049 # points in that class
13051 # 3) the default map which applies to every other missing code
13054 # Go through each list.
13055 while (my ($default, $eval) = $default_map->get_next_defaults) {
13057 # Get the class list, and intersect it with all the so-far
13058 # unspecified code points yielding all the code points
13059 # in the class that haven't been specified.
13060 my $list = eval $eval;
13062 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13066 # Narrow down the list to just those code points we don't have
13068 $list = $list & $property->inverse_list;
13070 # Add mappings to the property for each code point in the list
13071 foreach my $range ($list->ranges) {
13072 $property->add_map($range->start, $range->end, $default,
13073 Replace => $CROAK);
13077 # All remaining code points have the other mapping. Set that up
13078 # so the normal single-default mapping code will work on them
13079 $property->set_default_map($default_map->other_default);
13081 # And fall through to do that
13084 # We should have enough data now to compute the type of the property.
13085 my $property_name = $property->name;
13086 $property->compute_type;
13087 my $property_type = $property->type;
13089 next if ! $property->to_create_match_tables;
13091 # Here want to create match tables for this property
13093 # The Unicode db always (so far, and they claim into the future) have
13094 # the default for missing entries in binary properties be 'N' (unless
13095 # there is a '@missing' line that specifies otherwise)
13096 if (! defined $property->default_map) {
13097 if ($property_type == $BINARY) {
13098 $property->set_default_map('N');
13100 elsif ($property_type == $ENUM) {
13101 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one");
13102 $property->set_default_map('XXX This makes sure there is a default map');
13106 # Add any remaining code points to the mapping, using the default for
13107 # missing code points.
13109 if (defined (my $default_map = $property->default_map)) {
13111 # Make sure there is a match table for the default
13112 if (! defined ($default_table = $property->table($default_map))) {
13113 $default_table = $property->add_match_table($default_map);
13116 # And, if the property is binary, the default table will just
13117 # be the complement of the other table.
13118 if ($property_type == $BINARY) {
13119 my $non_default_table;
13121 # Find the non-default table.
13122 for my $table ($property->tables) {
13123 next if $table == $default_table;
13124 $non_default_table = $table;
13126 $default_table->set_complement($non_default_table);
13130 # This fills in any missing values with the default. It's not
13131 # necessary to do this with binary properties, as the default
13132 # is defined completely in terms of the Y table.
13133 $property->add_map(0, $MAX_WORKING_CODEPOINT,
13134 $default_map, Replace => $NO);
13138 # Have all we need to populate the match tables.
13139 my $maps_should_be_defined = $property->pre_declared_maps;
13140 foreach my $range ($property->ranges) {
13141 my $map = $range->value;
13142 my $table = $property->table($map);
13143 if (! defined $table) {
13145 # Integral and rational property values are not necessarily
13146 # defined in PropValueAliases, but whether all the other ones
13147 # should be depends on the property.
13148 if ($maps_should_be_defined
13149 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13151 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
13153 $table = $property->add_match_table($map);
13156 next if $table->complement != 0; # Don't need to populate these
13157 $table->add_range($range->start, $range->end);
13160 # A forced binary property has additional true/false tables which
13161 # should have been set up when it was forced into binary. The false
13162 # table matches exactly the same set as the property's default table.
13163 # The true table matches the complement of that. The false table is
13164 # not the same as an additional set of aliases on top of the default
13165 # table, so use 'set_equivalent_to'. If it were implemented as
13166 # additional aliases, various things would have to be adjusted, but
13167 # especially, if the user wants to get a list of names for the table
13168 # using Unicode::UCD::prop_value_aliases(), s/he should get a
13169 # different set depending on whether they want the default table or
13171 if ($property_type == $FORCED_BINARY) {
13172 $property->table('N')->set_equivalent_to($default_table,
13174 $property->table('Y')->set_complement($default_table);
13177 # For Perl 5.6 compatibility, all properties matchable in regexes can
13178 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
13179 # But warn if this creates a conflict with a (new) Unicode property
13180 # name, although it appears that Unicode has made a decision never to
13181 # begin a property name with 'Is_', so this shouldn't happen.
13182 foreach my $alias ($property->aliases) {
13183 my $Is_name = 'Is_' . $alias->name;
13184 if (defined (my $pre_existing = property_ref($Is_name))) {
13185 Carp::my_carp(<<END
13186 There is already an alias named $Is_name (from " . $pre_existing . "), so
13187 creating one for $property won't work. This is bad news. If it is not too
13188 late, get Unicode to back off. Otherwise go back to the old scheme (findable
13189 from the git blame log for this area of the code that suppressed individual
13190 aliases that conflict with the new Unicode names. Proceeding anyway.
13194 } # End of loop through aliases for this property
13195 } # End of loop through all Unicode properties.
13197 # Fill in the mappings that Unicode doesn't completely furnish. First the
13198 # single letter major general categories. If Unicode were to start
13199 # delivering the values, this would be redundant, but better that than to
13200 # try to figure out if should skip and not get it right. Ths could happen
13201 # if a new major category were to be introduced, and the hard-coded test
13202 # wouldn't know about it.
13203 # This routine depends on the standard names for the general categories
13204 # being what it thinks they are, like 'Cn'. The major categories are the
13205 # union of all the general category tables which have the same first
13206 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13207 foreach my $minor_table ($gc->tables) {
13208 my $minor_name = $minor_table->name;
13209 next if length $minor_name == 1;
13210 if (length $minor_name != 2) {
13211 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
13215 my $major_name = uc(substr($minor_name, 0, 1));
13216 my $major_table = $gc->table($major_name);
13217 $major_table += $minor_table;
13220 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
13221 # defines it as LC)
13222 my $LC = $gc->table('LC');
13223 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
13224 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
13227 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13228 # deliver the correct values in it
13229 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13231 # Lt not in release 1.
13232 if (defined $gc->table('Lt')) {
13233 $LC += $gc->table('Lt');
13234 $gc->table('Lt')->set_caseless_equivalent($LC);
13237 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13239 $gc->table('Ll')->set_caseless_equivalent($LC);
13240 $gc->table('Lu')->set_caseless_equivalent($LC);
13242 my $Cs = $gc->table('Cs');
13244 # Create digit and case fold tables with the original file names for
13245 # backwards compatibility with applications that read them directly.
13246 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13248 File => 'Digit', # Trad. location
13249 Directory => $map_directory,
13251 Replacement_Property => "Perl_Decimal_Digit",
13252 Initialize => property_ref('Perl_Decimal_Digit'),
13254 $Digit->add_comment(join_lines(<<END
13255 This file gives the mapping of all code points which represent a single
13256 decimal digit [0-9] to their respective digits. For example, the code point
13257 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
13258 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13263 Property->new('Legacy_Case_Folding',
13265 Directory => $map_directory,
13266 Default_Map => $CODE_POINT,
13268 Replacement_Property => "Case_Folding",
13269 Format => $HEX_FORMAT,
13270 Initialize => property_ref('cf'),
13273 # The Script_Extensions property started out as a clone of the Script
13274 # property. But processing its data file caused some elements to be
13275 # replaced with different data. (These elements were for the Common and
13276 # Inherited properties.) This data is a qw() list of all the scripts that
13277 # the code points in the given range are in. An example line is:
13278 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
13280 # The code above has created a new match table named "Arab Syrc Thaa"
13281 # which contains 060C. (The cloned table started out with this code point
13282 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
13283 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
13284 # match table. This is repeated for all these tables and ranges. The map
13285 # data is retained in the map table for reference, but the spurious match
13286 # tables are deleted.
13288 my $scx = property_ref("Script_Extensions");
13289 if (defined $scx) {
13290 foreach my $table ($scx->tables) {
13291 next unless $table->name =~ /\s/; # All the new and only the new
13292 # tables have a space in their
13294 my @scripts = split /\s+/, $table->name;
13295 foreach my $script (@scripts) {
13296 my $script_table = $scx->table($script);
13297 $script_table += $table;
13299 $scx->delete_match_table($table);
13306 sub pre_3_dot_1_Nl () {
13308 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13309 # is when Unicode's became fully usable. These code points were
13310 # determined by inspection and experimentation. gc=nl is important for
13311 # certain Perl-extension properties that should be available in all
13314 my $Nl = Range_List->new();
13315 if (defined (my $official = $gc->table('Nl'))) {
13319 $Nl->add_range(0x2160, 0x2182);
13320 $Nl->add_range(0x3007, 0x3007);
13321 $Nl->add_range(0x3021, 0x3029);
13323 $Nl->add_range(0xFE20, 0xFE23);
13324 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13329 sub compile_perl() {
13330 # Create perl-defined tables. Almost all are part of the pseudo-property
13331 # named 'perl' internally to this program. Many of these are recommended
13332 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
13333 # on those found there.
13334 # Almost all of these are equivalent to some Unicode property.
13335 # A number of these properties have equivalents restricted to the ASCII
13336 # range, with their names prefaced by 'Posix', to signify that these match
13337 # what the Posix standard says they should match. A couple are
13338 # effectively this, but the name doesn't have 'Posix' in it because there
13339 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
13340 # to the full Unicode range, by our guesses as to what is appropriate.
13342 # 'All' is all code points. As an error check, instead of just setting it
13343 # to be that, construct it to be the union of all the major categories
13344 $All = $perl->add_match_table('All',
13346 => "All code points, including those above Unicode. Same as qr/./s",
13349 foreach my $major_table ($gc->tables) {
13351 # Major categories are the ones with single letter names.
13352 next if length($major_table->name) != 1;
13354 $All += $major_table;
13357 if ($All->max != $MAX_WORKING_CODEPOINT) {
13358 Carp::my_carp_bug("Generated highest code point ("
13359 . sprintf("%X", $All->max)
13360 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
13362 if ($All->range_count != 1 || $All->min != 0) {
13363 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
13366 my $Any = $perl->add_match_table('Any',
13367 Description => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]",
13369 $Any->add_range(0, 0x10FFFF);
13370 $Any->add_alias('Unicode');
13372 # Assigned is the opposite of gc=unassigned
13373 my $Assigned = $perl->add_match_table('Assigned',
13374 Description => "All assigned code points",
13375 Initialize => ~ $gc->table('Unassigned'),
13378 # Our internal-only property should be treated as more than just a
13379 # synonym; grandfather it in to the pod.
13380 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
13381 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
13382 ->set_equivalent_to(property_ref('ccc')->table('Above'),
13385 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
13386 if (defined $block) { # This is equivalent to the block if have it.
13387 my $Unicode_ASCII = $block->table('Basic_Latin');
13388 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
13389 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
13393 # Very early releases didn't have blocks, so initialize ASCII ourselves if
13395 if ($ASCII->is_empty) {
13396 if (! NON_ASCII_PLATFORM) {
13397 $ASCII->add_range(0, 127);
13400 for my $i (0 .. 127) {
13401 $ASCII->add_range(utf8::unicode_to_native($i),
13402 utf8::unicode_to_native($i));
13407 # Get the best available case definitions. Early Unicode versions didn't
13408 # have Uppercase and Lowercase defined, so use the general category
13409 # instead for them, modified by hard-coding in the code points each is
13411 my $Lower = $perl->add_match_table('Lower');
13412 my $Unicode_Lower = property_ref('Lowercase');
13413 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
13414 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
13418 $Lower += $gc->table('Lowercase_Letter');
13420 # There are quite a few code points in Lower, that aren't in gc=lc,
13421 # and not all are in all releases.
13422 foreach my $code_point ( utf8::unicode_to_native(0xAA),
13423 utf8::unicode_to_native(0xBA),
13441 # Don't include the code point unless it is assigned in this
13443 my $category = $gc->value_of(hex $code_point);
13444 next if ! defined $category || $category eq 'Cn';
13446 $Lower += $code_point;
13449 $Lower->add_alias('XPosixLower');
13450 my $Posix_Lower = $perl->add_match_table("PosixLower",
13451 Description => "[a-z]",
13452 Initialize => $Lower & $ASCII,
13455 my $Upper = $perl->add_match_table('Upper');
13456 my $Unicode_Upper = property_ref('Uppercase');
13457 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
13458 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
13462 # Unlike Lower, there are only two ranges in Upper that aren't in
13463 # gc=Lu, and all code points were assigned in all releases.
13464 $Upper += $gc->table('Uppercase_Letter');
13465 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
13466 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
13468 $Upper->add_alias('XPosixUpper');
13469 my $Posix_Upper = $perl->add_match_table("PosixUpper",
13470 Description => "[A-Z]",
13471 Initialize => $Upper & $ASCII,
13474 # Earliest releases didn't have title case. Initialize it to empty if not
13475 # otherwise present
13476 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
13477 Description => '(= \p{Gc=Lt})');
13478 my $lt = $gc->table('Lt');
13480 # Earlier versions of mktables had this related to $lt since they have
13481 # identical code points, but their caseless equivalents are not the same,
13482 # one being 'Cased' and the other being 'LC', and so now must be kept as
13483 # separate entities.
13488 push @tables_that_may_be_empty, $Title->complete_name;
13491 my $Unicode_Cased = property_ref('Cased');
13492 if (defined $Unicode_Cased) {
13493 my $yes = $Unicode_Cased->table('Y');
13494 my $no = $Unicode_Cased->table('N');
13495 $Title->set_caseless_equivalent($yes);
13496 if (defined $Unicode_Upper) {
13497 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
13498 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
13500 $Upper->set_caseless_equivalent($yes);
13501 if (defined $Unicode_Lower) {
13502 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
13503 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
13505 $Lower->set_caseless_equivalent($yes);
13508 # If this Unicode version doesn't have Cased, set up the Perl
13509 # extension from first principles. From Unicode 5.1: Definition D120:
13510 # A character C is defined to be cased if and only if C has the
13511 # Lowercase or Uppercase property or has a General_Category value of
13512 # Titlecase_Letter.
13513 my $cased = $perl->add_match_table('Cased',
13514 Initialize => $Lower + $Upper + $Title,
13515 Description => 'Uppercase or Lowercase or Titlecase',
13517 # $notcased is purely for the caseless equivalents below
13518 my $notcased = $perl->add_match_table('_Not_Cased',
13519 Initialize => ~ $cased,
13520 Fate => $INTERNAL_ONLY,
13521 Description => 'All not-cased code points');
13522 $Title->set_caseless_equivalent($cased);
13523 if (defined $Unicode_Upper) {
13524 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
13525 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
13527 $Upper->set_caseless_equivalent($cased);
13528 if (defined $Unicode_Lower) {
13529 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
13530 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
13532 $Lower->set_caseless_equivalent($cased);
13535 # Similarly, set up our own Case_Ignorable property if this Unicode
13536 # version doesn't have it. From Unicode 5.1: Definition D121: A character
13537 # C is defined to be case-ignorable if C has the value MidLetter or the
13538 # value MidNumLet for the Word_Break property or its General_Category is
13539 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13540 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13542 # Perl has long had an internal-only alias for this property; grandfather
13543 # it in to the pod, but discourage its use.
13544 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13546 Fate => $INTERNAL_ONLY,
13547 Status => $DISCOURAGED);
13548 my $case_ignorable = property_ref('Case_Ignorable');
13549 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13550 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13555 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13557 # The following three properties are not in early releases
13558 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13559 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13560 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13562 # For versions 4.1 - 5.0, there is no MidNumLet property, and
13563 # correspondingly the case-ignorable definition lacks that one. For
13564 # 4.0, it appears that it was meant to be the same definition, but was
13565 # inadvertently omitted from the standard's text, so add it if the
13566 # property actually is there
13567 my $wb = property_ref('Word_Break');
13569 my $midlet = $wb->table('MidLetter');
13570 $perl_case_ignorable += $midlet if defined $midlet;
13571 my $midnumlet = $wb->table('MidNumLet');
13572 $perl_case_ignorable += $midnumlet if defined $midnumlet;
13576 # In earlier versions of the standard, instead of the above two
13577 # properties , just the following characters were used:
13578 $perl_case_ignorable +=
13580 + utf8::unicode_to_native(0xAD) # SOFT HYPHEN (SHY)
13581 + 0x2019; # RIGHT SINGLE QUOTATION MARK
13585 # The remaining perl defined tables are mostly based on Unicode TR 18,
13586 # "Annex C: Compatibility Properties". All of these have two versions,
13587 # one whose name generally begins with Posix that is posix-compliant, and
13588 # one that matches Unicode characters beyond the Posix, ASCII range
13590 my $Alpha = $perl->add_match_table('Alpha');
13592 # Alphabetic was not present in early releases
13593 my $Alphabetic = property_ref('Alphabetic');
13594 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13595 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13599 # The Alphabetic property doesn't exist for early releases, so
13600 # generate it. The actual definition, in 5.2 terms is:
13602 # gc=L + gc=Nl + Other_Alphabetic
13604 # Other_Alphabetic is also not defined in these early releases, but it
13605 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13606 # those last two as well, then subtract the relatively few of them that
13607 # shouldn't have been added. (The gc=So range is the circled capital
13608 # Latin characters. Early releases mistakenly didn't also include the
13609 # lower-case versions of these characters, and so we don't either, to
13610 # maintain consistency with those releases that first had this
13612 $Alpha->initialize($gc->table('Letter')
13617 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
13618 foreach my $range ( [ 0x0300, 0x0344 ],
13619 [ 0x0346, 0x034E ],
13620 [ 0x0360, 0x0362 ],
13621 [ 0x0483, 0x0486 ],
13622 [ 0x0591, 0x05AF ],
13623 [ 0x06DF, 0x06E0 ],
13624 [ 0x06EA, 0x06EC ],
13625 [ 0x0740, 0x074A ],
13628 [ 0x0951, 0x0954 ],
13642 [ 0x0E47, 0x0E4C ],
13644 [ 0x0EC8, 0x0ECC ],
13645 [ 0x0F18, 0x0F19 ],
13649 [ 0x0F3E, 0x0F3F ],
13650 [ 0x0F82, 0x0F84 ],
13651 [ 0x0F86, 0x0F87 ],
13655 [ 0x17C9, 0x17D3 ],
13656 [ 0x20D0, 0x20DC ],
13658 [ 0x302A, 0x302F ],
13659 [ 0x3099, 0x309A ],
13660 [ 0xFE20, 0xFE23 ],
13661 [ 0x1D165, 0x1D169 ],
13662 [ 0x1D16D, 0x1D172 ],
13663 [ 0x1D17B, 0x1D182 ],
13664 [ 0x1D185, 0x1D18B ],
13665 [ 0x1D1AA, 0x1D1AD ],
13668 $Alpha->delete_range($range->[0], $range->[1]);
13671 $Alpha->delete_range($range, $range);
13674 $Alpha->add_description('Alphabetic');
13675 $Alpha->add_alias('Alphabetic');
13677 $Alpha->add_alias('XPosixAlpha');
13678 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13679 Description => "[A-Za-z]",
13680 Initialize => $Alpha & $ASCII,
13682 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13683 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13685 my $Alnum = $perl->add_match_table('Alnum',
13686 Description => 'Alphabetic and (decimal) Numeric',
13687 Initialize => $Alpha + $gc->table('Decimal_Number'),
13689 $Alnum->add_alias('XPosixAlnum');
13690 $perl->add_match_table("PosixAlnum",
13691 Description => "[A-Za-z0-9]",
13692 Initialize => $Alnum & $ASCII,
13695 my $Word = $perl->add_match_table('Word',
13696 Description => '\w, including beyond ASCII;'
13697 . ' = \p{Alnum} + \pM + \p{Pc}',
13698 Initialize => $Alnum + $gc->table('Mark'),
13700 $Word->add_alias('XPosixWord');
13701 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13706 $Word += ord('_'); # Make sure this is a $Word
13708 my $JC = property_ref('Join_Control'); # Wasn't in release 1
13710 $Word += $JC->table('Y');
13713 $Word += 0x200C + 0x200D;
13716 # This is a Perl extension, so the name doesn't begin with Posix.
13717 my $PerlWord = $perl->add_match_table('PerlWord',
13718 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13719 Initialize => $Word & $ASCII,
13721 $PerlWord->add_alias('PosixWord');
13723 my $Blank = $perl->add_match_table('Blank',
13724 Description => '\h, Horizontal white space',
13726 # 200B is Zero Width Space which is for line
13727 # break control, and was listed as
13728 # Space_Separator in early releases
13729 Initialize => $gc->table('Space_Separator')
13733 $Blank->add_alias('HorizSpace'); # Another name for it.
13734 $Blank->add_alias('XPosixBlank');
13735 $perl->add_match_table("PosixBlank",
13736 Description => "\\t and ' '",
13737 Initialize => $Blank & $ASCII,
13740 my $VertSpace = $perl->add_match_table('VertSpace',
13741 Description => '\v',
13743 $gc->table('Line_Separator')
13744 + $gc->table('Paragraph_Separator')
13745 + utf8::unicode_to_native(0x0A) # LINE FEED
13746 + utf8::unicode_to_native(0x0B) # VERTICAL TAB
13748 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
13749 + utf8::unicode_to_native(0x85) # NEL
13751 # No Posix equivalent for vertical space
13753 my $Space = $perl->add_match_table('Space',
13754 Description => '\s including beyond ASCII and vertical tab',
13755 Initialize => $Blank + $VertSpace,
13757 $Space->add_alias('XPosixSpace');
13758 my $posix_space = $perl->add_match_table("PosixSpace",
13759 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
13760 Initialize => $Space & $ASCII,
13763 # Perl's traditional space doesn't include Vertical Tab prior to v5.18
13764 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13765 Description => '\s, including beyond ASCII',
13766 Initialize => $Space,
13767 #Initialize => $Space
13768 # - utf8::unicode_to_native(0x0B]
13770 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
13771 my $PerlSpace = $perl->add_match_table('PerlSpace',
13772 Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13773 Initialize => $XPerlSpace & $ASCII,
13777 my $Cntrl = $perl->add_match_table('Cntrl',
13778 Description => 'Control characters');
13779 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13780 $Cntrl->add_alias('XPosixCntrl');
13781 $perl->add_match_table("PosixCntrl",
13782 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13783 Initialize => $Cntrl & $ASCII,
13786 # $controls is a temporary used to construct Graph.
13787 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13788 + $gc->table('Control'));
13789 # Cs not in release 1
13790 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13792 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
13793 my $Graph = $perl->add_match_table('Graph',
13794 Description => 'Characters that are graphical',
13795 Initialize => ~ ($Space + $controls),
13797 $Graph->add_alias('XPosixGraph');
13798 $perl->add_match_table("PosixGraph",
13800 '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
13801 Initialize => $Graph & $ASCII,
13804 $print = $perl->add_match_table('Print',
13805 Description => 'Characters that are graphical plus space characters (but no controls)',
13806 Initialize => $Blank + $Graph - $gc->table('Control'),
13808 $print->add_alias('XPosixPrint');
13809 $perl->add_match_table("PosixPrint",
13811 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13812 Initialize => $print & $ASCII,
13815 my $Punct = $perl->add_match_table('Punct');
13816 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13818 # \p{punct} doesn't include the symbols, which posix does
13819 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13820 Description => '\p{Punct} + ASCII-range \p{Symbol}',
13821 Initialize => $gc->table('Punctuation')
13822 + ($ASCII & $gc->table('Symbol')),
13823 Perl_Extension => 1
13825 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13826 Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
13827 Initialize => $ASCII & $XPosixPunct,
13830 my $Digit = $perl->add_match_table('Digit',
13831 Description => '[0-9] + all other decimal digits');
13832 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13833 $Digit->add_alias('XPosixDigit');
13834 my $PosixDigit = $perl->add_match_table("PosixDigit",
13835 Description => '[0-9]',
13836 Initialize => $Digit & $ASCII,
13839 # Hex_Digit was not present in first release
13840 my $Xdigit = $perl->add_match_table('XDigit');
13841 $Xdigit->add_alias('XPosixXDigit');
13842 my $Hex = property_ref('Hex_Digit');
13843 if (defined $Hex && ! $Hex->is_empty) {
13844 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13847 $Xdigit->initialize([ ord('0') .. ord('9'),
13848 ord('A') .. ord('F'),
13849 ord('a') .. ord('f'),
13850 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13851 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13854 # AHex was not present in early releases
13855 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13856 my $AHex = property_ref('ASCII_Hex_Digit');
13857 if (defined $AHex && ! $AHex->is_empty) {
13858 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13861 $PosixXDigit->initialize($Xdigit & $ASCII);
13862 $PosixXDigit->add_alias('AHex');
13863 $PosixXDigit->add_alias('Ascii_Hex_Digit');
13865 $PosixXDigit->add_description('[0-9A-Fa-f]');
13867 my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
13868 Description => "Code points that particpate in some fold",
13870 my $loc_problem_folds = $perl->add_match_table(
13871 "_Perl_Problematic_Locale_Folds",
13873 "Code points that are in some way problematic under locale",
13876 # This allows regexec.c to skip some work when appropriate. Some of the
13877 # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
13878 my $loc_problem_folds_start = $perl->add_match_table(
13879 "_Perl_Problematic_Locale_Foldeds_Start",
13881 "The first character of every sequence in _Perl_Problematic_Locale_Folds",
13884 my $cf = property_ref('Case_Folding');
13886 # Every character 0-255 is problematic because what each folds to depends
13887 # on the current locale
13888 $loc_problem_folds->add_range(0, 255);
13889 $loc_problem_folds_start += $loc_problem_folds;
13891 # Also problematic are anything these fold to outside the range. Likely
13892 # forever the only thing folded to by these outside the 0-255 range is the
13893 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
13894 # completely general, which should catch any unexpected changes or errors.
13895 # We look at each code point 0-255, and add its fold (including each part
13896 # of a multi-char fold) to the list. See commit message
13897 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
13899 foreach my $range ($loc_problem_folds->ranges) {
13900 foreach my $code_point($range->start .. $range->end) {
13901 my $fold_range = $cf->containing_range($code_point);
13902 next unless defined $fold_range;
13904 my @hex_folds = split " ", $fold_range->value;
13905 my $start_cp = hex $hex_folds[0];
13906 foreach my $i (0 .. @hex_folds - 1) {
13907 my $cp = hex $hex_folds[$i];
13908 next unless $cp > 255; # Already have the < 256 ones
13910 $loc_problem_folds->add_range($cp, $cp);
13911 $loc_problem_folds_start->add_range($start_cp, $start_cp);
13916 my $folds_to_multi_char = $perl->add_match_table(
13917 "_Perl_Folds_To_Multi_Char",
13919 "Code points whose fold is a string of more than one character",
13922 # Look through all the known folds to populate these tables.
13923 foreach my $range ($cf->ranges) {
13924 my $start = $range->start;
13925 my $end = $range->end;
13926 $any_folds->add_range($start, $end);
13928 my @hex_folds = split " ", $range->value;
13929 if (@hex_folds > 1) { # Is multi-char fold
13930 $folds_to_multi_char->add_range($start, $end);
13933 my $found_locale_problematic = 0;
13935 # Look at each of the folded-to characters...
13936 foreach my $i (0 .. @hex_folds - 1) {
13937 my $cp = hex $hex_folds[$i];
13938 $any_folds->add_range($cp, $cp);
13940 # The fold is problematic if any of the folded-to characters is
13941 # already considered problematic.
13942 if ($loc_problem_folds->contains($cp)) {
13943 $loc_problem_folds->add_range($start, $end);
13944 $found_locale_problematic = 1;
13948 # If this is a problematic fold, add to the start chars the
13949 # folding-from characters and first folded-to character.
13950 if ($found_locale_problematic) {
13951 $loc_problem_folds_start->add_range($start, $end);
13952 my $cp = hex $hex_folds[0];
13953 $loc_problem_folds_start->add_range($cp, $cp);
13957 my $dt = property_ref('Decomposition_Type');
13958 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13959 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13960 Perl_Extension => 1,
13961 Note => 'Union of all non-canonical decompositions',
13964 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13965 # than SD appeared, construct it ourselves, based on the first release SD
13966 # was in. A pod entry is grandfathered in for it
13967 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13968 Perl_Extension => 1,
13969 Fate => $INTERNAL_ONLY,
13970 Status => $DISCOURAGED);
13971 my $soft_dotted = property_ref('Soft_Dotted');
13972 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13973 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13977 # This list came from 3.2 Soft_Dotted; all of these code points are in
13979 $CanonDCIJ->initialize([ ord('i'),
13988 $CanonDCIJ = $CanonDCIJ & $Assigned;
13991 # For backward compatibility, Perl has its own definition for IDStart.
13992 # It is regular XID_Start plus the underscore, but all characters must be
13993 # Word characters as well
13994 my $XID_Start = property_ref('XID_Start');
13995 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13996 Perl_Extension => 1,
13997 Fate => $INTERNAL_ONLY,
13998 Initialize => ord('_')
14000 if (defined $XID_Start
14001 || defined ($XID_Start = property_ref('ID_Start')))
14003 $perl_xids += $XID_Start->table('Y');
14006 # For Unicode versions that don't have the property, construct our own
14007 # from first principles. The actual definition is:
14009 # + letter numbers (Nl)
14011 # - Pattern_White_Space
14012 # + stability extensions
14013 # - NKFC modifications
14015 # What we do in the code below is to include the identical code points
14016 # that are in the first release that had Unicode's version of this
14017 # property, essentially extrapolating backwards. There were no
14018 # stability extensions until v4.1, so none are included; likewise in
14019 # no Unicode version so far do subtracting PatSyn and PatWS make any
14020 # difference, so those also are ignored.
14021 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14023 # We do subtract the NFKC modifications that are in the first version
14024 # that had this property. We don't bother to test if they are in the
14025 # version in question, because if they aren't, the operation is a
14026 # no-op. The NKFC modifications are discussed in
14027 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14028 foreach my $range ( 0x037A,
14031 [ 0xFC5E, 0xFC63 ],
14032 [ 0xFDFA, 0xFE70 ],
14033 [ 0xFE72, 0xFE76 ],
14038 [ 0xFF9E, 0xFF9F ],
14041 $perl_xids->delete_range($range->[0], $range->[1]);
14044 $perl_xids->delete_range($range, $range);
14049 $perl_xids &= $Word;
14051 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14052 Perl_Extension => 1,
14053 Fate => $INTERNAL_ONLY);
14054 my $XIDC = property_ref('XID_Continue');
14056 || defined ($XIDC = property_ref('ID_Continue')))
14058 $perl_xidc += $XIDC->table('Y');
14061 # Similarly, we construct our own XIDC if necessary for early Unicode
14062 # versions. The definition is:
14063 # everything in XIDS
14069 # - Pattern_White_Space
14070 # + stability extensions
14071 # - NFKC modifications
14073 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14074 # and stability extensions. There is a somewhat different set of NFKC
14075 # mods to remove (and add in this case). The ones below make this
14076 # have identical code points as in the first release that defined it.
14077 $perl_xidc += $perl_xids
14082 + utf8::unicode_to_native(0xB7)
14084 if (defined (my $pc = $gc->table('Pc'))) {
14087 else { # 1.1.5 didn't have Pc, but these should have been in it
14088 $perl_xidc += 0xFF3F;
14089 $perl_xidc->add_range(0x203F, 0x2040);
14090 $perl_xidc->add_range(0xFE33, 0xFE34);
14091 $perl_xidc->add_range(0xFE4D, 0xFE4F);
14094 # Subtract the NFKC mods
14095 foreach my $range ( 0x037A,
14096 [ 0xFC5E, 0xFC63 ],
14097 [ 0xFDFA, 0xFE1F ],
14099 [ 0xFE72, 0xFE76 ],
14106 $perl_xidc->delete_range($range->[0], $range->[1]);
14109 $perl_xidc->delete_range($range, $range);
14114 $perl_xidc &= $Word;
14116 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14117 Perl_Extension => 1,
14118 Fate => $INTERNAL_ONLY,
14119 Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14122 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14123 Perl_Extension => 1,
14124 Fate => $INTERNAL_ONLY,
14125 Initialize => $perl_xidc
14130 + utf8::unicode_to_native(0xA0) # NBSP
14133 # These two tables are for matching \X, which is based on the 'extended'
14134 # grapheme cluster, which came in 5.1; create empty ones if not already
14135 # present. The straight 'grapheme cluster' (non-extended) is used prior
14136 # to 5.1, and differs from the extended (see
14137 # http://www.unicode.org/reports/tr29/) only by these two tables, so we
14138 # get the older definition automatically when they are empty.
14139 my $gcb = property_ref('Grapheme_Cluster_Break');
14140 my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
14141 Perl_Extension => 1,
14142 Fate => $INTERNAL_ONLY);
14143 if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
14144 $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
14147 push @tables_that_may_be_empty, $perl_prepend->complete_name;
14150 # All the tables with _X_ in their names are used in defining \X handling,
14151 # and are based on the Unicode GCB property. Basically, \X matches:
14153 # | Prepend* Begin Extend*
14155 # Begin is: ( Special_Begin | ! Control )
14156 # Begin is also: ( Regular_Begin | Special_Begin )
14157 # where Regular_Begin is defined as ( ! Control - Special_Begin )
14158 # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
14159 # Extend is: ( Grapheme_Extend | Spacing_Mark )
14160 # Control is: [ GCB_Control | CR | LF ]
14161 # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
14163 foreach my $gcb_name (qw{ L V T LV LVT }) {
14165 # The perl internal extension's name is the gcb table name prepended
14167 my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
14168 Perl_Extension => 1,
14169 Fate => $INTERNAL_ONLY,
14170 Initialize => $gcb->table($gcb_name),
14172 # Version 1 had mostly different Hangul syllables that were removed
14173 # from later versions, so some of the tables may not apply.
14174 if ($v_version lt v2.0) {
14175 push @tables_that_may_be_empty, $perl_table->complete_name;
14179 # More GCB. Populate a combined hangul syllables table
14180 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
14181 Perl_Extension => 1,
14182 Fate => $INTERNAL_ONLY);
14183 $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
14184 $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
14186 my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
14187 Fate => $INTERNAL_ONLY);
14188 if ($v_version ge v6.2) {
14189 $ri += $gcb->table('RI');
14192 push @tables_that_may_be_empty, $ri->full_name;
14195 my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
14196 Perl_Extension => 1,
14197 Fate => $INTERNAL_ONLY,
14198 Initialize => $lv_lvt_v
14203 $specials_begin->add_comment(join_lines( <<END
14204 For use in \\X; matches first (perhaps only) character of potential
14205 multi-character sequences that can begin an extended grapheme cluster. They
14206 need special handling because of their complicated nature.
14209 my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
14210 Perl_Extension => 1,
14211 Fate => $INTERNAL_ONLY,
14212 Initialize => ~ $gcb->table('Control')
14214 - $gcb->table('CR')
14215 - $gcb->table('LF')
14217 $regular_begin->add_comment(join_lines( <<END
14218 For use in \\X; matches first character of anything that can begin an extended
14219 grapheme cluster, except those that require special handling.
14223 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
14224 Fate => $INTERNAL_ONLY,
14225 Initialize => $gcb->table('Extend')
14227 if (defined (my $sm = $gcb->table('SpacingMark'))) {
14230 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
14232 # End of GCB \X processing
14234 my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
14236 if (@named_sequences) {
14237 push @composition, 'Named_Sequence';
14238 foreach my $sequence (@named_sequences) {
14239 $perl_charname->add_anomalous_entry($sequence);
14243 my $alias_sentence = "";
14245 my $alias = property_ref('Name_Alias');
14246 $perl_charname->set_proxy_for('Name_Alias');
14248 # Add each entry in Name_Alias to Perl_Charnames. Where these go with
14249 # respect to any existing entry depends on the entry type. Corrections go
14250 # before said entry, as they should be returned in preference over the
14251 # existing entry. (A correction to a correction should be later in the
14252 # Name_Alias table, so it will correctly precede the erroneous correction
14253 # in Perl_Charnames.)
14255 # Abbreviations go after everything else, so they are saved temporarily in
14256 # a hash for later.
14258 # Everything else is added added afterwards, which preserves the input
14261 foreach my $range ($alias->ranges) {
14262 next if $range->value eq "";
14263 my $code_point = $range->start;
14264 if ($code_point != $range->end) {
14265 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
14267 my ($value, $type) = split ': ', $range->value;
14269 if ($type eq 'correction') {
14270 $replace_type = $MULTIPLE_BEFORE;
14272 elsif ($type eq 'abbreviation') {
14275 $abbreviations{$value} = $code_point;
14279 $replace_type = $MULTIPLE_AFTER;
14282 # Actually add; before or after current entry(ies) as determined
14285 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14287 $alias_sentence = <<END;
14288 The Name_Alias property adds duplicate code point entries that are
14289 alternatives to the original name. If an addition is a corrected
14290 name, it will be physically first in the table. The original (less correct,
14291 but still valid) name will be next; then any alternatives, in no particular
14292 order; and finally any abbreviations, again in no particular order.
14295 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
14296 # precedence before 6.1, so should be first in the file; the other names
14297 # have precedence starting in 6.1,
14298 my $before_or_after = ($v_version lt v6.1.0)
14302 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14303 my $code_point = $range->start;
14304 my $unicode_1_value = $range->value;
14305 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
14307 if ($code_point != $range->end) {
14308 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
14311 # To handle EBCDIC, we don't hard code in the code points of the
14312 # controls; instead realizing that all of them are below 256.
14313 last if $code_point > 255;
14315 # We only add in the controls.
14316 next if $gc->value_of($code_point) ne 'Cc';
14318 # We reject this Unicode1 name for later Perls, as it is used for
14319 # another code point
14320 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14322 # This won't add an exact duplicate.
14323 $perl_charname->add_duplicate($code_point, $unicode_1_value,
14324 Replace => $before_or_after);
14327 # But in this version only, the ALERT has precedence over BELL, the
14328 # Unicode_1_Name that would otherwise have precedence.
14329 if ($v_version eq v6.0.0) {
14330 $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
14333 # Now that have everything added, add in abbreviations after
14334 # everything else. Sort so results don't change between runs of this
14336 foreach my $value (sort keys %abbreviations) {
14337 $perl_charname->add_duplicate($abbreviations{$value}, $value,
14338 Replace => $MULTIPLE_AFTER);
14342 if (@composition <= 2) { # Always at least 2
14343 $comment = join " and ", @composition;
14346 $comment = join ", ", @composition[0 .. scalar @composition - 2];
14347 $comment .= ", and $composition[-1]";
14350 $perl_charname->add_comment(join_lines( <<END
14351 This file is for charnames.pm. It is the union of the $comment properties.
14352 Unicode_1_Name entries are used only for nameless code points in the Name
14355 This file doesn't include the algorithmically determinable names. For those,
14356 use 'unicore/Name.pm'
14359 property_ref('Name')->add_comment(join_lines( <<END
14360 This file doesn't include the algorithmically determinable names. For those,
14361 use 'unicore/Name.pm'
14365 # Construct the Present_In property from the Age property.
14366 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
14367 my $default_map = $age->default_map;
14368 my $in = Property->new('In',
14369 Default_Map => $default_map,
14370 Full_Name => "Present_In",
14371 Perl_Extension => 1,
14373 Initialize => $age,
14375 $in->add_comment(join_lines(<<END
14376 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
14377 same as for $age, and not for what $in really means. This is because anything
14378 defined in a given release should have multiple values: that release and all
14379 higher ones. But only one value per code point can be represented in a table
14384 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
14385 # lowest numbered (earliest) come first, with the non-numeric one
14387 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
14389 : ($b->name !~ /^[\d.]*$/)
14391 : $a->name <=> $b->name
14394 # The Present_In property is the cumulative age properties. The first
14395 # one hence is identical to the first age one.
14396 my $previous_in = $in->add_match_table($first_age->name);
14397 $previous_in->set_equivalent_to($first_age, Related => 1);
14399 my $description_start = "Code point's usage introduced in version ";
14400 $first_age->add_description($description_start . $first_age->name);
14402 # To construct the accumulated values, for each of the age tables
14403 # starting with the 2nd earliest, merge the earliest with it, to get
14404 # all those code points existing in the 2nd earliest. Repeat merging
14405 # the new 2nd earliest with the 3rd earliest to get all those existing
14406 # in the 3rd earliest, and so on.
14407 foreach my $current_age (@rest_ages) {
14408 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
14410 my $current_in = $in->add_match_table(
14411 $current_age->name,
14412 Initialize => $current_age + $previous_in,
14413 Description => $description_start
14414 . $current_age->name
14417 $previous_in = $current_in;
14419 # Add clarifying material for the corresponding age file. This is
14420 # in part because of the confusing and contradictory information
14421 # given in the Standard's documentation itself, as of 5.2.
14422 $current_age->add_description(
14423 "Code point's usage was introduced in version "
14424 . $current_age->name);
14425 $current_age->add_note("See also $in");
14429 # And finally the code points whose usages have yet to be decided are
14430 # the same in both properties. Note that permanently unassigned code
14431 # points actually have their usage assigned (as being permanently
14432 # unassigned), so that these tables are not the same as gc=cn.
14433 my $unassigned = $in->add_match_table($default_map);
14434 my $age_default = $age->table($default_map);
14435 $age_default->add_description(<<END
14436 Code point's usage has not been assigned in any Unicode release thus far.
14439 $unassigned->set_equivalent_to($age_default, Related => 1);
14442 # See L<perlfunc/quotemeta>
14443 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
14444 Perl_Extension => 1,
14445 Fate => $INTERNAL_ONLY,
14447 # Initialize to what's common in
14448 # all Unicode releases.
14451 + $gc->table('Control')
14454 # In early releases without the proper Unicode properties, just set to \W.
14455 if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
14456 || ! defined (my $patws = property_ref('Pattern_White_Space'))
14457 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
14459 $quotemeta += ~ $Word;
14462 $quotemeta += $patsyn->table('Y')
14463 + $patws->table('Y')
14465 + ((~ $Word) & $ASCII);
14468 # Finished creating all the perl properties. All non-internal non-string
14469 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
14470 # an underscore.) These do not get a separate entry in the pod file
14471 foreach my $table ($perl->tables) {
14472 foreach my $alias ($table->aliases) {
14473 next if $alias->name =~ /^_/;
14474 $table->add_alias('Is_' . $alias->name,
14477 Status => $alias->status,
14478 OK_as_Filename => 0);
14482 # Here done with all the basic stuff. Ready to populate the information
14483 # about each character if annotating them.
14486 # See comments at its declaration
14487 $annotate_ranges = Range_Map->new;
14489 # This separates out the non-characters from the other unassigneds, so
14490 # can give different annotations for each.
14491 $unassigned_sans_noncharacters = Range_List->new(
14492 Initialize => $gc->table('Unassigned'));
14493 if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
14494 $unassigned_sans_noncharacters &= $nonchars->table('N');
14497 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
14498 $i = populate_char_info($i); # Note sets $i so may cause skips
14506 sub add_perl_synonyms() {
14507 # A number of Unicode tables have Perl synonyms that are expressed in
14508 # the single-form, \p{name}. These are:
14509 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
14510 # \p{Is_Name} as synonyms
14511 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
14512 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
14513 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
14514 # conflict, \p{Value} and \p{Is_Value} as well
14516 # This routine generates these synonyms, warning of any unexpected
14519 # Construct the list of tables to get synonyms for. Start with all the
14520 # binary and the General_Category ones.
14521 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
14523 push @tables, $gc->tables;
14525 # If the version of Unicode includes the Script property, add its tables
14526 push @tables, $script->tables if defined $script;
14528 # The Block tables are kept separate because they are treated differently.
14529 # And the earliest versions of Unicode didn't include them, so add only if
14532 push @blocks, $block->tables if defined $block;
14534 # Here, have the lists of tables constructed. Process blocks last so that
14535 # if there are name collisions with them, blocks have lowest priority.
14536 # Should there ever be other collisions, manual intervention would be
14537 # required. See the comments at the beginning of the program for a
14538 # possible way to handle those semi-automatically.
14539 foreach my $table (@tables, @blocks) {
14541 # For non-binary properties, the synonym is just the name of the
14542 # table, like Greek, but for binary properties the synonym is the name
14543 # of the property, and means the code points in its 'Y' table.
14544 my $nominal = $table;
14545 my $nominal_property = $nominal->property;
14547 if (! $nominal->isa('Property')) {
14552 # Here is a binary property. Use the 'Y' table. Verify that is
14554 my $yes = $nominal->table('Y');
14555 unless (defined $yes) { # Must be defined, but is permissible to
14557 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
14563 foreach my $alias ($nominal->aliases) {
14565 # Attempt to create a table in the perl directory for the
14566 # candidate table, using whatever aliases in it that don't
14567 # conflict. Also add non-conflicting aliases for all these
14568 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
14570 foreach my $prefix ("", 'Is_', 'In_') {
14572 # Only Block properties can have added 'In_' aliases.
14573 next if $prefix eq 'In_' and $nominal_property != $block;
14575 my $proposed_name = $prefix . $alias->name;
14577 # No Is_Is, In_In, nor combinations thereof
14578 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
14579 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
14581 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
14583 # Get a reference to any existing table in the perl
14584 # directory with the desired name.
14585 my $pre_existing = $perl->table($proposed_name);
14587 if (! defined $pre_existing) {
14589 # No name collision, so ok to add the perl synonym.
14591 my $make_re_pod_entry;
14592 my $ok_as_filename;
14593 my $status = $alias->status;
14594 if ($nominal_property == $block) {
14596 # For block properties, the 'In' form is preferred for
14597 # external use; the pod file contains wild cards for
14598 # this and the 'Is' form so no entries for those; and
14599 # we don't want people using the name without the
14600 # 'In', so discourage that.
14601 if ($prefix eq "") {
14602 $make_re_pod_entry = 1;
14603 $status = $status || $DISCOURAGED;
14604 $ok_as_filename = 0;
14606 elsif ($prefix eq 'In_') {
14607 $make_re_pod_entry = 0;
14608 $status = $status || $NORMAL;
14609 $ok_as_filename = 1;
14612 $make_re_pod_entry = 0;
14613 $status = $status || $DISCOURAGED;
14614 $ok_as_filename = 0;
14617 elsif ($prefix ne "") {
14619 # The 'Is' prefix is handled in the pod by a wild
14620 # card, and we won't use it for an external name
14621 $make_re_pod_entry = 0;
14622 $status = $status || $NORMAL;
14623 $ok_as_filename = 0;
14627 # Here, is an empty prefix, non block. This gets its
14628 # own pod entry and can be used for an external name.
14629 $make_re_pod_entry = 1;
14630 $status = $status || $NORMAL;
14631 $ok_as_filename = 1;
14634 # Here, there isn't a perl pre-existing table with the
14635 # name. Look through the list of equivalents of this
14636 # table to see if one is a perl table.
14637 foreach my $equivalent ($actual->leader->equivalents) {
14638 next if $equivalent->property != $perl;
14640 # Here, have found a table for $perl. Add this alias
14641 # to it, and are done with this prefix.
14642 $equivalent->add_alias($proposed_name,
14643 Re_Pod_Entry => $make_re_pod_entry,
14645 # Currently don't output these in the
14646 # ucd pod, as are strongly discouraged
14651 OK_as_Filename => $ok_as_filename);
14652 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14656 # Here, $perl doesn't already have a table that is a
14657 # synonym for this property, add one.
14658 my $added_table = $perl->add_match_table($proposed_name,
14659 Re_Pod_Entry => $make_re_pod_entry,
14661 # See UCD comment just above
14665 OK_as_Filename => $ok_as_filename);
14666 # And it will be related to the actual table, since it is
14668 $added_table->set_equivalent_to($actual, Related => 1);
14669 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14671 } # End of no pre-existing.
14673 # Here, there is a pre-existing table that has the proposed
14674 # name. We could be in trouble, but not if this is just a
14675 # synonym for another table that we have already made a child
14676 # of the pre-existing one.
14677 if ($pre_existing->is_set_equivalent_to($actual)) {
14678 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14679 $pre_existing->add_alias($proposed_name);
14683 # Here, there is a name collision, but it still could be ok if
14684 # the tables match the identical set of code points, in which
14685 # case, we can combine the names. Compare each table's code
14686 # point list to see if they are identical.
14687 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14688 if ($pre_existing->matches_identically_to($actual)) {
14690 # Here, they do match identically. Not a real conflict.
14691 # Make the perl version a child of the Unicode one, except
14692 # in the non-obvious case of where the perl name is
14693 # already a synonym of another Unicode property. (This is
14694 # excluded by the test for it being its own parent.) The
14695 # reason for this exclusion is that then the two Unicode
14696 # properties become related; and we don't really know if
14697 # they are or not. We generate documentation based on
14698 # relatedness, and this would be misleading. Code
14699 # later executed in the process will cause the tables to
14700 # be represented by a single file anyway, without making
14701 # it look in the pod like they are necessarily related.
14702 if ($pre_existing->parent == $pre_existing
14703 && ($pre_existing->property == $perl
14704 || $actual->property == $perl))
14706 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14707 $pre_existing->set_equivalent_to($actual, Related => 1);
14709 elsif (main::DEBUG && $to_trace) {
14710 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14711 trace $pre_existing->parent;
14716 # Here they didn't match identically, there is a real conflict
14717 # between our new name and a pre-existing property.
14718 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14719 $pre_existing->add_conflicting($nominal->full_name,
14723 # Don't output a warning for aliases for the block
14724 # properties (unless they start with 'In_') as it is
14725 # expected that there will be conflicts and the block
14727 if ($verbosity >= $NORMAL_VERBOSITY
14728 && ($actual->property != $block || $prefix eq 'In_'))
14730 print simple_fold(join_lines(<<END
14731 There is already an alias named $proposed_name (from $pre_existing),
14732 so not creating this alias for $actual
14737 # Keep track for documentation purposes.
14738 $has_In_conflicts++ if $prefix eq 'In_';
14739 $has_Is_conflicts++ if $prefix eq 'Is_';
14744 # There are some properties which have No and Yes (and N and Y) as
14745 # property values, but aren't binary, and could possibly be confused with
14746 # binary ones. So create caveats for them. There are tables that are
14747 # named 'No', and tables that are named 'N', but confusion is not likely
14748 # unless they are the same table. For example, N meaning Number or
14749 # Neutral is not likely to cause confusion, so don't add caveats to things
14751 foreach my $property (grep { $_->type != $BINARY
14752 && $_->type != $FORCED_BINARY }
14755 my $yes = $property->table('Yes');
14756 if (defined $yes) {
14757 my $y = $property->table('Y');
14758 if (defined $y && $yes == $y) {
14759 foreach my $alias ($property->aliases) {
14760 $yes->add_conflicting($alias->name);
14764 my $no = $property->table('No');
14766 my $n = $property->table('N');
14767 if (defined $n && $no == $n) {
14768 foreach my $alias ($property->aliases) {
14769 $no->add_conflicting($alias->name, 'P');
14778 sub register_file_for_name($$$) {
14779 # Given info about a table and a datafile that it should be associated
14780 # with, register that association
14783 my $directory_ref = shift; # Array of the directory path for the file
14784 my $file = shift; # The file name in the final directory.
14785 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14787 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14789 if ($table->isa('Property')) {
14790 $table->set_file_path(@$directory_ref, $file);
14791 push @map_properties, $table;
14793 # No swash means don't do the rest of this.
14794 return if $table->fate != $ORDINARY;
14796 # Get the path to the file
14797 my @path = $table->file_path;
14799 # Use just the file name if no subdirectory.
14800 shift @path if $path[0] eq File::Spec->curdir();
14802 my $file = join '/', @path;
14804 # Create a hash entry for utf8_heavy to get the file that stores this
14805 # property's map table
14806 foreach my $alias ($table->aliases) {
14807 my $name = $alias->name;
14808 $loose_property_to_file_of{standardize($name)} = $file;
14811 # And a way for utf8_heavy to find the proper key in the SwashInfo
14812 # hash for this property.
14813 $file_to_swash_name{$file} = "To" . $table->swash_name;
14817 # Do all of the work for all equivalent tables when called with the leader
14818 # table, so skip if isn't the leader.
14819 return if $table->leader != $table;
14821 # If this is a complement of another file, use that other file instead,
14822 # with a ! prepended to it.
14824 if (($complement = $table->complement) != 0) {
14825 my @directories = $complement->file_path;
14827 # This assumes that the 0th element is something like 'lib',
14828 # the 1th element the property name (in its own directory), like
14829 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14830 # appended to it later.
14831 $directories[1] =~ s/^/!/;
14832 $file = pop @directories;
14833 $directory_ref =\@directories;
14836 # Join all the file path components together, using slashes.
14837 my $full_filename = join('/', @$directory_ref, $file);
14839 # All go in the same subdirectory of unicore, or the special
14840 # pseudo-directory '#'
14841 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
14842 Carp::my_carp("Unexpected directory in "
14843 . join('/', @{$directory_ref}, $file));
14846 # For this table and all its equivalents ...
14847 foreach my $table ($table, $table->equivalents) {
14849 # Associate it with its file internally. Don't include the
14850 # $matches_directory first component
14851 $table->set_file_path(@$directory_ref, $file);
14853 # No swash means don't do the rest of this.
14854 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14856 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14858 my $property = $table->property;
14859 my $property_name = ($property == $perl)
14860 ? "" # 'perl' is never explicitly stated
14861 : standardize($property->name) . '=';
14863 my $is_default = 0; # Is this table the default one for the property?
14865 # To calculate $is_default, we find if this table is the same as the
14866 # default one for the property. But this is complicated by the
14867 # possibility that there is a master table for this one, and the
14868 # information is stored there instead of here.
14869 my $parent = $table->parent;
14870 my $leader_prop = $parent->property;
14871 my $default_map = $leader_prop->default_map;
14872 if (defined $default_map) {
14873 my $default_table = $leader_prop->table($default_map);
14874 $is_default = 1 if defined $default_table && $parent == $default_table;
14877 # Calculate the loose name for this table. Mostly it's just its name,
14878 # standardized. But in the case of Perl tables that are single-form
14879 # equivalents to Unicode properties, it is the latter's name.
14880 my $loose_table_name =
14881 ($property != $perl || $leader_prop == $perl)
14882 ? standardize($table->name)
14883 : standardize($parent->name);
14885 my $deprecated = ($table->status eq $DEPRECATED)
14886 ? $table->status_info
14888 my $caseless_equivalent = $table->caseless_equivalent;
14890 # And for each of the table's aliases... This inner loop eventually
14891 # goes through all aliases in the UCD that we generate regex match
14893 foreach my $alias ($table->aliases) {
14894 my $standard = utf8_heavy_name($table, $alias);
14896 # Generate an entry in either the loose or strict hashes, which
14897 # will translate the property and alias names combination into the
14898 # file where the table for them is stored.
14899 if ($alias->loose_match) {
14900 if (exists $loose_to_file_of{$standard}) {
14901 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14904 $loose_to_file_of{$standard} = $sub_filename;
14908 if (exists $stricter_to_file_of{$standard}) {
14909 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14912 $stricter_to_file_of{$standard} = $sub_filename;
14914 # Tightly coupled with how utf8_heavy.pl works, for a
14915 # floating point number that is a whole number, get rid of
14916 # the trailing decimal point and 0's, so that utf8_heavy
14917 # will work. Also note that this assumes that such a
14918 # number is matched strictly; so if that were to change,
14919 # this would be wrong.
14920 if ((my $integer_name = $alias->name)
14921 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14923 $stricter_to_file_of{$property_name . $integer_name}
14929 # For Unicode::UCD, create a mapping of the prop=value to the
14930 # canonical =value for that property.
14931 if ($standard =~ /=/) {
14933 # This could happen if a strict name mapped into an existing
14934 # loose name. In that event, the strict names would have to
14935 # be moved to a new hash.
14936 if (exists($loose_to_standard_value{$standard})) {
14937 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
14939 $loose_to_standard_value{$standard} = $loose_table_name;
14942 # Keep a list of the deprecated properties and their filenames
14943 if ($deprecated && $complement == 0) {
14944 $utf8::why_deprecated{$sub_filename} = $deprecated;
14947 # And a substitute table, if any, for case-insensitive matching
14948 if ($caseless_equivalent != 0) {
14949 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14952 # Add to defaults list if the table this alias belongs to is the
14954 $loose_defaults{$standard} = 1 if $is_default;
14962 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
14964 my %full_dir_name_of; # Full length names of directories used.
14966 sub construct_filename($$$) {
14967 # Return a file name for a table, based on the table name, but perhaps
14968 # changed to get rid of non-portable characters in it, and to make
14969 # sure that it is unique on a file system that allows the names before
14970 # any period to be at most 8 characters (DOS). While we're at it
14971 # check and complain if there are any directory conflicts.
14973 my $name = shift; # The name to start with
14974 my $mutable = shift; # Boolean: can it be changed? If no, but
14975 # yet it must be to work properly, a warning
14977 my $directories_ref = shift; # A reference to an array containing the
14978 # path to the file, with each element one path
14979 # component. This is used because the same
14980 # name can be used in different directories.
14981 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14983 my $warn = ! defined wantarray; # If true, then if the name is
14984 # changed, a warning is issued as well.
14986 if (! defined $name) {
14987 Carp::my_carp("Undefined name in directory "
14988 . File::Spec->join(@$directories_ref)
14993 # Make sure that no directory names conflict with each other. Look at
14994 # each directory in the input file's path. If it is already in use,
14995 # assume it is correct, and is merely being re-used, but if we
14996 # truncate it to 8 characters, and find that there are two directories
14997 # that are the same for the first 8 characters, but differ after that,
14998 # then that is a problem.
14999 foreach my $directory (@$directories_ref) {
15000 my $short_dir = substr($directory, 0, 8);
15001 if (defined $full_dir_name_of{$short_dir}) {
15002 next if $full_dir_name_of{$short_dir} eq $directory;
15003 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
15006 $full_dir_name_of{$short_dir} = $directory;
15010 my $path = join '/', @$directories_ref;
15011 $path .= '/' if $path;
15013 # Remove interior underscores.
15014 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15016 # Change any non-word character into an underscore, and truncate to 8.
15017 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
15018 substr($filename, 8) = "" if length($filename) > 8;
15020 # Make sure the basename doesn't conflict with something we
15021 # might have already written. If we have, say,
15028 while (my $num = $base_names{$path}{lc $filename}++) {
15029 $num++; # so basenames with numbers start with '2', which
15030 # just looks more natural.
15032 # Want to append $num, but if it'll make the basename longer
15033 # than 8 characters, pre-truncate $filename so that the result
15035 my $delta = length($filename) + length($num) - 8;
15037 substr($filename, -$delta) = $num;
15042 if ($warn && ! $warned) {
15044 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
15048 return $filename if $mutable;
15050 # If not changeable, must return the input name, but warn if needed to
15051 # change it beyond shortening it.
15052 if ($name ne $filename
15053 && substr($name, 0, length($filename)) ne $filename) {
15054 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
15060 # The pod file contains a very large table. Many of the lines in that table
15061 # would exceed a typical output window's size, and so need to be wrapped with
15062 # a hanging indent to make them look good. The pod language is really
15063 # insufficient here. There is no general construct to do that in pod, so it
15064 # is done here by beginning each such line with a space to cause the result to
15065 # be output without formatting, and doing all the formatting here. This leads
15066 # to the result that if the eventual display window is too narrow it won't
15067 # look good, and if the window is too wide, no advantage is taken of that
15068 # extra width. A further complication is that the output may be indented by
15069 # the formatter so that there is less space than expected. What I (khw) have
15070 # done is to assume that that indent is a particular number of spaces based on
15071 # what it is in my Linux system; people can always resize their windows if
15072 # necessary, but this is obviously less than desirable, but the best that can
15074 my $automatic_pod_indent = 8;
15076 # Try to format so that uses fewest lines, but few long left column entries
15077 # slide into the right column. An experiment on 5.1 data yielded the
15078 # following percentages that didn't cut into the other side along with the
15079 # associated first-column widths
15081 # 80% not too bad except for a few blocks
15082 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
15084 my $indent_info_column = 27; # 75% of lines didn't have overlap
15086 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
15087 # The 3 is because of:
15088 # 1 for the leading space to tell the pod formatter to
15091 # 1 for the space between the flag and the main data
15093 sub format_pod_line ($$$;$$) {
15094 # Take a pod line and return it, formatted properly
15096 my $first_column_width = shift;
15097 my $entry = shift; # Contents of left column
15098 my $info = shift; # Contents of right column
15100 my $status = shift || ""; # Any flag
15102 my $loose_match = shift; # Boolean.
15103 $loose_match = 1 unless defined $loose_match;
15105 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15108 $flags .= $STRICTER if ! $loose_match;
15110 $flags .= $status if $status;
15112 # There is a blank in the left column to cause the pod formatter to
15113 # output the line as-is.
15114 return sprintf " %-*s%-*s %s\n",
15115 # The first * in the format is replaced by this, the -1 is
15116 # to account for the leading blank. There isn't a
15117 # hard-coded blank after this to separate the flags from
15118 # the rest of the line, so that in the unlikely event that
15119 # multiple flags are shown on the same line, they both
15120 # will get displayed at the expense of that separation,
15121 # but since they are left justified, a blank will be
15122 # inserted in the normal case.
15126 # The other * in the format is replaced by this number to
15127 # cause the first main column to right fill with blanks.
15128 # The -1 is for the guaranteed blank following it.
15129 $first_column_width - $FILLER - 1,
15134 my @zero_match_tables; # List of tables that have no matches in this release
15136 sub make_re_pod_entries($) {
15137 # This generates the entries for the pod file for a given table.
15138 # Also done at this time are any children tables. The output looks like:
15139 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
15141 my $input_table = shift; # Table the entry is for
15142 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15144 # Generate parent and all its children at the same time.
15145 return if $input_table->parent != $input_table;
15147 my $property = $input_table->property;
15148 my $type = $property->type;
15149 my $full_name = $property->full_name;
15151 my $count = $input_table->count;
15153 my $non_unicode_string;
15154 if ($count > $MAX_UNICODE_CODEPOINTS) {
15155 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
15156 - $MAX_UNICODE_CODEPOINT);
15157 $non_unicode_string = " plus all above-Unicode code points";
15160 $unicode_count = $count;
15161 $non_unicode_string = "";
15163 my $string_count = clarify_number($unicode_count) . $non_unicode_string;
15164 my $status = $input_table->status;
15165 my $status_info = $input_table->status_info;
15166 my $caseless_equivalent = $input_table->caseless_equivalent;
15168 # Don't mention a placeholder equivalent as it isn't to be listed in the
15170 $caseless_equivalent = 0 if $caseless_equivalent != 0
15171 && $caseless_equivalent->fate > $ORDINARY;
15173 my $entry_for_first_table; # The entry for the first table output.
15174 # Almost certainly, it is the parent.
15176 # For each related table (including itself), we will generate a pod entry
15177 # for each name each table goes by
15178 foreach my $table ($input_table, $input_table->children) {
15180 # utf8_heavy.pl cannot deal with null string property values, so skip
15181 # any tables that have no non-null names.
15182 next if ! grep { $_->name ne "" } $table->aliases;
15184 # First, gather all the info that applies to this table as a whole.
15186 push @zero_match_tables, $table if $count == 0
15187 # Don't mention special tables
15188 # as being zero length
15189 && $table->fate == $ORDINARY;
15191 my $table_property = $table->property;
15193 # The short name has all the underscores removed, while the full name
15194 # retains them. Later, we decide whether to output a short synonym
15195 # for the full one, we need to compare apples to apples, so we use the
15196 # short name's length including underscores.
15197 my $table_property_short_name_length;
15198 my $table_property_short_name
15199 = $table_property->short_name(\$table_property_short_name_length);
15200 my $table_property_full_name = $table_property->full_name;
15202 # Get how much savings there is in the short name over the full one
15203 # (delta will always be <= 0)
15204 my $table_property_short_delta = $table_property_short_name_length
15205 - length($table_property_full_name);
15206 my @table_description = $table->description;
15207 my @table_note = $table->note;
15209 # Generate an entry for each alias in this table.
15210 my $entry_for_first_alias; # saves the first one encountered.
15211 foreach my $alias ($table->aliases) {
15213 # Skip if not to go in pod.
15214 next unless $alias->make_re_pod_entry;
15216 # Start gathering all the components for the entry
15217 my $name = $alias->name;
15219 # Skip if name is empty, as can't be accessed by regexes.
15220 next if $name eq "";
15222 my $entry; # Holds the left column, may include extras
15223 my $entry_ref; # To refer to the left column's contents from
15224 # another entry; has no extras
15226 # First the left column of the pod entry. Tables for the $perl
15227 # property always use the single form.
15228 if ($table_property == $perl) {
15229 $entry = "\\p{$name}";
15230 $entry .= " \\p$name" if length $name == 1; # Show non-braced
15232 $entry_ref = "\\p{$name}";
15234 else { # Compound form.
15236 # Only generate one entry for all the aliases that mean true
15237 # or false in binary properties. Append a '*' to indicate
15238 # some are missing. (The heading comment notes this.)
15240 if ($type == $BINARY) {
15241 next if $name ne 'N' && $name ne 'Y';
15244 elsif ($type != $FORCED_BINARY) {
15249 # Forced binary properties require special handling. It
15250 # has two sets of tables, one set is true/false; and the
15251 # other set is everything else. Entries are generated for
15252 # each set. Use the Bidi_Mirrored property (which appears
15253 # in all Unicode versions) to get a list of the aliases
15254 # for the true/false tables. Of these, only output the N
15255 # and Y ones, the same as, a regular binary property. And
15256 # output all the rest, same as a non-binary property.
15257 my $bm = property_ref("Bidi_Mirrored");
15258 if ($name eq 'N' || $name eq 'Y') {
15260 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
15261 $bm->table("N")->aliases)
15270 # Colon-space is used to give a little more space to be easier
15273 . $table_property_full_name
15276 # But for the reference to this entry, which will go in the
15277 # right column, where space is at a premium, use equals
15279 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
15282 # Then the right (info) column. This is stored as components of
15283 # an array for the moment, then joined into a string later. For
15284 # non-internal only properties, begin the info with the entry for
15285 # the first table we encountered (if any), as things are ordered
15286 # so that that one is the most descriptive. This leads to the
15287 # info column of an entry being a more descriptive version of the
15290 if ($name =~ /^_/) {
15292 '(For internal use by Perl, not necessarily stable)';
15294 elsif ($entry_for_first_alias) {
15295 push @info, $entry_for_first_alias;
15298 # If this entry is equivalent to another, add that to the info,
15299 # using the first such table we encountered
15300 if ($entry_for_first_table) {
15302 push @info, "(= $entry_for_first_table)";
15305 push @info, $entry_for_first_table;
15309 # If the name is a large integer, add an equivalent with an
15310 # exponent for better readability
15311 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
15312 push @info, sprintf "(= %.1e)", $name
15315 my $parenthesized = "";
15316 if (! $entry_for_first_alias) {
15318 # This is the first alias for the current table. The alias
15319 # array is ordered so that this is the fullest, most
15320 # descriptive alias, so it gets the fullest info. The other
15321 # aliases are mostly merely pointers to this one, using the
15322 # information already added above.
15324 # Display any status message, but only on the parent table
15325 if ($status && ! $entry_for_first_table) {
15326 push @info, $status_info;
15329 # Put out any descriptive info
15330 if (@table_description || @table_note) {
15331 push @info, join "; ", @table_description, @table_note;
15334 # Look to see if there is a shorter name we can point people
15336 my $standard_name = standardize($name);
15338 my $proposed_short = $table->short_name;
15339 if (defined $proposed_short) {
15340 my $standard_short = standardize($proposed_short);
15342 # If the short name is shorter than the standard one, or
15343 # even it it's not, but the combination of it and its
15344 # short property name (as in \p{prop=short} ($perl doesn't
15345 # have this form)) saves at least two characters, then,
15346 # cause it to be listed as a shorter synonym.
15347 if (length $standard_short < length $standard_name
15348 || ($table_property != $perl
15349 && (length($standard_short)
15350 - length($standard_name)
15351 + $table_property_short_delta) # (<= 0)
15354 $short_name = $proposed_short;
15355 if ($table_property != $perl) {
15356 $short_name = $table_property_short_name
15359 $short_name = "\\p{$short_name}";
15363 # And if this is a compound form name, see if there is a
15364 # single form equivalent
15366 if ($table_property != $perl) {
15368 # Special case the binary N tables, so that will print
15369 # \P{single}, but use the Y table values to populate
15370 # 'single', as we haven't likewise populated the N table.
15371 # For forced binary tables, we can't just look at the N
15372 # table, but must see if this table is equivalent to the N
15373 # one, as there are two equivalent beasts in these
15377 if ( ($type == $BINARY
15378 && $input_table == $property->table('No'))
15379 || ($type == $FORCED_BINARY
15380 && $property->table('No')->
15381 is_set_equivalent_to($input_table)))
15383 $test_table = $property->table('Yes');
15387 $test_table = $input_table;
15391 # Look for a single form amongst all the children.
15392 foreach my $table ($test_table->children) {
15393 next if $table->property != $perl;
15394 my $proposed_name = $table->short_name;
15395 next if ! defined $proposed_name;
15397 # Don't mention internal-only properties as a possible
15398 # single form synonym
15399 next if substr($proposed_name, 0, 1) eq '_';
15401 $proposed_name = "\\$p\{$proposed_name}";
15402 if (! defined $single_form
15403 || length($proposed_name) < length $single_form)
15405 $single_form = $proposed_name;
15407 # The goal here is to find a single form; not the
15408 # shortest possible one. We've already found a
15409 # short name. So, stop at the first single form
15410 # found, which is likely to be closer to the
15417 # Ouput both short and single in the same parenthesized
15418 # expression, but with only one of 'Single', 'Short' if there
15420 if ($short_name || $single_form || $table->conflicting) {
15421 $parenthesized .= "Short: $short_name" if $short_name;
15422 if ($short_name && $single_form) {
15423 $parenthesized .= ', ';
15425 elsif ($single_form) {
15426 $parenthesized .= 'Single: ';
15428 $parenthesized .= $single_form if $single_form;
15432 if ($caseless_equivalent != 0) {
15433 $parenthesized .= '; ' if $parenthesized ne "";
15434 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
15438 # Warn if this property isn't the same as one that a
15439 # semi-casual user might expect. The other components of this
15440 # parenthesized structure are calculated only for the first entry
15441 # for this table, but the conflicting is deemed important enough
15442 # to go on every entry.
15443 my $conflicting = join " NOR ", $table->conflicting;
15444 if ($conflicting) {
15445 $parenthesized .= '; ' if $parenthesized ne "";
15446 $parenthesized .= "NOT $conflicting";
15449 push @info, "($parenthesized)" if $parenthesized;
15451 if ($name =~ /_$/ && $alias->loose_match) {
15452 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
15455 if ($table_property != $perl && $table->perl_extension) {
15456 push @info, '(Perl extension)';
15458 push @info, "($string_count)";
15460 # Now, we have both the entry and info so add them to the
15461 # list of all the properties.
15462 push @match_properties,
15463 format_pod_line($indent_info_column,
15467 $alias->loose_match);
15469 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
15470 } # End of looping through the aliases for this table.
15472 if (! $entry_for_first_table) {
15473 $entry_for_first_table = $entry_for_first_alias;
15475 } # End of looping through all the related tables
15479 sub make_ucd_table_pod_entries {
15482 # Generate the entries for the UCD section of the pod for $table. This
15483 # also calculates if names are ambiguous, so has to be called even if the
15484 # pod is not being output
15486 my $short_name = $table->name;
15487 my $standard_short_name = standardize($short_name);
15488 my $full_name = $table->full_name;
15489 my $standard_full_name = standardize($full_name);
15491 my $full_info = ""; # Text of info column for full-name entries
15492 my $other_info = ""; # Text of info column for short-name entries
15493 my $short_info = ""; # Text of info column for other entries
15494 my $meaning = ""; # Synonym of this table
15496 my $property = ($table->isa('Property'))
15498 : $table->parent->property;
15500 my $perl_extension = $table->perl_extension;
15502 # Get the more official name for for perl extensions that aren't
15503 # stand-alone properties
15504 if ($perl_extension && $property != $table) {
15505 if ($property == $perl ||$property->type == $BINARY) {
15506 $meaning = $table->complete_name;
15509 $meaning = $property->full_name . "=$full_name";
15513 # There are three types of info column. One for the short name, one for
15514 # the full name, and one for everything else. They mostly are the same,
15515 # so initialize in the same loop.
15516 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
15517 if ($perl_extension && $property != $table) {
15519 # Add the synonymous name for the non-full name entries; and to
15520 # the full-name entry if it adds extra information
15521 if ($info_ref == \$other_info
15522 || ($info_ref == \$short_info
15523 && $standard_short_name ne $standard_full_name)
15524 || standardize($meaning) ne $standard_full_name
15526 $$info_ref .= "$meaning.";
15529 elsif ($info_ref != \$full_info) {
15531 # Otherwise, the non-full name columns include the full name
15532 $$info_ref .= $full_name;
15535 # And the full-name entry includes the short name, if different
15536 if ($info_ref == \$full_info
15537 && $standard_short_name ne $standard_full_name)
15539 $full_info =~ s/\.\Z//;
15540 $full_info .= " " if $full_info;
15541 $full_info .= "(Short: $short_name)";
15544 if ($table->perl_extension) {
15545 $$info_ref =~ s/\.\Z//;
15546 $$info_ref .= ". " if $$info_ref;
15547 $$info_ref .= "(Perl extension)";
15551 # Add any extra annotations to the full name entry
15552 foreach my $more_info ($table->description,
15554 $table->status_info)
15556 next unless $more_info;
15557 $full_info =~ s/\.\Z//;
15558 $full_info .= ". " if $full_info;
15559 $full_info .= $more_info;
15562 # These keep track if have created full and short name pod entries for the
15565 my $done_short = 0;
15567 # Every possible name is kept track of, even those that aren't going to be
15568 # output. This way we can be sure to find the ambiguities.
15569 foreach my $alias ($table->aliases) {
15570 my $name = $alias->name;
15571 my $standard = standardize($name);
15573 my $output_this = $alias->ucd;
15575 # If the full and short names are the same, we want to output the full
15576 # one's entry, so it has priority.
15577 if ($standard eq $standard_full_name) {
15578 next if $done_full;
15580 $info = $full_info;
15582 elsif ($standard eq $standard_short_name) {
15583 next if $done_short;
15585 next if $standard_short_name eq $standard_full_name;
15586 $info = $short_info;
15589 $info = $other_info;
15592 # Here, we have set up the two columns for this entry. But if an
15593 # entry already exists for this name, we have to decide which one
15594 # we're going to later output.
15595 if (exists $ucd_pod{$standard}) {
15597 # If the two entries refer to the same property, it's not going to
15598 # be ambiguous. (Likely it's because the names when standardized
15599 # are the same.) But that means if they are different properties,
15600 # there is ambiguity.
15601 if ($ucd_pod{$standard}->{'property'} != $property) {
15603 # Here, we have an ambiguity. This code assumes that one is
15604 # scheduled to be output and one not and that one is a perl
15605 # extension (which is not to be output) and the other isn't.
15606 # If those assumptions are wrong, things have to be rethought.
15607 if ($ucd_pod{$standard}{'output_this'} == $output_this
15608 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
15609 || $output_this == $perl_extension)
15611 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
15614 # We modifiy the info column of the one being output to
15615 # indicate the ambiguity. Set $which to point to that one's
15618 if ($ucd_pod{$standard}{'output_this'}) {
15619 $which = \$ucd_pod{$standard}->{'info'};
15623 $meaning = $ucd_pod{$standard}{'meaning'};
15627 $$which =~ s/\.\Z//;
15628 $$which .= "; NOT '$standard' meaning '$meaning'";
15630 $ambiguous_names{$standard} = 1;
15633 # Use the non-perl-extension variant
15634 next unless $ucd_pod{$standard}{'perl_extension'};
15637 # Store enough information about this entry that we can later look for
15638 # ambiguities, and output it properly.
15639 $ucd_pod{$standard} = { 'name' => $name,
15641 'meaning' => $meaning,
15642 'output_this' => $output_this,
15643 'perl_extension' => $perl_extension,
15644 'property' => $property,
15645 'status' => $alias->status,
15647 } # End of looping through all this table's aliases
15652 sub pod_alphanumeric_sort {
15653 # Sort pod entries alphanumerically.
15655 # The first few character columns are filler, plus the '\p{'; and get rid
15656 # of all the trailing stuff, starting with the trailing '}', so as to sort
15657 # on just 'Name=Value'
15658 (my $a = lc $a) =~ s/^ .*? { //x;
15660 (my $b = lc $b) =~ s/^ .*? { //x;
15663 # Determine if the two operands are both internal only or both not.
15664 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
15665 # should be the underscore that begins internal only
15666 my $a_is_internal = (substr($a, 0, 1) eq '_');
15667 my $b_is_internal = (substr($b, 0, 1) eq '_');
15669 # Sort so the internals come last in the table instead of first (which the
15670 # leading underscore would otherwise indicate).
15671 if ($a_is_internal != $b_is_internal) {
15672 return 1 if $a_is_internal;
15676 # Determine if the two operands are numeric property values or not.
15677 # A numeric property will look like xyz: 3. But the number
15678 # can begin with an optional minus sign, and may have a
15679 # fraction or rational component, like xyz: 3/2. If either
15680 # isn't numeric, use alphabetic sort.
15681 my ($a_initial, $a_number) =
15682 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15683 return $a cmp $b unless defined $a_number;
15684 my ($b_initial, $b_number) =
15685 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15686 return $a cmp $b unless defined $b_number;
15688 # Here they are both numeric, but use alphabetic sort if the
15689 # initial parts don't match
15690 return $a cmp $b if $a_initial ne $b_initial;
15692 # Convert rationals to floating for the comparison.
15693 $a_number = eval $a_number if $a_number =~ qr{/};
15694 $b_number = eval $b_number if $b_number =~ qr{/};
15696 return $a_number <=> $b_number;
15700 # Create the .pod file. This generates the various subsections and then
15701 # combines them in one big HERE document.
15703 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15705 return unless defined $pod_directory;
15706 print "Making pod file\n" if $verbosity >= $PROGRESS;
15708 my $exception_message =
15709 '(Any exceptions are individually noted beginning with the word NOT.)';
15711 if (-e 'Blocks.txt') {
15713 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
15714 # if the global $has_In_conflicts indicates we have them.
15715 push @match_properties, format_pod_line($indent_info_column,
15718 . (($has_In_conflicts)
15719 ? " $exception_message"
15721 @block_warning = << "END";
15723 Matches in the Block property have shortcuts that begin with "In_". For
15724 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
15725 backward compatibility, if there is no conflict with another shortcut, these
15726 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
15727 are numerous such conflicting shortcuts. Use of these forms for Block is
15728 discouraged, and are flagged as such, not only because of the potential
15729 confusion as to what is meant, but also because a later release of Unicode may
15730 preempt the shortcut, and your program would no longer be correct. Use the
15731 "In_" form instead to avoid this, or even more clearly, use the compound form,
15732 e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
15736 my $text = $Is_flags_text;
15737 $text = "$exception_message $text" if $has_Is_conflicts;
15739 # And the 'Is_ line';
15740 push @match_properties, format_pod_line($indent_info_column,
15744 # Sort the properties array for output. It is sorted alphabetically
15745 # except numerically for numeric properties, and only output unique lines.
15746 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15748 my $formatted_properties = simple_fold(\@match_properties,
15750 # indent succeeding lines by two extra
15751 # which looks better
15752 $indent_info_column + 2,
15754 # shorten the line length by how much
15755 # the formatter indents, so the folded
15756 # line will fit in the space
15757 # presumably available
15758 $automatic_pod_indent);
15759 # Add column headings, indented to be a little more centered, but not
15761 $formatted_properties = format_pod_line($indent_info_column,
15765 . $formatted_properties;
15767 # Generate pod documentation lines for the tables that match nothing
15768 my $zero_matches = "";
15769 if (@zero_match_tables) {
15770 @zero_match_tables = uniques(@zero_match_tables);
15771 $zero_matches = join "\n\n",
15772 map { $_ = '=item \p{' . $_->complete_name . "}" }
15773 sort { $a->complete_name cmp $b->complete_name }
15774 @zero_match_tables;
15776 $zero_matches = <<END;
15778 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15780 Unicode has some property-value pairs that currently don't match anything.
15781 This happens generally either because they are obsolete, or they exist for
15782 symmetry with other forms, but no language has yet been encoded that uses
15783 them. In this version of Unicode, the following match zero code points:
15794 # Generate list of properties that we don't accept, grouped by the reasons
15795 # why. This is so only put out the 'why' once, and then list all the
15796 # properties that have that reason under it.
15798 my %why_list; # The keys are the reasons; the values are lists of
15799 # properties that have the key as their reason
15801 # For each property, add it to the list that are suppressed for its reason
15802 # The sort will cause the alphabetically first properties to be added to
15803 # each list first, so each list will be sorted.
15804 foreach my $property (sort keys %why_suppressed) {
15805 push @{$why_list{$why_suppressed{$property}}}, $property;
15808 # For each reason (sorted by the first property that has that reason)...
15809 my @bad_re_properties;
15810 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15813 # Add to the output, all the properties that have that reason.
15814 my $has_item = 0; # Flag if actually output anything.
15815 foreach my $name (@{$why_list{$why}}) {
15817 # Split compound names into $property and $table components
15818 my $property = $name;
15820 if ($property =~ / (.*) = (.*) /x) {
15825 # This release of Unicode may not have a property that is
15826 # suppressed, so don't reference a non-existent one.
15827 $property = property_ref($property);
15828 next if ! defined $property;
15830 # And since this list is only for match tables, don't list the
15831 # ones that don't have match tables.
15832 next if ! $property->to_create_match_tables;
15834 # Find any abbreviation, and turn it into a compound name if this
15835 # is a property=value pair.
15836 my $short_name = $property->name;
15837 $short_name .= '=' . $property->table($table)->name if $table;
15839 # Start with an empty line.
15840 push @bad_re_properties, "\n\n" unless $has_item;
15842 # And add the property as an item for the reason.
15843 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15847 # And add the reason under the list of properties, if such a list
15848 # actually got generated. Note that the header got added
15849 # unconditionally before. But pod ignores extra blank lines, so no
15851 push @bad_re_properties, "\n$why\n" if $has_item;
15853 } # End of looping through each reason.
15855 if (! @bad_re_properties) {
15856 push @bad_re_properties,
15857 "*** This installation accepts ALL non-Unihan properties ***";
15860 # Add =over only if non-empty to avoid an empty =over/=back section,
15861 # which is considered bad form.
15862 unshift @bad_re_properties, "\n=over 4\n";
15863 push @bad_re_properties, "\n=back\n";
15866 # Similiarly, generate a list of files that we don't use, grouped by the
15867 # reasons why. First, create a hash whose keys are the reasons, and whose
15868 # values are anonymous arrays of all the files that share that reason.
15869 my %grouped_by_reason;
15870 foreach my $file (keys %ignored_files) {
15871 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15873 foreach my $file (keys %skipped_files) {
15874 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15877 # Then, sort each group.
15878 foreach my $group (keys %grouped_by_reason) {
15879 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15880 @{$grouped_by_reason{$group}} ;
15883 # Finally, create the output text. For each reason (sorted by the
15884 # alphabetically first file that has that reason)...
15886 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15887 cmp lc $grouped_by_reason{$b}->[0]
15889 keys %grouped_by_reason)
15891 # Add all the files that have that reason to the output. Start
15892 # with an empty line.
15893 push @unused_files, "\n\n";
15894 push @unused_files, map { "\n=item F<$_> \n" }
15895 @{$grouped_by_reason{$reason}};
15896 # And add the reason under the list of files
15897 push @unused_files, "\n$reason\n";
15900 # Similarly, create the output text for the UCD section of the pod
15902 foreach my $key (keys %ucd_pod) {
15903 next unless $ucd_pod{$key}->{'output_this'};
15904 push @ucd_pod, format_pod_line($indent_info_column,
15905 $ucd_pod{$key}->{'name'},
15906 $ucd_pod{$key}->{'info'},
15907 $ucd_pod{$key}->{'status'},
15911 # Sort alphabetically, and fold for output
15912 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15913 my $ucd_pod = simple_fold(\@ucd_pod,
15915 $indent_info_column,
15916 $automatic_pod_indent);
15917 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
15922 # Everything is ready to assemble.
15923 my @OUT = << "END";
15928 To change this file, edit $0 instead.
15934 $pod_file - Index of Unicode Version $string_version character properties in Perl
15938 This document provides information about the portion of the Unicode database
15939 that deals with character properties, that is the portion that is defined on
15940 single code points. (L</Other information in the Unicode data base>
15941 below briefly mentions other data that Unicode provides.)
15943 Perl can provide access to all non-provisional Unicode character properties,
15944 though not all are enabled by default. The omitted ones are the Unihan
15945 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15946 deprecated or Unicode-internal properties. (An installation may choose to
15947 recompile Perl's tables to change this. See L<Unicode character
15948 properties that are NOT accepted by Perl>.)
15950 For most purposes, access to Unicode properties from the Perl core is through
15951 regular expression matches, as described in the next section.
15952 For some special purposes, and to access the properties that are not suitable
15953 for regular expression matching, all the Unicode character properties that
15954 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15955 described in the section L</Properties accessible through Unicode::UCD>.
15957 Perl also provides some additional extensions and short-cut synonyms
15958 for Unicode properties.
15960 This document merely lists all available properties and does not attempt to
15961 explain what each property really means. There is a brief description of each
15962 Perl extension; see L<perlunicode/Other Properties> for more information on
15963 these. There is some detail about Blocks, Scripts, General_Category,
15964 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15965 official Unicode properties, refer to the Unicode standard. A good starting
15966 place is L<$unicode_reference_url>.
15968 Note that you can define your own properties; see
15969 L<perlunicode/"User-Defined Character Properties">.
15971 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15973 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15974 most of the Unicode character properties. The table below shows all these
15975 constructs, both single and compound forms.
15977 B<Compound forms> consist of two components, separated by an equals sign or a
15978 colon. The first component is the property name, and the second component is
15979 the particular value of the property to match against, for example,
15980 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15981 whose Script property value is Greek.
15983 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15984 their equivalent compound forms. The table shows these equivalences. (In our
15985 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
15986 There are also a few Perl-defined single forms that are not shortcuts for a
15987 compound form. One such is C<\\p{Word}>. These are also listed in the table.
15989 In parsing these constructs, Perl always ignores Upper/lower case differences
15990 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
15991 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
15992 the left brace completely changes the meaning of the construct, from "match"
15993 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
15994 for improved legibility.
15996 Also, white space, hyphens, and underscores are normally ignored
15997 everywhere between the {braces}, and hence can be freely added or removed
15998 even if the C</x> modifier hasn't been specified on the regular expression.
15999 But in the table below $a_bold_stricter at the beginning of an entry
16000 means that tighter (stricter) rules are used for that entry:
16006 =item Single form (C<\\p{name}>) tighter rules:
16008 White space, hyphens, and underscores ARE significant
16013 =item * white space adjacent to a non-word character
16015 =item * underscores separating digits in numbers
16019 That means, for example, that you can freely add or remove white space
16020 adjacent to (but within) the braces without affecting the meaning.
16022 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
16024 The tighter rules given above for the single form apply to everything to the
16025 right of the colon or equals; the looser rules still apply to everything to
16028 That means, for example, that you can freely add or remove white space
16029 adjacent to (but within) the braces and the colon or equal sign.
16035 Some properties are considered obsolete by Unicode, but still available.
16036 There are several varieties of obsolescence:
16044 A property may be stabilized. Such a determination does not indicate
16045 that the property should or should not be used; instead it is a declaration
16046 that the property will not be maintained nor extended for newly encoded
16047 characters. Such properties are marked with $a_bold_stabilized in the
16052 A property may be deprecated, perhaps because its original intent
16053 has been replaced by another property, or because its specification was
16054 somehow defective. This means that its use is strongly
16055 discouraged, so much so that a warning will be issued if used, unless the
16056 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
16057 statement. $A_bold_deprecated flags each such entry in the table, and
16058 the entry there for the longest, most descriptive version of the property will
16059 give the reason it is deprecated, and perhaps advice. Perl may issue such a
16060 warning, even for properties that aren't officially deprecated by Unicode,
16061 when there used to be characters or code points that were matched by them, but
16062 no longer. This is to warn you that your program may not work like it did on
16063 earlier Unicode releases.
16065 A deprecated property may be made unavailable in a future Perl version, so it
16066 is best to move away from them.
16068 A deprecated property may also be stabilized, but this fact is not shown.
16072 Properties marked with $a_bold_obsolete in the table are considered (plain)
16073 obsolete. Generally this designation is given to properties that Unicode once
16074 used for internal purposes (but not any longer).
16078 Some Perl extensions are present for backwards compatibility and are
16079 discouraged from being used, but are not obsolete. $A_bold_discouraged
16080 flags each such entry in the table. Future Unicode versions may force
16081 some of these extensions to be removed without warning, replaced by another
16082 property with the same name that means something different. Use the
16083 equivalent shown instead.
16089 The table below has two columns. The left column contains the C<\\p{}>
16090 constructs to look up, possibly preceded by the flags mentioned above; and
16091 the right column contains information about them, like a description, or
16092 synonyms. The table shows both the single and compound forms for each
16093 property that has them. If the left column is a short name for a property,
16094 the right column will give its longer, more descriptive name; and if the left
16095 column is the longest name, the right column will show any equivalent shortest
16096 name, in both single and compound forms if applicable.
16098 If braces are not needed to specify a property (e.g., C<\\pL>), the left
16099 column contains both forms, with and without braces.
16101 The right column will also caution you if a property means something different
16102 than what might normally be expected.
16104 All single forms are Perl extensions; a few compound forms are as well, and
16107 Numbers in (parentheses) indicate the total number of Unicode code points
16108 matched by the property. For emphasis, those properties that match no code
16109 points at all are listed as well in a separate section following the table.
16111 Most properties match the same code points regardless of whether C<"/i">
16112 case-insensitive matching is specified or not. But a few properties are
16113 affected. These are shown with the notation S<C<(/i= I<other_property>)>>
16114 in the second column. Under case-insensitive matching they match the
16115 same code pode points as the property I<other_property>.
16117 There is no description given for most non-Perl defined properties (See
16118 L<$unicode_reference_url> for that).
16120 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
16121 combinations. For example, entries like:
16123 \\p{Gc: *} \\p{General_Category: *}
16125 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
16126 for the latter is also valid for the former. Similarly,
16130 means that if and only if, for example, C<\\p{Foo}> exists, then
16131 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
16132 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
16133 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
16136 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
16137 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
16138 'N*' to indicate this, and doesn't have separate entries for the other
16139 possibilities. Note that not all properties which have values 'Yes' and 'No'
16140 are binary, and they have all their values spelled out without using this wild
16141 card, and a C<NOT> clause in their description that highlights their not being
16142 binary. These also require the compound form to match them, whereas true
16143 binary properties have both single and compound forms available.
16145 Note that all non-essential underscores are removed in the display of the
16154 B<*> is a wild-card
16158 B<(\\d+)> in the info column gives the number of Unicode code points matched
16163 B<$DEPRECATED> means this is deprecated.
16167 B<$OBSOLETE> means this is obsolete.
16171 B<$STABILIZED> means this is stabilized.
16175 B<$STRICTER> means tighter (stricter) name matching applies.
16179 B<$DISCOURAGED> means use of this form is discouraged, and may not be
16184 $formatted_properties
16188 =head1 Properties accessible through Unicode::UCD
16190 All the Unicode character properties mentioned above (except for those marked
16191 as for internal use by Perl) are also accessible by
16192 L<Unicode::UCD/prop_invlist()>.
16194 Due to their nature, not all Unicode character properties are suitable for
16195 regular expression matches, nor C<prop_invlist()>. The remaining
16196 non-provisional, non-internal ones are accessible via
16197 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
16198 hasn't included; see L<below for which those are|/Unicode character properties
16199 that are NOT accepted by Perl>).
16201 For compatibility with other parts of Perl, all the single forms given in the
16202 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
16203 are recognized. BUT, there are some ambiguities between some Perl extensions
16204 and the Unicode properties, all of which are silently resolved in favor of the
16205 official Unicode property. To avoid surprises, you should only use
16206 C<prop_invmap()> for forms listed in the table below, which omits the
16207 non-recommended ones. The affected forms are the Perl single form equivalents
16208 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
16209 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
16210 whose short name is C<sc>. The table indicates the current ambiguities in the
16211 INFO column, beginning with the word C<"NOT">.
16213 The standard Unicode properties listed below are documented in
16214 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
16215 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
16216 L<perlunicode/Other Properties>;
16218 The first column in the table is a name for the property; the second column is
16219 an alternative name, if any, plus possibly some annotations. The alternative
16220 name is the property's full name, unless that would simply repeat the first
16221 column, in which case the second column indicates the property's short name
16222 (if different). The annotations are given only in the entry for the full
16223 name. If a property is obsolete, etc, the entry will be flagged with the same
16224 characters used in the table in the L<section above|/Properties accessible
16225 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
16229 =head1 Properties accessible through other means
16231 Certain properties are accessible also via core function calls. These are:
16233 Lowercase_Mapping lc() and lcfirst()
16234 Titlecase_Mapping ucfirst()
16235 Uppercase_Mapping uc()
16237 Also, Case_Folding is accessible through the C</i> modifier in regular
16238 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
16241 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
16242 interpolation in double-quoted strings and regular expressions; and functions
16243 C<charnames::viacode()>, C<charnames::vianame()>, and
16244 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
16247 Finally, most properties related to decomposition are accessible via
16248 L<Unicode::Normalize>.
16250 =head1 Unicode character properties that are NOT accepted by Perl
16252 Perl will generate an error for a few character properties in Unicode when
16253 used in a regular expression. The non-Unihan ones are listed below, with the
16254 reasons they are not accepted, perhaps with work-arounds. The short names for
16255 the properties are listed enclosed in (parentheses).
16256 As described after the list, an installation can change the defaults and choose
16257 to accept any of these. The list is machine generated based on the
16258 choices made for the installation that generated this document.
16262 An installation can choose to allow any of these to be matched by downloading
16263 the Unicode database from L<http://www.unicode.org/Public/> to
16264 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
16265 controlling lists contained in the program
16266 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
16267 (C<\%Config> is available from the Config module).
16269 =head1 Other information in the Unicode data base
16271 The Unicode data base is delivered in two different formats. The XML version
16272 is valid for more modern Unicode releases. The other version is a collection
16273 of files. The two are intended to give equivalent information. Perl uses the
16274 older form; this allows you to recompile Perl to use early Unicode releases.
16276 The only non-character property that Perl currently supports is Named
16277 Sequences, in which a sequence of code points
16278 is given a name and generally treated as a single entity. (Perl supports
16279 these via the C<\\N{...}> double-quotish construct,
16280 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
16282 Below is a list of the files in the Unicode data base that Perl doesn't
16283 currently use, along with very brief descriptions of their purposes.
16284 Some of the names of the files have been shortened from those that Unicode
16285 uses, in order to allow them to be distinguishable from similarly named files
16286 on file systems for which only the first 8 characters of a name are
16297 L<$unicode_reference_url>
16305 # And write it. The 0 means no utf8.
16306 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
16310 sub make_Heavy () {
16311 # Create and write Heavy.pl, which passes info about the tables to
16314 # Stringify structures for output
16315 my $loose_property_name_of
16316 = simple_dumper(\%loose_property_name_of, ' ' x 4);
16317 chomp $loose_property_name_of;
16319 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
16320 chomp $stricter_to_file_of;
16322 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
16323 chomp $inline_definitions;
16325 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
16326 chomp $loose_to_file_of;
16328 my $nv_floating_to_rational
16329 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
16330 chomp $nv_floating_to_rational;
16332 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
16333 chomp $why_deprecated;
16335 # We set the key to the file when we associated files with tables, but we
16336 # couldn't do the same for the value then, as we might not have the file
16337 # for the alternate table figured out at that time.
16338 foreach my $cased (keys %caseless_equivalent_to) {
16339 my @path = $caseless_equivalent_to{$cased}->file_path;
16341 if ($path[0] eq "#") { # Pseudo-directory '#'
16342 $path = join '/', @path;
16344 else { # Gets rid of lib/
16345 $path = join '/', @path[1, -1];
16347 $caseless_equivalent_to{$cased} = $path;
16349 my $caseless_equivalent_to
16350 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
16351 chomp $caseless_equivalent_to;
16353 my $loose_property_to_file_of
16354 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
16355 chomp $loose_property_to_file_of;
16357 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
16358 chomp $file_to_swash_name;
16362 $INTERNAL_ONLY_HEADER
16364 # This file is for the use of utf8_heavy.pl and Unicode::UCD
16366 # Maps Unicode (not Perl single-form extensions) property names in loose
16367 # standard form to their corresponding standard names
16368 \%utf8::loose_property_name_of = (
16369 $loose_property_name_of
16372 # Gives the definitions (in the form of inversion lists) for those properties
16373 # whose definitions aren't kept in files
16374 \@utf8::inline_definitions = (
16375 $inline_definitions
16378 # Maps property, table to file for those using stricter matching. For paths
16379 # whose directory is '#', the file is in the form of a numeric index into
16380 # \@inline_definitions
16381 \%utf8::stricter_to_file_of = (
16382 $stricter_to_file_of
16385 # Maps property, table to file for those using loose matching. For paths
16386 # whose directory is '#', the file is in the form of a numeric index into
16387 # \@inline_definitions
16388 \%utf8::loose_to_file_of = (
16392 # Maps floating point to fractional form
16393 \%utf8::nv_floating_to_rational = (
16394 $nv_floating_to_rational
16397 # If a floating point number doesn't have enough digits in it to get this
16398 # close to a fraction, it isn't considered to be that fraction even if all the
16399 # digits it does have match.
16400 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
16402 # Deprecated tables to generate a warning for. The key is the file containing
16403 # the table, so as to avoid duplication, as many property names can map to the
16404 # file, but we only need one entry for all of them.
16405 \%utf8::why_deprecated = (
16409 # A few properties have different behavior under /i matching. This maps
16410 # those to substitute files to use under /i.
16411 \%utf8::caseless_equivalent = (
16412 $caseless_equivalent_to
16415 # Property names to mapping files
16416 \%utf8::loose_property_to_file_of = (
16417 $loose_property_to_file_of
16420 # Files to the swash names within them.
16421 \%utf8::file_to_swash_name = (
16422 $file_to_swash_name
16428 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
16432 sub make_Name_pm () {
16433 # Create and write Name.pm, which contains subroutines and data to use in
16434 # conjunction with Name.pl
16436 # Maybe there's nothing to do.
16437 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
16441 $INTERNAL_ONLY_HEADER
16444 # Convert these structures to output format.
16445 my $code_points_ending_in_code_point =
16446 main::simple_dumper(\@code_points_ending_in_code_point,
16448 my $names = main::simple_dumper(\%names_ending_in_code_point,
16450 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
16453 # Do the same with the Hangul names,
16459 if ($has_hangul_syllables) {
16461 # Construct a regular expression of all the possible
16462 # combinations of the Hangul syllables.
16463 my @L_re; # Leading consonants
16464 for my $i ($LBase .. $LBase + $LCount - 1) {
16465 push @L_re, $Jamo{$i}
16467 my @V_re; # Middle vowels
16468 for my $i ($VBase .. $VBase + $VCount - 1) {
16469 push @V_re, $Jamo{$i}
16471 my @T_re; # Trailing consonants
16472 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
16473 push @T_re, $Jamo{$i}
16476 # The whole re is made up of the L V T combination.
16478 . join ('|', sort @L_re)
16480 . join ('|', sort @V_re)
16482 . join ('|', sort @T_re)
16485 # These hashes needed by the algorithm were generated
16486 # during reading of the Jamo.txt file
16487 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
16488 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
16489 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
16490 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
16497 # This module contains machine-generated tables and code for the
16498 # algorithmically-determinable Unicode character names. The following
16499 # routines can be used to translate between name and code point and vice versa
16503 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
16504 # two must be 10; if there are 5, the first must not be a 0. Written this
16505 # way to decrease backtracking. The first regex allows the code point to
16506 # be at the end of a word, but to work properly, the word shouldn't end
16507 # with a valid hex character. The second one won't match a code point at
16508 # the end of a word, and doesn't have the run-on issue
16509 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
16510 my \$code_point_re = qr/$code_point_re/;
16512 # In the following hash, the keys are the bases of names which include
16513 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
16514 # of each key is another hash which is used to get the low and high ends
16515 # for each range of code points that apply to the name.
16516 my %names_ending_in_code_point = (
16520 # The following hash is a copy of the previous one, except is for loose
16521 # matching, so each name has blanks and dashes squeezed out
16522 my %loose_names_ending_in_code_point = (
16526 # And the following array gives the inverse mapping from code points to
16527 # names. Lowest code points are first
16528 my \@code_points_ending_in_code_point = (
16529 $code_points_ending_in_code_point
16532 # Earlier releases didn't have Jamos. No sense outputting
16533 # them unless will be used.
16534 if ($has_hangul_syllables) {
16537 # Convert from code point to Jamo short name for use in composing Hangul
16543 # Leading consonant (can be null)
16553 # Optional trailing consonant
16558 # Computed re that splits up a Hangul name into LVT or LV syllables
16559 my \$syllable_re = qr/$jamo_re/;
16561 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
16562 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
16564 # These constants names and values were taken from the Unicode standard,
16565 # version 5.1, section 3.12. They are used in conjunction with Hangul
16567 my \$SBase = $SBase_string;
16568 my \$LBase = $LBase_string;
16569 my \$VBase = $VBase_string;
16570 my \$TBase = $TBase_string;
16571 my \$SCount = $SCount;
16572 my \$LCount = $LCount;
16573 my \$VCount = $VCount;
16574 my \$TCount = $TCount;
16575 my \$NCount = \$VCount * \$TCount;
16577 } # End of has Jamos
16579 push @name, << 'END';
16581 sub name_to_code_point_special {
16582 my ($name, $loose) = @_;
16584 # Returns undef if not one of the specially handled names; otherwise
16585 # returns the code point equivalent to the input name
16586 # $loose is non-zero if to use loose matching, 'name' in that case
16587 # must be input as upper case with all blanks and dashes squeezed out.
16589 if ($has_hangul_syllables) {
16590 push @name, << 'END';
16592 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
16593 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
16595 return if $name !~ qr/^$syllable_re$/;
16596 my $L = $Jamo_L{$1};
16597 my $V = $Jamo_V{$2};
16598 my $T = (defined $3) ? $Jamo_T{$3} : 0;
16599 return ($L * $VCount + $V) * $TCount + $T + $SBase;
16603 push @name, << 'END';
16605 # Name must end in 'code_point' for this to handle.
16606 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
16607 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
16610 my $code_point = CORE::hex $2;
16614 $names_ref = \%loose_names_ending_in_code_point;
16617 return if $base !~ s/-$//;
16618 $names_ref = \%names_ending_in_code_point;
16621 # Name must be one of the ones which has the code point in it.
16622 return if ! $names_ref->{$base};
16624 # Look through the list of ranges that apply to this name to see if
16625 # the code point is in one of them.
16626 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
16627 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
16628 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
16630 # Here, the code point is in the range.
16631 return $code_point;
16634 # Here, looked like the name had a code point number in it, but
16635 # did not match one of the valid ones.
16639 sub code_point_to_name_special {
16640 my $code_point = shift;
16642 # Returns the name of a code point if algorithmically determinable;
16645 if ($has_hangul_syllables) {
16646 push @name, << 'END';
16648 # If in the Hangul range, calculate the name based on Unicode's
16650 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
16652 my $SIndex = $code_point - $SBase;
16653 my $L = $LBase + $SIndex / $NCount;
16654 my $V = $VBase + ($SIndex % $NCount) / $TCount;
16655 my $T = $TBase + $SIndex % $TCount;
16656 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
16657 $name .= $Jamo{$T} if $T != $TBase;
16662 push @name, << 'END';
16664 # Look through list of these code points for one in range.
16665 foreach my $hash (@code_points_ending_in_code_point) {
16666 return if $code_point < $hash->{'low'};
16667 if ($code_point <= $hash->{'high'}) {
16668 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
16671 return; # None found
16678 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
16683 # Create and write UCD.pl, which passes info about the tables to
16686 # Create a mapping from each alias of Perl single-form extensions to all
16687 # its equivalent aliases, for quick look-up.
16688 my %perlprop_to_aliases;
16689 foreach my $table ($perl->tables) {
16691 # First create the list of the aliases of each extension
16692 my @aliases_list; # List of legal aliases for this extension
16694 my $table_name = $table->name;
16695 my $standard_table_name = standardize($table_name);
16696 my $table_full_name = $table->full_name;
16697 my $standard_table_full_name = standardize($table_full_name);
16699 # Make sure that the list has both the short and full names
16700 push @aliases_list, $table_name, $table_full_name;
16702 my $found_ucd = 0; # ? Did we actually get an alias that should be
16703 # output for this table
16705 # Go through all the aliases (including the two just added), and add
16706 # any new unique ones to the list
16707 foreach my $alias ($table->aliases) {
16709 # Skip non-legal names
16710 next unless $alias->ok_as_filename;
16711 next unless $alias->ucd;
16713 $found_ucd = 1; # have at least one legal name
16715 my $name = $alias->name;
16716 my $standard = standardize($name);
16718 # Don't repeat a name that is equivalent to one already on the
16720 next if $standard eq $standard_table_name;
16721 next if $standard eq $standard_table_full_name;
16723 push @aliases_list, $name;
16726 # If there were no legal names, don't output anything.
16727 next unless $found_ucd;
16729 # To conserve memory in the program reading these in, omit full names
16730 # that are identical to the short name, when those are the only two
16731 # aliases for the property.
16732 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16736 # Here, @aliases_list is the list of all the aliases that this
16737 # extension legally has. Now can create a map to it from each legal
16738 # standardized alias
16739 foreach my $alias ($table->aliases) {
16740 next unless $alias->ucd;
16741 next unless $alias->ok_as_filename;
16742 push @{$perlprop_to_aliases{standardize($alias->name)}},
16747 # Make a list of all combinations of properties/values that are suppressed.
16749 if (! $debug_skip) { # This tends to fail in this debug mode
16750 foreach my $property_name (keys %why_suppressed) {
16753 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16755 # The hash may contain properties not in this release of Unicode
16756 next unless defined (my $property = property_ref($property_name));
16758 # Find all combinations
16759 foreach my $prop_alias ($property->aliases) {
16760 my $prop_alias_name = standardize($prop_alias->name);
16762 # If no =value, there's just one combination possibe for this
16763 if (! $value_name) {
16765 # The property may be suppressed, but there may be a proxy
16766 # for it, so it shouldn't be listed as suppressed
16767 next if $prop_alias->ucd;
16768 push @suppressed, $prop_alias_name;
16771 foreach my $value_alias
16772 ($property->table($value_name)->aliases)
16774 next if $value_alias->ucd;
16776 push @suppressed, "$prop_alias_name="
16777 . standardize($value_alias->name);
16783 @suppressed = sort @suppressed; # So doesn't change between runs of this
16786 # Convert the structure below (designed for Name.pm) to a form that UCD
16787 # wants, so it doesn't have to modify it at all; i.e. so that it includes
16788 # an element for the Hangul syllables in the appropriate place, and
16789 # otherwise changes the name to include the "-<code point>" suffix.
16790 my @algorithm_names;
16791 my $done_hangul = 0;
16793 # Copy it linearly.
16794 for my $i (0 .. @code_points_ending_in_code_point - 1) {
16796 # Insert the hanguls in the correct place.
16798 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16801 push @algorithm_names, { low => $SBase,
16802 high => $SBase + $SCount - 1,
16803 name => '<hangul syllable>',
16807 # Copy the current entry, modified.
16808 push @algorithm_names, {
16809 low => $code_points_ending_in_code_point[$i]->{'low'},
16810 high => $code_points_ending_in_code_point[$i]->{'high'},
16812 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16816 # Serialize these structures for output.
16817 my $loose_to_standard_value
16818 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16819 chomp $loose_to_standard_value;
16821 my $string_property_loose_to_name
16822 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16823 chomp $string_property_loose_to_name;
16825 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16826 chomp $perlprop_to_aliases;
16828 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16829 chomp $prop_aliases;
16831 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16832 chomp $prop_value_aliases;
16834 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16837 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16838 chomp $algorithm_names;
16840 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16841 chomp $ambiguous_names;
16843 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16844 chomp $loose_defaults;
16848 $INTERNAL_ONLY_HEADER
16850 # This file is for the use of Unicode::UCD
16852 # Highest legal Unicode code point
16853 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16856 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16857 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16859 # Keys are all the possible "prop=value" combinations, in loose form; values
16860 # are the standard loose name for the 'value' part of the key
16861 \%Unicode::UCD::loose_to_standard_value = (
16862 $loose_to_standard_value
16865 # String property loose names to standard loose name
16866 \%Unicode::UCD::string_property_loose_to_name = (
16867 $string_property_loose_to_name
16870 # Keys are Perl extensions in loose form; values are each one's list of
16872 \%Unicode::UCD::loose_perlprop_to_name = (
16873 $perlprop_to_aliases
16876 # Keys are standard property name; values are each one's aliases
16877 \%Unicode::UCD::prop_aliases = (
16881 # Keys of top level are standard property name; values are keys to another
16882 # hash, Each one is one of the property's values, in standard form. The
16883 # values are that prop-val's aliases. If only one specified, the short and
16884 # long alias are identical.
16885 \%Unicode::UCD::prop_value_aliases = (
16886 $prop_value_aliases
16889 # Ordered (by code point ordinal) list of the ranges of code points whose
16890 # names are algorithmically determined. Each range entry is an anonymous hash
16891 # of the start and end points and a template for the names within it.
16892 \@Unicode::UCD::algorithmic_named_code_points = (
16896 # The properties that as-is have two meanings, and which must be disambiguated
16897 \%Unicode::UCD::ambiguous_names = (
16901 # Keys are the prop-val combinations which are the default values for the
16902 # given property, expressed in standard loose form
16903 \%Unicode::UCD::loose_defaults = (
16907 # All combinations of names that are suppressed.
16908 # This is actually for UCD.t, so it knows which properties shouldn't have
16909 # entries. If it got any bigger, would probably want to put it in its own
16910 # file to use memory only when it was needed, in testing.
16911 \@Unicode::UCD::suppressed_properties = (
16918 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
16922 sub write_all_tables() {
16923 # Write out all the tables generated by this program to files, as well as
16924 # the supporting data structures, pod file, and .t file.
16926 my @writables; # List of tables that actually get written
16927 my %match_tables_to_write; # Used to collapse identical match tables
16928 # into one file. Each key is a hash function
16929 # result to partition tables into buckets.
16930 # Each value is an array of the tables that
16931 # fit in the bucket.
16933 # For each property ...
16934 # (sort so that if there is an immutable file name, it has precedence, so
16935 # some other property can't come in and take over its file name. (We
16936 # don't care if both defined, as they had better be different anyway.)
16937 # The property named 'Perl' needs to be first (it doesn't have any
16938 # immutable file name) because empty properties are defined in terms of
16939 # its table named 'All' under the -annotate option.) We also sort by
16940 # the property's name. This is just for repeatability of the outputs
16941 # between runs of this program, but does not affect correctness.
16943 foreach my $property ($perl,
16944 sort { return -1 if defined $a->file;
16945 return 1 if defined $b->file;
16946 return $a->name cmp $b->name;
16947 } grep { $_ != $perl } property_ref('*'))
16949 my $type = $property->type;
16951 # And for each table for that property, starting with the mapping
16954 foreach my $table($property,
16956 # and all the match tables for it (if any), sorted so
16957 # the ones with the shortest associated file name come
16958 # first. The length sorting prevents problems of a
16959 # longer file taking a name that might have to be used
16960 # by a shorter one. The alphabetic sorting prevents
16961 # differences between releases
16962 sort { my $ext_a = $a->external_name;
16963 return 1 if ! defined $ext_a;
16964 my $ext_b = $b->external_name;
16965 return -1 if ! defined $ext_b;
16967 # But return the non-complement table before
16968 # the complement one, as the latter is defined
16969 # in terms of the former, and needs to have
16970 # the information for the former available.
16971 return 1 if $a->complement != 0;
16972 return -1 if $b->complement != 0;
16974 # Similarly, return a subservient table after
16976 return 1 if $a->leader != $a;
16977 return -1 if $b->leader != $b;
16979 my $cmp = length $ext_a <=> length $ext_b;
16981 # Return result if lengths not equal
16982 return $cmp if $cmp;
16984 # Alphabetic if lengths equal
16985 return $ext_a cmp $ext_b
16986 } $property->tables
16990 # Here we have a table associated with a property. It could be
16991 # the map table (done first for each property), or one of the
16992 # other tables. Determine which type.
16993 my $is_property = $table->isa('Property');
16995 my $name = $table->name;
16996 my $complete_name = $table->complete_name;
16998 # See if should suppress the table if is empty, but warn if it
16999 # contains something.
17000 my $suppress_if_empty_warn_if_not
17001 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
17003 # Calculate if this table should have any code points associated
17005 my $expected_empty =
17007 # $perl should be empty, as well as properties that we just
17008 # don't do anything with
17010 && ($table == $perl
17011 || grep { $complete_name eq $_ }
17012 @unimplemented_properties
17016 # Match tables in properties we skipped populating should be
17018 || (! $is_property && ! $property->to_create_match_tables)
17020 # Tables and properties that are expected to have no code
17021 # points should be empty
17022 || $suppress_if_empty_warn_if_not
17025 # Set a boolean if this table is the complement of an empty binary
17027 my $is_complement_of_empty_binary =
17028 $type == $BINARY &&
17029 (($table == $property->table('Y')
17030 && $property->table('N')->is_empty)
17031 || ($table == $property->table('N')
17032 && $property->table('Y')->is_empty));
17034 if ($table->is_empty) {
17036 if ($suppress_if_empty_warn_if_not) {
17037 $table->set_fate($SUPPRESSED,
17038 $suppress_if_empty_warn_if_not);
17041 # Suppress (by skipping them) expected empty tables.
17042 next TABLE if $expected_empty;
17044 # And setup to later output a warning for those that aren't
17045 # known to be allowed to be empty. Don't do the warning if
17046 # this table is a child of another one to avoid duplicating
17047 # the warning that should come from the parent one.
17048 if (($table == $property || $table->parent == $table)
17049 && $table->fate != $SUPPRESSED
17050 && $table->fate != $MAP_PROXIED
17051 && ! grep { $complete_name =~ /^$_$/ }
17052 @tables_that_may_be_empty)
17054 push @unhandled_properties, "$table";
17057 # The old way of expressing an empty match list was to
17058 # complement the list that matches everything. The new way is
17059 # to create an empty inversion list, but this doesn't work for
17060 # annotating, so use the old way then.
17061 $table->set_complement($All) if $annotate
17062 && $table != $property;
17064 elsif ($expected_empty) {
17066 if ($suppress_if_empty_warn_if_not) {
17067 $because = " because $suppress_if_empty_warn_if_not";
17070 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
17073 # Some tables should match everything
17074 my $expected_full =
17075 ($table->fate == $SUPPRESSED)
17078 ? # All these types of map tables will be full because
17079 # they will have been populated with defaults
17080 ($type == $ENUM || $type == $FORCED_BINARY)
17082 : # A match table should match everything if its method
17084 ($table->matches_all
17086 # The complement of an empty binary table will match
17088 || $is_complement_of_empty_binary
17092 my $count = $table->count;
17093 if ($expected_full) {
17094 if ($count != $MAX_WORKING_CODEPOINTS) {
17095 Carp::my_carp("$table matches only "
17096 . clarify_number($count)
17097 . " Unicode code points but should match "
17098 . clarify_number($MAX_WORKING_CODEPOINTS)
17100 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
17101 . "). Proceeding anyway.");
17104 # Here is expected to be full. If it is because it is the
17105 # complement of an (empty) binary table that is to be
17106 # suppressed, then suppress this one as well.
17107 if ($is_complement_of_empty_binary) {
17108 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
17109 my $opposing = $property->table($opposing_name);
17110 my $opposing_status = $opposing->status;
17111 if ($opposing_status) {
17112 $table->set_status($opposing_status,
17113 $opposing->status_info);
17117 elsif ($count == $MAX_UNICODE_CODEPOINTS
17119 && ($table == $property || $table->leader == $table)
17120 && $table->property->status ne $NORMAL)
17122 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
17125 if ($table->fate >= $SUPPRESSED) {
17126 if (! $is_property) {
17127 my @children = $table->children;
17128 foreach my $child (@children) {
17129 if ($child->fate < $SUPPRESSED) {
17130 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
17138 if (! $is_property) {
17140 make_ucd_table_pod_entries($table) if $table->property == $perl;
17142 # Several things need to be done just once for each related
17143 # group of match tables. Do them on the parent.
17144 if ($table->parent == $table) {
17146 # Add an entry in the pod file for the table; it also does
17148 make_re_pod_entries($table) if defined $pod_directory;
17150 # See if the the table matches identical code points with
17151 # something that has already been output. In that case,
17152 # no need to have two files with the same code points in
17153 # them. We use the table's hash() method to store these
17154 # in buckets, so that it is quite likely that if two
17155 # tables are in the same bucket they will be identical, so
17156 # don't have to compare tables frequently. The tables
17157 # have to have the same status to share a file, so add
17158 # this to the bucket hash. (The reason for this latter is
17159 # that Heavy.pl associates a status with a file.)
17160 # We don't check tables that are inverses of others, as it
17161 # would lead to some coding complications, and checking
17162 # all the regular ones should find everything.
17163 if ($table->complement == 0) {
17164 my $hash = $table->hash . ';' . $table->status;
17166 # Look at each table that is in the same bucket as
17167 # this one would be.
17168 foreach my $comparison
17169 (@{$match_tables_to_write{$hash}})
17171 if ($table->matches_identically_to($comparison)) {
17172 $table->set_equivalent_to($comparison,
17178 # Here, not equivalent, add this table to the bucket.
17179 push @{$match_tables_to_write{$hash}}, $table;
17185 # Here is the property itself.
17186 # Don't write out or make references to the $perl property
17187 next if $table == $perl;
17189 make_ucd_table_pod_entries($table);
17191 # There is a mapping stored of the various synonyms to the
17192 # standardized name of the property for utf8_heavy.pl.
17193 # Also, the pod file contains entries of the form:
17194 # \p{alias: *} \p{full: *}
17195 # rather than show every possible combination of things.
17197 my @property_aliases = $property->aliases;
17199 my $full_property_name = $property->full_name;
17200 my $property_name = $property->name;
17201 my $standard_property_name = standardize($property_name);
17202 my $standard_property_full_name
17203 = standardize($full_property_name);
17205 # We also create for Unicode::UCD a list of aliases for
17206 # the property. The list starts with the property name;
17207 # then its full name. Legacy properties are not listed in
17211 if ( $property->fate <= $MAP_PROXIED) {
17212 @property_list = ($property_name, $full_property_name);
17213 @standard_list = ($standard_property_name,
17214 $standard_property_full_name);
17217 # For each synonym ...
17218 for my $i (0 .. @property_aliases - 1) {
17219 my $alias = $property_aliases[$i];
17220 my $alias_name = $alias->name;
17221 my $alias_standard = standardize($alias_name);
17224 # Add other aliases to the list of property aliases
17225 if ($property->fate <= $MAP_PROXIED
17226 && ! grep { $alias_standard eq $_ } @standard_list)
17228 push @property_list, $alias_name;
17229 push @standard_list, $alias_standard;
17232 # For utf8_heavy, set the mapping of the alias to the
17234 if ($type == $STRING) {
17235 if ($property->fate <= $MAP_PROXIED) {
17236 $string_property_loose_to_name{$alias_standard}
17237 = $standard_property_name;
17241 if (exists ($loose_property_name_of{$alias_standard}))
17243 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
17246 $loose_property_name_of{$alias_standard}
17247 = $standard_property_name;
17250 # Now for the re pod entry for this alias. Skip if not
17251 # outputting a pod; skip the first one, which is the
17252 # full name so won't have an entry like: '\p{full: *}
17253 # \p{full: *}', and skip if don't want an entry for
17256 || ! defined $pod_directory
17257 || ! $alias->make_re_pod_entry;
17259 my $rhs = "\\p{$full_property_name: *}";
17260 if ($property != $perl && $table->perl_extension) {
17261 $rhs .= ' (Perl extension)';
17263 push @match_properties,
17264 format_pod_line($indent_info_column,
17265 '\p{' . $alias->name . ': *}',
17271 # The list of all possible names is attached to each alias, so
17273 if (@property_list) {
17274 push @{$prop_aliases{$standard_list[0]}}, @property_list;
17277 if ($property->fate <= $MAP_PROXIED) {
17279 # Similarly, we create for Unicode::UCD a list of
17280 # property-value aliases.
17282 my $property_full_name = $property->full_name;
17284 # Look at each table in the property...
17285 foreach my $table ($property->tables) {
17287 my $table_full_name = $table->full_name;
17288 my $standard_table_full_name
17289 = standardize($table_full_name);
17290 my $table_name = $table->name;
17291 my $standard_table_name = standardize($table_name);
17293 # The list starts with the table name and its full
17295 push @values_list, $table_name, $table_full_name;
17297 # We add to the table each unique alias that isn't
17298 # discouraged from use.
17299 foreach my $alias ($table->aliases) {
17300 next if $alias->status
17301 && $alias->status eq $DISCOURAGED;
17302 my $name = $alias->name;
17303 my $standard = standardize($name);
17304 next if $standard eq $standard_table_name;
17305 next if $standard eq $standard_table_full_name;
17306 push @values_list, $name;
17309 # Here @values_list is a list of all the aliases for
17310 # the table. That is, all the property-values given
17311 # by this table. By agreement with Unicode::UCD,
17312 # if the name and full name are identical, and there
17313 # are no other names, drop the duplcate entry to save
17315 if (@values_list == 2
17316 && $values_list[0] eq $values_list[1])
17321 # To save memory, unlike the similar list for property
17322 # aliases above, only the standard forms hve the list.
17323 # This forces an extra step of converting from input
17324 # name to standard name, but the savings are
17325 # considerable. (There is only marginal savings if we
17326 # did this with the property aliases.)
17327 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
17331 # Don't write out a mapping file if not desired.
17332 next if ! $property->to_output_map;
17335 # Here, we know we want to write out the table, but don't do it
17336 # yet because there may be other tables that come along and will
17337 # want to share the file, and the file's comments will change to
17338 # mention them. So save for later.
17339 push @writables, $table;
17341 } # End of looping through the property and all its tables.
17342 } # End of looping through all properties.
17344 # Now have all the tables that will have files written for them. Do it.
17345 foreach my $table (@writables) {
17348 my $property = $table->property;
17349 my $is_property = ($table == $property);
17351 # For very short tables, instead of writing them out to actual files,
17352 # we in-line their inversion list definitions into Heavy.pl. The
17353 # definition replaces the file name, and the special pseudo-directory
17354 # '#' is used to signal this. This significantly cuts down the number
17355 # of files written at little extra cost to the hashes in Heavy.pl.
17356 # And it means, no run-time files to read to get the definitions.
17358 && ! $annotate # For annotation, we want to explicitly show
17359 # everything, so keep in files
17360 && $table->ranges <= 3)
17362 my @ranges = $table->ranges;
17363 my $count = @ranges;
17364 if ($count == 0) { # 0th index reserved for 0-length lists
17367 elsif ($table->leader != $table) {
17369 # Here, is a table that is equivalent to another; code
17370 # in register_file_for_name() causes its leader's definition
17375 else { # No equivalent table so far.
17377 # Build up its definition range-by-range.
17378 my $definition = "";
17379 while (defined (my $range = shift @ranges)) {
17380 my $end = $range->end;
17381 if ($end < $MAX_WORKING_CODEPOINT) {
17383 $end = "\n" . ($end + 1);
17385 else { # Extends to infinity, hence no 'end'
17388 $definition .= "\n" . $range->start . $end;
17390 $definition = "V$count" . $definition;
17391 $filename = @inline_definitions;
17392 push @inline_definitions, $definition;
17395 register_file_for_name($table, \@directory, $filename);
17399 if (! $is_property) {
17400 # Match tables for the property go in lib/$subdirectory, which is
17401 # the property's name. Don't use the standard file name for this,
17402 # as may get an unfamiliar alias
17403 @directory = ($matches_directory, $property->external_name);
17407 @directory = $table->directory;
17408 $filename = $table->file;
17411 # Use specified filename if available, or default to property's
17412 # shortest name. We need an 8.3 safe filename (which means "an 8
17413 # safe" filename, since after the dot is only 'pl', which is < 3)
17414 # The 2nd parameter is if the filename shouldn't be changed, and
17415 # it shouldn't iff there is a hard-coded name for this table.
17416 $filename = construct_filename(
17417 $filename || $table->external_name,
17418 ! $filename, # mutable if no filename
17421 register_file_for_name($table, \@directory, $filename);
17423 # Only need to write one file when shared by more than one
17425 next if ! $is_property
17426 && ($table->leader != $table || $table->complement != 0);
17428 # Construct a nice comment to add to the file
17429 $table->set_final_comment;
17435 # Write out the pod file
17438 # And Heavy.pl, Name.pm, UCD.pl
17443 make_property_test_script() if $make_test_script;
17444 make_normalization_test_script() if $make_norm_test_script;
17448 my @white_space_separators = ( # This used only for making the test script.
17455 sub generate_separator($) {
17456 # This used only for making the test script. It generates the colon or
17457 # equal separator between the property and property value, with random
17458 # white space surrounding the separator
17462 return "" if $lhs eq ""; # No separator if there's only one (the r) side
17464 # Choose space before and after randomly
17465 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
17466 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
17468 # And return the whole complex, half the time using a colon, half the
17470 return $spaces_before
17471 . (rand() < 0.5) ? '=' : ':'
17475 sub generate_tests($$$$$) {
17476 # This used only for making the test script. It generates test cases that
17477 # are expected to compile successfully in perl. Note that the lhs and
17478 # rhs are assumed to already be as randomized as the caller wants.
17480 my $lhs = shift; # The property: what's to the left of the colon
17481 # or equals separator
17482 my $rhs = shift; # The property value; what's to the right
17483 my $valid_code = shift; # A code point that's known to be in the
17484 # table given by lhs=rhs; undef if table is
17486 my $invalid_code = shift; # A code point known to not be in the table;
17487 # undef if the table is all code points
17488 my $warning = shift;
17490 # Get the colon or equal
17491 my $separator = generate_separator($lhs);
17493 # The whole 'property=value'
17494 my $name = "$lhs$separator$rhs";
17497 # Create a complete set of tests, with complements.
17498 if (defined $valid_code) {
17499 push @output, <<"EOC"
17500 Expect(1, $valid_code, '\\p{$name}', $warning);
17501 Expect(0, $valid_code, '\\p{^$name}', $warning);
17502 Expect(0, $valid_code, '\\P{$name}', $warning);
17503 Expect(1, $valid_code, '\\P{^$name}', $warning);
17506 if (defined $invalid_code) {
17507 push @output, <<"EOC"
17508 Expect(0, $invalid_code, '\\p{$name}', $warning);
17509 Expect(1, $invalid_code, '\\p{^$name}', $warning);
17510 Expect(1, $invalid_code, '\\P{$name}', $warning);
17511 Expect(0, $invalid_code, '\\P{^$name}', $warning);
17517 sub generate_error($$$) {
17518 # This used only for making the test script. It generates test cases that
17519 # are expected to not only not match, but to be syntax or similar errors
17521 my $lhs = shift; # The property: what's to the left of the
17522 # colon or equals separator
17523 my $rhs = shift; # The property value; what's to the right
17524 my $already_in_error = shift; # Boolean; if true it's known that the
17525 # unmodified lhs and rhs will cause an error.
17526 # This routine should not force another one
17527 # Get the colon or equal
17528 my $separator = generate_separator($lhs);
17530 # Since this is an error only, don't bother to randomly decide whether to
17531 # put the error on the left or right side; and assume that the rhs is
17532 # loosely matched, again for convenience rather than rigor.
17533 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
17535 my $property = $lhs . $separator . $rhs;
17538 Error('\\p{$property}');
17539 Error('\\P{$property}');
17543 # These are used only for making the test script
17544 # XXX Maybe should also have a bad strict seps, which includes underscore.
17546 my @good_loose_seps = (
17553 my @bad_loose_seps = (
17558 sub randomize_stricter_name {
17559 # This used only for making the test script. Take the input name and
17560 # return a randomized, but valid version of it under the stricter matching
17564 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17566 # If the name looks like a number (integer, floating, or rational), do
17568 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
17571 my $separator = $3;
17573 # If there isn't a sign, part of the time add a plus
17574 # Note: Not testing having any denominator having a minus sign
17576 $sign = '+' if rand() <= .3;
17579 # And add 0 or more leading zeros.
17580 $name = $sign . ('0' x int rand(10)) . $number;
17582 if (defined $separator) {
17583 my $extra_zeros = '0' x int rand(10);
17585 if ($separator eq '.') {
17587 # Similarly, add 0 or more trailing zeros after a decimal
17589 $name .= $extra_zeros;
17593 # Or, leading zeros before the denominator
17594 $name =~ s,/,/$extra_zeros,;
17599 # For legibility of the test, only change the case of whole sections at a
17600 # time. To do this, first split into sections. The split returns the
17603 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
17604 trace $section if main::DEBUG && $to_trace;
17606 if (length $section > 1 && $section !~ /\D/) {
17608 # If the section is a sequence of digits, about half the time
17609 # randomly add underscores between some of them.
17612 # Figure out how many underscores to add. max is 1 less than
17613 # the number of digits. (But add 1 at the end to make sure
17614 # result isn't 0, and compensate earlier by subtracting 2
17616 my $num_underscores = int rand(length($section) - 2) + 1;
17618 # And add them evenly throughout, for convenience, not rigor
17620 my $spacing = (length($section) - 1)/ $num_underscores;
17621 my $temp = $section;
17623 for my $i (1 .. $num_underscores) {
17624 $section .= substr($temp, 0, $spacing, "") . '_';
17628 push @sections, $section;
17632 # Here not a sequence of digits. Change the case of the section
17634 my $switch = int rand(4);
17635 if ($switch == 0) {
17636 push @sections, uc $section;
17638 elsif ($switch == 1) {
17639 push @sections, lc $section;
17641 elsif ($switch == 2) {
17642 push @sections, ucfirst $section;
17645 push @sections, $section;
17649 trace "returning", join "", @sections if main::DEBUG && $to_trace;
17650 return join "", @sections;
17653 sub randomize_loose_name($;$) {
17654 # This used only for making the test script
17657 my $want_error = shift; # if true, make an error
17658 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
17660 $name = randomize_stricter_name($name);
17663 push @parts, $good_loose_seps[rand(@good_loose_seps)];
17665 # Preserve trailing ones for the sake of not stripping the underscore from
17667 for my $part (split /[-\s_]+ (?= . )/, $name) {
17669 if ($want_error and rand() < 0.3) {
17670 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
17674 push @parts, $good_loose_seps[rand(@good_loose_seps)];
17677 push @parts, $part;
17679 my $new = join("", @parts);
17680 trace "$name => $new" if main::DEBUG && $to_trace;
17683 if (rand() >= 0.5) {
17684 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
17687 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
17693 # Used to make sure don't generate duplicate test cases.
17694 my %test_generated;
17696 sub make_property_test_script() {
17697 # This used only for making the test script
17698 # this written directly -- it's huge.
17700 print "Making test script\n" if $verbosity >= $PROGRESS;
17702 # This uses randomness to test different possibilities without testing all
17703 # possibilities. To ensure repeatability, set the seed to 0. But if
17704 # tests are added, it will perturb all later ones in the .t file
17707 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
17709 # Keep going down an order of magnitude
17710 # until find that adding this quantity to
17711 # 1 remains 1; but put an upper limit on
17712 # this so in case this algorithm doesn't
17713 # work properly on some platform, that we
17714 # won't loop forever.
17716 my $min_floating_slop = 1;
17717 while (1+ $min_floating_slop != 1
17720 my $next = $min_floating_slop / 10;
17721 last if $next == 0; # If underflows,
17723 $min_floating_slop = $next;
17726 # It doesn't matter whether the elements of this array contain single lines
17727 # or multiple lines. main::write doesn't count the lines.
17730 # Sort these so get results in same order on different runs of this
17732 foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
17733 foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
17735 # Find code points that match, and don't match this table.
17736 my $valid = $table->get_valid_code_point;
17737 my $invalid = $table->get_invalid_code_point;
17738 my $warning = ($table->status eq $DEPRECATED)
17742 # Test each possible combination of the property's aliases with
17743 # the table's. If this gets to be too many, could do what is done
17744 # in the set_final_comment() for Tables
17745 my @table_aliases = $table->aliases;
17746 my @property_aliases = $table->property->aliases;
17748 # Every property can be optionally be prefixed by 'Is_', so test
17749 # that those work, by creating such a new alias for each
17750 # pre-existing one.
17751 push @property_aliases, map { Alias->new("Is_" . $_->name,
17753 $_->make_re_pod_entry,
17754 $_->ok_as_filename,
17758 } @property_aliases;
17759 my $max = max(scalar @table_aliases, scalar @property_aliases);
17760 for my $j (0 .. $max - 1) {
17762 # The current alias for property is the next one on the list,
17763 # or if beyond the end, start over. Similarly for table
17765 = $property_aliases[$j % @property_aliases]->name;
17767 $property_name = "" if $table->property == $perl;
17768 my $table_alias = $table_aliases[$j % @table_aliases];
17769 my $table_name = $table_alias->name;
17770 my $loose_match = $table_alias->loose_match;
17772 # If the table doesn't have a file, any test for it is
17773 # already guaranteed to be in error
17774 my $already_error = ! $table->file_path;
17776 # Generate error cases for this alias.
17777 push @output, generate_error($property_name,
17781 # If the table is guaranteed to always generate an error,
17782 # quit now without generating success cases.
17783 next if $already_error;
17785 # Now for the success cases.
17787 if ($loose_match) {
17789 # For loose matching, create an extra test case for the
17791 my $standard = standardize($table_name);
17793 # $test_name should be a unique combination for each test
17794 # case; used just to avoid duplicate tests
17795 my $test_name = "$property_name=$standard";
17797 # Don't output duplicate test cases.
17798 if (! exists $test_generated{$test_name}) {
17799 $test_generated{$test_name} = 1;
17800 push @output, generate_tests($property_name,
17807 $random = randomize_loose_name($table_name)
17809 else { # Stricter match
17810 $random = randomize_stricter_name($table_name);
17813 # Now for the main test case for this alias.
17814 my $test_name = "$property_name=$random";
17815 if (! exists $test_generated{$test_name}) {
17816 $test_generated{$test_name} = 1;
17817 push @output, generate_tests($property_name,
17824 # If the name is a rational number, add tests for the
17825 # floating point equivalent.
17826 if ($table_name =~ qr{/}) {
17828 # Calculate the float, and find just the fraction.
17829 my $float = eval $table_name;
17830 my ($whole, $fraction)
17831 = $float =~ / (.*) \. (.*) /x;
17833 # Starting with one digit after the decimal point,
17834 # create a test for each possible precision (number of
17835 # digits past the decimal point) until well beyond the
17836 # native number found on this machine. (If we started
17837 # with 0 digits, it would be an integer, which could
17838 # well match an unrelated table)
17840 for my $i (1 .. $min_floating_slop + 3) {
17841 my $table_name = sprintf("%.*f", $i, $float);
17842 if ($i < $MIN_FRACTION_LENGTH) {
17844 # If the test case has fewer digits than the
17845 # minimum acceptable precision, it shouldn't
17846 # succeed, so we expect an error for it.
17847 # E.g., 2/3 = .7 at one decimal point, and we
17848 # shouldn't say it matches .7. We should make
17849 # it be .667 at least before agreeing that the
17850 # intent was to match 2/3. But at the
17851 # less-than- acceptable level of precision, it
17852 # might actually match an unrelated number.
17853 # So don't generate a test case if this
17854 # conflating is possible. In our example, we
17855 # don't want 2/3 matching 7/10, if there is
17856 # a 7/10 code point.
17858 (keys %nv_floating_to_rational)
17861 if abs($table_name - $existing)
17862 < $MAX_FLOATING_SLOP;
17864 push @output, generate_error($property_name,
17866 1 # 1 => already an error
17871 # Here the number of digits exceeds the
17872 # minimum we think is needed. So generate a
17873 # success test case for it.
17874 push @output, generate_tests($property_name,
17893 (map {"Test_X('$_');\n"} @backslash_X_tests),
17899 sub make_normalization_test_script() {
17900 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17902 my $n_path = 'TestNorm.pl';
17904 unshift @normalization_tests, <<'END';
17908 sub ord_string { # Convert packed ords to printable string
17910 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17911 unpack "U*", shift) . "'";
17912 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
17916 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17917 my $display_source = ord_string($source);
17918 my $display_nfc = ord_string($nfc);
17919 my $display_nfd = ord_string($nfd);
17920 my $display_nfkc = ord_string($nfkc);
17921 my $display_nfkd = ord_string($nfkd);
17923 use Unicode::Normalize;
17925 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
17926 # nfkc == toNFC(nfkc) == toNFC(nfkd)
17929 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
17930 # nfkd == toNFD(nfkc) == toNFD(nfkd)
17933 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17934 # toNFKC(nfkc) == toNFKC(nfkd)
17937 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17938 # toNFKD(nfkc) == toNFKD(nfkd)
17940 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17941 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17942 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17943 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17944 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17946 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17947 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17948 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17949 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17950 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17952 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17953 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17954 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17955 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17956 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17958 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17959 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17960 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17961 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17962 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17969 @normalization_tests,
17975 # This is a list of the input files and how to handle them. The files are
17976 # processed in their order in this list. Some reordering is possible if
17977 # desired, but the v0 files should be first, and the extracted before the
17978 # others except DAge.txt (as data in an extracted file can be over-ridden by
17979 # the non-extracted. Some other files depend on data derived from an earlier
17980 # file, like UnicodeData requires data from Jamo, and the case changing and
17981 # folding requires data from Unicode. Mostly, it is safest to order by first
17982 # version releases in (except the Jamo). DAge.txt is read before the
17983 # extracted ones because of the rarely used feature $compare_versions. In the
17984 # unlikely event that there were ever an extracted file that contained the Age
17985 # property information, it would have to go in front of DAge.
17987 # The version strings allow the program to know whether to expect a file or
17988 # not, but if a file exists in the directory, it will be processed, even if it
17989 # is in a version earlier than expected, so you can copy files from a later
17990 # release into an earlier release's directory.
17991 my @input_file_objects = (
17992 Input_file->new('PropertyAliases.txt', v0,
17993 Handler => \&process_PropertyAliases,
17995 Input_file->new(undef, v0, # No file associated with this
17996 Progress_Message => 'Finishing property setup',
17997 Handler => \&finish_property_setup,
17999 Input_file->new('PropValueAliases.txt', v0,
18000 Handler => \&process_PropValueAliases,
18001 Has_Missings_Defaults => $NOT_IGNORED,
18003 Input_file->new('DAge.txt', v3.2.0,
18004 Has_Missings_Defaults => $NOT_IGNORED,
18007 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
18008 Property => 'General_Category',
18010 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
18011 Property => 'Canonical_Combining_Class',
18012 Has_Missings_Defaults => $NOT_IGNORED,
18014 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
18015 Property => 'Numeric_Type',
18016 Has_Missings_Defaults => $NOT_IGNORED,
18018 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
18019 Property => 'East_Asian_Width',
18020 Has_Missings_Defaults => $NOT_IGNORED,
18022 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
18023 Property => 'Line_Break',
18024 Has_Missings_Defaults => $NOT_IGNORED,
18026 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
18027 Property => 'Bidi_Class',
18028 Has_Missings_Defaults => $NOT_IGNORED,
18030 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
18031 Property => 'Decomposition_Type',
18032 Has_Missings_Defaults => $NOT_IGNORED,
18034 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
18035 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
18036 Property => 'Numeric_Value',
18037 Each_Line_Handler => \&filter_numeric_value_line,
18038 Has_Missings_Defaults => $NOT_IGNORED,
18040 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
18041 Property => 'Joining_Group',
18042 Has_Missings_Defaults => $NOT_IGNORED,
18045 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
18046 Property => 'Joining_Type',
18047 Has_Missings_Defaults => $NOT_IGNORED,
18049 Input_file->new('Jamo.txt', v2.0.0,
18050 Property => 'Jamo_Short_Name',
18051 Each_Line_Handler => \&filter_jamo_line,
18053 Input_file->new('UnicodeData.txt', v1.1.5,
18054 Pre_Handler => \&setup_UnicodeData,
18056 # We clean up this file for some early versions.
18057 Each_Line_Handler => [ (($v_version lt v2.0.0 )
18059 : ($v_version eq v2.1.5)
18060 ? \&filter_v2_1_5_ucd
18062 # And for 5.14 Perls with 6.0,
18063 # have to also make changes
18064 : ($v_version ge v6.0.0
18069 # Early versions did not have the
18070 # proper Unicode_1 names for the
18072 (($v_version lt v3.0.0)
18073 ? \&filter_early_U1_names
18076 # Early versions did not correctly
18077 # use the later method for giving
18078 # decimal digit values
18079 (($v_version le v3.2.0)
18080 ? \&filter_bad_Nd_ucd
18083 # And the main filter
18084 \&filter_UnicodeData_line,
18086 EOF_Handler => \&EOF_UnicodeData,
18088 Input_file->new('ArabicShaping.txt', v2.0.0,
18089 Each_Line_Handler =>
18090 ($v_version lt 4.1.0)
18091 ? \&filter_old_style_arabic_shaping
18093 # The first field after the range is a "schematic name"
18095 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
18096 Has_Missings_Defaults => $NOT_IGNORED,
18098 Input_file->new('Blocks.txt', v2.0.0,
18099 Property => 'Block',
18100 Has_Missings_Defaults => $NOT_IGNORED,
18101 Each_Line_Handler => \&filter_blocks_lines
18103 Input_file->new('PropList.txt', v2.0.0,
18104 Each_Line_Handler => (($v_version lt v3.1.0)
18105 ? \&filter_old_style_proplist
18108 Input_file->new('Unihan.txt', v2.0.0,
18109 Pre_Handler => \&setup_unihan,
18111 Each_Line_Handler => \&filter_unihan_line,
18113 Input_file->new('SpecialCasing.txt', v2.1.8,
18114 Each_Line_Handler => ($v_version eq 2.1.8)
18115 ? \&filter_2_1_8_special_casing_line
18116 : \&filter_special_casing_line,
18117 Pre_Handler => \&setup_special_casing,
18118 Has_Missings_Defaults => $IGNORED,
18121 'LineBreak.txt', v3.0.0,
18122 Has_Missings_Defaults => $NOT_IGNORED,
18123 Property => 'Line_Break',
18124 # Early versions had problematic syntax
18125 Each_Line_Handler => (($v_version lt v3.1.0)
18126 ? \&filter_early_ea_lb
18129 Input_file->new('EastAsianWidth.txt', v3.0.0,
18130 Property => 'East_Asian_Width',
18131 Has_Missings_Defaults => $NOT_IGNORED,
18132 # Early versions had problematic syntax
18133 Each_Line_Handler => (($v_version lt v3.1.0)
18134 ? \&filter_early_ea_lb
18137 Input_file->new('CompositionExclusions.txt', v3.0.0,
18138 Property => 'Composition_Exclusion',
18140 Input_file->new('BidiMirroring.txt', v3.0.1,
18141 Property => 'Bidi_Mirroring_Glyph',
18142 Has_Missings_Defaults => ($v_version lt v6.2.0)
18144 # Is <none> which doesn't mean
18145 # anything to us, we will use the
18150 Input_file->new("NormTest.txt", v3.0.0,
18151 Handler => \&process_NormalizationsTest,
18152 Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
18154 Input_file->new('CaseFolding.txt', v3.0.1,
18155 Pre_Handler => \&setup_case_folding,
18156 Each_Line_Handler =>
18157 [ ($v_version lt v3.1.0)
18158 ? \&filter_old_style_case_folding
18160 \&filter_case_folding_line
18162 Has_Missings_Defaults => $IGNORED,
18164 Input_file->new('DCoreProperties.txt', v3.1.0,
18165 # 5.2 changed this file
18166 Has_Missings_Defaults => (($v_version ge v5.2.0)
18170 Input_file->new('Scripts.txt', v3.1.0,
18171 Property => 'Script',
18172 Has_Missings_Defaults => $NOT_IGNORED,
18174 Input_file->new('DNormalizationProps.txt', v3.1.0,
18175 Has_Missings_Defaults => $NOT_IGNORED,
18176 Each_Line_Handler => (($v_version lt v4.0.1)
18177 ? \&filter_old_style_normalization_lines
18180 Input_file->new('HangulSyllableType.txt', v0,
18181 Has_Missings_Defaults => $NOT_IGNORED,
18182 Property => 'Hangul_Syllable_Type',
18183 Pre_Handler => ($v_version lt v4.0.0)
18187 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
18188 Property => 'Word_Break',
18189 Has_Missings_Defaults => $NOT_IGNORED,
18191 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
18192 Property => 'Grapheme_Cluster_Break',
18193 Has_Missings_Defaults => $NOT_IGNORED,
18194 Pre_Handler => ($v_version lt v4.1.0)
18198 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
18199 Handler => \&process_GCB_test,
18201 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
18202 Skip => 'Validation Tests',
18204 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
18205 Skip => 'Validation Tests',
18207 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
18208 Skip => 'Validation Tests',
18210 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
18211 Property => 'Sentence_Break',
18212 Has_Missings_Defaults => $NOT_IGNORED,
18214 Input_file->new('NamedSequences.txt', v4.1.0,
18215 Handler => \&process_NamedSequences
18217 Input_file->new('NameAliases.txt', v0,
18218 Property => 'Name_Alias',
18219 Pre_Handler => ($v_version le v6.0.0)
18220 ? \&setup_early_name_alias
18222 Each_Line_Handler => ($v_version le v6.0.0)
18223 ? \&filter_early_version_name_alias_line
18224 : \&filter_later_version_name_alias_line,
18226 Input_file->new("BidiTest.txt", v5.2.0,
18227 Skip => 'Validation Tests',
18229 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
18231 Each_Line_Handler => \&filter_unihan_line,
18233 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
18235 Each_Line_Handler => \&filter_unihan_line,
18237 Input_file->new('UnihanIRGSources.txt', v5.2.0,
18239 Pre_Handler => \&setup_unihan,
18240 Each_Line_Handler => \&filter_unihan_line,
18242 Input_file->new('UnihanNumericValues.txt', v5.2.0,
18244 Each_Line_Handler => \&filter_unihan_line,
18246 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
18248 Each_Line_Handler => \&filter_unihan_line,
18250 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
18252 Each_Line_Handler => \&filter_unihan_line,
18254 Input_file->new('UnihanReadings.txt', v5.2.0,
18256 Each_Line_Handler => \&filter_unihan_line,
18258 Input_file->new('UnihanVariants.txt', v5.2.0,
18260 Each_Line_Handler => \&filter_unihan_line,
18262 Input_file->new('ScriptExtensions.txt', v6.0.0,
18263 Property => 'Script_Extensions',
18264 Pre_Handler => \&setup_script_extensions,
18265 Each_Line_Handler => \&filter_script_extensions_line,
18266 Has_Missings_Defaults => (($v_version le v6.0.0)
18270 # The two Indic files are actually available starting in v6.0.0, but their
18271 # property values are missing from PropValueAliases.txt in that release,
18272 # so that further work would have to be done to get them to work properly
18273 # for that release.
18274 Input_file->new('IndicMatraCategory.txt', v6.1.0,
18275 Property => 'Indic_Matra_Category',
18276 Has_Missings_Defaults => $NOT_IGNORED,
18277 Skip => "Provisional; for the analysis and processing of Indic scripts",
18279 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
18280 Property => 'Indic_Syllabic_Category',
18281 Has_Missings_Defaults => $NOT_IGNORED,
18282 Skip => "Provisional; for the analysis and processing of Indic scripts",
18284 Input_file->new('BidiBrackets.txt', v6.3.0,
18285 Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
18286 Has_Missings_Defaults => $NO_DEFAULTS,
18288 Input_file->new("BidiCharacterTest.txt", v6.3.0,
18289 Skip => 'Validation Tests',
18293 # End of all the preliminaries.
18296 if ($compare_versions) {
18297 Carp::my_carp(<<END
18298 Warning. \$compare_versions is set. Output is not suitable for production
18303 # Put into %potential_files a list of all the files in the directory structure
18304 # that could be inputs to this program, excluding those that we should ignore.
18305 # Use absolute file names because it makes it easier across machine types.
18306 my @ignored_files_full_names = map { File::Spec->rel2abs(
18307 internal_file_to_platform($_))
18308 } keys %ignored_files;
18311 return unless /\.txt$/i; # Some platforms change the name's case
18312 my $full = lc(File::Spec->rel2abs($_));
18313 $potential_files{$full} = 1
18314 if ! grep { $full eq lc($_) } @ignored_files_full_names;
18317 }, File::Spec->curdir());
18319 my @mktables_list_output_files;
18320 my $old_start_time = 0;
18321 my $old_options = "";
18323 if (! -e $file_list) {
18324 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
18325 $write_unchanged_files = 1;
18326 } elsif ($write_unchanged_files) {
18327 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
18330 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
18332 if (! open $file_handle, "<", $file_list) {
18333 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
18339 # Read and parse mktables.lst, placing the results from the first part
18340 # into @input, and the second part into @mktables_list_output_files
18341 for my $list ( \@input, \@mktables_list_output_files ) {
18342 while (<$file_handle>) {
18343 s/^ \s+ | \s+ $//xg;
18344 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
18345 $old_start_time = $1;
18348 if (/^ \s* \# \s* From\ options\ (.+) /x) {
18352 next if /^ \s* (?: \# .* )? $/x;
18354 my ( $file ) = split /\t/;
18355 push @$list, $file;
18357 @$list = uniques(@$list);
18361 # Look through all the input files
18362 foreach my $input (@input) {
18363 next if $input eq 'version'; # Already have checked this.
18365 # Ignore if doesn't exist. The checking about whether we care or
18366 # not is done via the Input_file object.
18367 next if ! file_exists($input);
18369 # The paths are stored with relative names, and with '/' as the
18370 # delimiter; convert to absolute on this machine
18371 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
18372 $potential_files{lc $full} = 1
18373 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
18377 close $file_handle;
18382 # Here wants to process all .txt files in the directory structure.
18383 # Convert them to full path names. They are stored in the platform's
18386 foreach my $object (@input_file_objects) {
18387 my $file = $object->file;
18388 next unless defined $file;
18389 push @known_files, File::Spec->rel2abs($file);
18392 my @unknown_input_files;
18393 foreach my $file (keys %potential_files) { # The keys are stored in lc
18394 next if grep { $file eq lc($_) } @known_files;
18396 # Here, the file is unknown to us. Get relative path name
18397 $file = File::Spec->abs2rel($file);
18398 push @unknown_input_files, $file;
18400 # What will happen is we create a data structure for it, and add it to
18401 # the list of input files to process. First get the subdirectories
18403 my (undef, $directories, undef) = File::Spec->splitpath($file);
18404 $directories =~ s;/$;;; # Can have extraneous trailing '/'
18405 my @directories = File::Spec->splitdir($directories);
18407 # If the file isn't extracted (meaning none of the directories is the
18408 # extracted one), just add it to the end of the list of inputs.
18409 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
18410 push @input_file_objects, Input_file->new($file, v0);
18414 # Here, the file is extracted. It needs to go ahead of most other
18415 # processing. Search for the first input file that isn't a
18416 # special required property (that is, find one whose first_release
18417 # is non-0), and isn't extracted. Also, the Age property file is
18418 # processed before the extracted ones, just in case
18419 # $compare_versions is set.
18420 for (my $i = 0; $i < @input_file_objects; $i++) {
18421 if ($input_file_objects[$i]->first_released ne v0
18422 && lc($input_file_objects[$i]->file) ne 'dage.txt'
18423 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
18425 splice @input_file_objects, $i, 0,
18426 Input_file->new($file, v0);
18433 if (@unknown_input_files) {
18434 print STDERR simple_fold(join_lines(<<END
18436 The following files are unknown as to how to handle. Assuming they are
18437 typical property files. You'll know by later error messages if it worked or
18440 ) . " " . join(", ", @unknown_input_files) . "\n\n");
18442 } # End of looking through directory structure for more .txt files.
18444 # Create the list of input files from the objects we have defined, plus
18446 my @input_files = qw(version Makefile);
18447 foreach my $object (@input_file_objects) {
18448 my $file = $object->file;
18449 next if ! defined $file; # Not all objects have files
18450 next if $object->optional && ! -e $file;
18451 push @input_files, $file;
18454 if ( $verbosity >= $VERBOSE ) {
18455 print "Expecting ".scalar( @input_files )." input files. ",
18456 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
18459 # We set $most_recent to be the most recently changed input file, including
18460 # this program itself (done much earlier in this file)
18461 foreach my $in (@input_files) {
18462 next unless -e $in; # Keep going even if missing a file
18463 my $mod_time = (stat $in)[9];
18464 $most_recent = $mod_time if $mod_time > $most_recent;
18466 # See that the input files have distinct names, to warn someone if they
18467 # are adding a new one
18469 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
18470 $directories =~ s;/$;;; # Can have extraneous trailing '/'
18471 my @directories = File::Spec->splitdir($directories);
18472 my $base = $file =~ s/\.txt$//;
18473 construct_filename($file, 'mutable', \@directories);
18477 # We use 'Makefile' just to see if it has changed since the last time we
18478 # rebuilt. Now discard it.
18479 @input_files = grep { $_ ne 'Makefile' } @input_files;
18481 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
18482 || ! scalar @mktables_list_output_files # or if no outputs known
18483 || $old_start_time < $most_recent # or out-of-date
18484 || $old_options ne $command_line_arguments; # or with different
18487 # Now we check to see if any output files are older than youngest, if
18488 # they are, we need to continue on, otherwise we can presumably bail.
18490 foreach my $out (@mktables_list_output_files) {
18491 if ( ! file_exists($out)) {
18492 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
18496 #local $to_trace = 1 if main::DEBUG;
18497 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
18498 if ( (stat $out)[9] <= $most_recent ) {
18499 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
18500 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
18507 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
18510 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
18512 # Ready to do the major processing. First create the perl pseudo-property.
18513 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
18515 # Process each input file
18516 foreach my $file (@input_file_objects) {
18520 # Finish the table generation.
18522 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
18525 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
18528 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
18529 add_perl_synonyms();
18531 print "Writing tables\n" if $verbosity >= $PROGRESS;
18532 write_all_tables();
18534 # Write mktables.lst
18535 if ( $file_list and $make_list ) {
18537 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
18538 foreach my $file (@input_files, @files_actually_output) {
18539 my (undef, $directories, $file) = File::Spec->splitpath($file);
18540 my @directories = File::Spec->splitdir($directories);
18541 $file = join '/', @directories, $file;
18545 if (! open $ofh,">",$file_list) {
18546 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
18550 my $localtime = localtime $start_time;
18551 print $ofh <<"END";
18553 # $file_list -- File list for $0.
18555 # Autogenerated starting on $start_time ($localtime)
18556 # From options $command_line_arguments
18558 # - First section is input files
18559 # ($0 itself is not listed but is automatically considered an input)
18560 # - Section separator is /^=+\$/
18561 # - Second section is a list of output files.
18562 # - Lines matching /^\\s*#/ are treated as comments
18563 # which along with blank lines are ignored.
18569 print $ofh "$_\n" for sort(@input_files);
18570 print $ofh "\n=================================\n# Output files:\n\n";
18571 print $ofh "$_\n" for sort @files_actually_output;
18572 print $ofh "\n# ",scalar(@input_files)," input files\n",
18573 "# ",scalar(@files_actually_output)+1," output files\n\n",
18576 or Carp::my_carp("Failed to close $ofh: $!");
18578 print "Filelist has ",scalar(@input_files)," input files and ",
18579 scalar(@files_actually_output)+1," output files\n"
18580 if $verbosity >= $VERBOSE;
18584 # Output these warnings unless -q explicitly specified.
18585 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
18586 if (@unhandled_properties) {
18587 print "\nProperties and tables that unexpectedly have no code points\n";
18588 foreach my $property (sort @unhandled_properties) {
18589 print $property, "\n";
18593 if (%potential_files) {
18594 print "\nInput files that are not considered:\n";
18595 foreach my $file (sort keys %potential_files) {
18596 print File::Spec->abs2rel($file), "\n";
18599 print "\nAll done\n" if $verbosity >= $VERBOSE;
18603 # TRAILING CODE IS USED BY make_property_test_script()
18609 # If run outside the normal test suite on an ASCII platform, you can
18610 # just create a latin1_to_native() function that just returns its
18611 # inputs, because that's the only function used from charset_tools.pl
18612 require "charset_tools.pl";
18614 # Test qr/\X/ and the \p{} regular expression constructs. This file is
18615 # constructed by mktables from the tables it generates, so if mktables is
18616 # buggy, this won't necessarily catch those bugs. Tests are generated for all
18617 # feasible properties; a few aren't currently feasible; see
18618 # is_code_point_usable() in mktables for details.
18620 # Standard test packages are not used because this manipulates SIG_WARN. It
18621 # exits 0 if every non-skipped test succeeded; -1 if any failed.
18627 my $expected = shift;
18630 my $warning_type = shift; # Type of warning message, like 'deprecated'
18632 my $line = (caller)[2];
18634 # Convert the code point to hex form
18635 my $string = sprintf "\"\\x{%04X}\"", $ord;
18639 # The first time through, use all warnings. If the input should generate
18640 # a warning, add another time through with them turned off
18641 push @tests, "no warnings '$warning_type';" if $warning_type;
18643 foreach my $no_warnings (@tests) {
18645 # Store any warning messages instead of outputting them
18646 local $SIG{__WARN__} = $SIG{__WARN__};
18647 my $warning_message;
18648 $SIG{__WARN__} = sub { $warning_message = $_[0] };
18652 # A string eval is needed because of the 'no warnings'.
18653 # Assumes no parens in the regular expression
18654 my $result = eval "$no_warnings
18655 my \$RegObj = qr($regex);
18656 $string =~ \$RegObj ? 1 : 0";
18657 if (not defined $result) {
18658 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
18661 elsif ($result ^ $expected) {
18662 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
18665 elsif ($warning_message) {
18666 if (! $warning_type || ($warning_type && $no_warnings)) {
18667 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
18671 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
18674 elsif ($warning_type && ! $no_warnings) {
18675 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
18679 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
18688 if (eval { 'x' =~ qr/$regex/; 1 }) {
18690 my $line = (caller)[2];
18691 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
18694 my $line = (caller)[2];
18695 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
18700 # GCBTest.txt character that separates grapheme clusters
18701 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
18702 utf8::upgrade($breakable_utf8);
18704 # GCBTest.txt character that indicates that the adjoining code points are part
18705 # of the same grapheme cluster
18706 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
18707 utf8::upgrade($nobreak_utf8);
18710 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
18711 # Each such line is a sequence of code points given by their hex numbers,
18712 # separated by the two characters defined just before this subroutine that
18713 # indicate that either there can or cannot be a break between the adjacent
18714 # code points. If there isn't a break, that means the sequence forms an
18715 # extended grapheme cluster, which means that \X should match the whole
18716 # thing. If there is a break, \X should stop there. This is all
18717 # converted by this routine into a match:
18718 # $string =~ /(\X)/,
18719 # Each \X should match the next cluster; and that is what is checked.
18721 my $template = shift;
18723 my $line = (caller)[2];
18725 # The line contains characters above the ASCII range, but in Latin1. It
18726 # may or may not be in utf8, and if it is, it may or may not know it. So,
18727 # convert these characters to 8 bits. If knows is in utf8, simply
18729 if (utf8::is_utf8($template)) {
18730 utf8::downgrade($template);
18733 # Otherwise, if it is in utf8, but doesn't know it, the next lines
18734 # convert the two problematic characters to their 8-bit equivalents.
18735 # If it isn't in utf8, they don't harm anything.
18737 $template =~ s/$nobreak_utf8/$nobreak/g;
18738 $template =~ s/$breakable_utf8/$breakable/g;
18741 # Get rid of the leading and trailing breakables
18742 $template =~ s/^ \s* $breakable \s* //x;
18743 $template =~ s/ \s* $breakable \s* $ //x;
18745 # And no-breaks become just a space.
18746 $template =~ s/ \s* $nobreak \s* / /xg;
18748 # Split the input into segments that are breakable between them.
18749 my @segments = split /\s*$breakable\s*/, $template;
18752 my $display_string = "";
18754 my @should_display;
18756 # Convert the code point sequence in each segment into a Perl string of
18758 foreach my $segment (@segments) {
18759 my @code_points = split /\s+/, $segment;
18760 my $this_string = "";
18761 my $this_display = "";
18762 foreach my $code_point (@code_points) {
18763 $this_string .= latin1_to_native(chr(hex $code_point));
18764 $this_display .= "\\x{$code_point}";
18767 # The next cluster should match the string in this segment.
18768 push @should_match, $this_string;
18769 push @should_display, $this_display;
18770 $string .= $this_string;
18771 $display_string .= $this_display;
18774 # If a string can be represented in both non-ut8 and utf8, test both cases
18776 for my $to_upgrade (0 .. 1) {
18780 # If already in utf8, would just be a repeat
18781 next UPGRADE if utf8::is_utf8($string);
18783 utf8::upgrade($string);
18786 # Finally, do the \X match.
18787 my @matches = $string =~ /(\X)/g;
18789 # Look through each matched cluster to verify that it matches what we
18791 my $min = (@matches < @should_match) ? @matches : @should_match;
18792 for my $i (0 .. $min - 1) {
18794 if ($matches[$i] eq $should_match[$i]) {
18795 print "ok $Tests - ";
18797 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18799 print "And \\X #", $i + 1,
18801 print " correctly matched $should_display[$i]; line $line\n";
18803 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18804 unpack("U*", $matches[$i]));
18805 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18807 " should have matched $should_display[$i]",
18808 " but instead matched $matches[$i]",
18809 ". Abandoning rest of line $line\n";
18814 # And the number of matches should equal the number of expected matches.
18816 if (@matches == @should_match) {
18817 print "ok $Tests - Nothing was left over; line $line\n";
18819 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18827 print "1..$Tests\n";
18828 exit($Fails ? -1 : 0);
18831 Error('\p{Script=InGreek}'); # Bug #69018
18832 Test_X("1100 $nobreak 1161"); # Bug #70940
18833 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18834 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18835 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726