3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # There was an attempt when this was first rewritten to make it 5.8
8 # compatible, but that has now been abandoned, and newer constructs are used
12 BEGIN { # Get the time the script started running; do it at compilation to
13 # get it as close as possible
29 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
30 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
32 sub NON_ASCII_PLATFORM { ord("A") != 65 }
34 ##########################################################################
36 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
37 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
38 # a pod file and .t files, depending on option parameters.
40 # The structure of this file is:
41 # First these introductory comments; then
42 # code needed for everywhere, such as debugging stuff; then
43 # code to handle input parameters; then
44 # data structures likely to be of external interest (some of which depend on
45 # the input parameters, so follows them; then
46 # more data structures and subroutine and package (class) definitions; then
47 # the small actual loop to process the input files and finish up; then
48 # a __DATA__ section, for the .t tests
50 # This program works on all releases of Unicode so far. The outputs have been
51 # scrutinized most intently for release 5.1. The others have been checked for
52 # somewhat more than just sanity. It can handle all non-provisional Unicode
53 # character properties in those releases.
55 # This program is mostly about Unicode character (or code point) properties.
56 # A property describes some attribute or quality of a code point, like if it
57 # is lowercase or not, its name, what version of Unicode it was first defined
58 # in, or what its uppercase equivalent is. Unicode deals with these disparate
59 # possibilities by making all properties into mappings from each code point
60 # into some corresponding value. In the case of it being lowercase or not,
61 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
62 # property maps each Unicode code point to a single value, called a "property
63 # value". (Some more recently defined properties, map a code point to a set
66 # When using a property in a regular expression, what is desired isn't the
67 # mapping of the code point to its property's value, but the reverse (or the
68 # mathematical "inverse relation"): starting with the property value, "Does a
69 # code point map to it?" These are written in a "compound" form:
70 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
71 # files containing the lists of code points that map to each such regular
72 # expression property value, one file per list
74 # There is also a single form shortcut that Perl adds for many of the commonly
75 # used properties. This happens for all binary properties, plus script,
76 # general_category, and block properties.
78 # Thus the outputs of this program are files. There are map files, mostly in
79 # the 'To' directory; and there are list files for use in regular expression
80 # matching, all in subdirectories of the 'lib' directory, with each
81 # subdirectory being named for the property that the lists in it are for.
82 # Bookkeeping, test, and documentation files are also generated.
84 my $matches_directory = 'lib'; # Where match (\p{}) files go.
85 my $map_directory = 'To'; # Where map files go.
89 # The major data structures of this program are Property, of course, but also
90 # Table. There are two kinds of tables, very similar to each other.
91 # "Match_Table" is the data structure giving the list of code points that have
92 # a particular property value, mentioned above. There is also a "Map_Table"
93 # data structure which gives the property's mapping from code point to value.
94 # There are two structures because the match tables need to be combined in
95 # various ways, such as constructing unions, intersections, complements, etc.,
96 # and the map ones don't. And there would be problems, perhaps subtle, if
97 # a map table were inadvertently operated on in some of those ways.
98 # The use of separate classes with operations defined on one but not the other
99 # prevents accidentally confusing the two.
101 # At the heart of each table's data structure is a "Range_List", which is just
102 # an ordered list of "Ranges", plus ancillary information, and methods to
103 # operate on them. A Range is a compact way to store property information.
104 # Each range has a starting code point, an ending code point, and a value that
105 # is meant to apply to all the code points between the two end points,
106 # inclusive. For a map table, this value is the property value for those
107 # code points. Two such ranges could be written like this:
108 # 0x41 .. 0x5A, 'Upper',
109 # 0x61 .. 0x7A, 'Lower'
111 # Each range also has a type used as a convenience to classify the values.
112 # Most ranges in this program will be Type 0, or normal, but there are some
113 # ranges that have a non-zero type. These are used only in map tables, and
114 # are for mappings that don't fit into the normal scheme of things. Mappings
115 # that require a hash entry to communicate with utf8.c are one example;
116 # another example is mappings for charnames.pm to use which indicate a name
117 # that is algorithmically determinable from its code point (and the reverse).
118 # These are used to significantly compact these tables, instead of listing
119 # each one of the tens of thousands individually.
121 # In a match table, the value of a range is irrelevant (and hence the type as
122 # well, which will always be 0), and arbitrarily set to the null string.
123 # Using the example above, there would be two match tables for those two
124 # entries, one named Upper would contain the 0x41..0x5A range, and the other
125 # named Lower would contain 0x61..0x7A.
127 # Actually, there are two types of range lists, "Range_Map" is the one
128 # associated with map tables, and "Range_List" with match tables.
129 # Again, this is so that methods can be defined on one and not the others so
130 # as to prevent operating on them in incorrect ways.
132 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
133 # in the perl core. All tables could in theory be written, but some are
134 # suppressed because there is no current practical use for them. It is easy
135 # to change which get written by changing various lists that are near the top
136 # of the actual code in this file. The table data structures contain enough
137 # ancillary information to allow them to be treated as separate entities for
138 # writing, such as the path to each one's file. There is a heading in each
139 # map table that gives the format of its entries, and what the map is for all
140 # the code points missing from it. (This allows tables to be more compact.)
142 # The Property data structure contains one or more tables. All properties
143 # contain a map table (except the $perl property which is a
144 # pseudo-property containing only match tables), and any properties that
145 # are usable in regular expression matches also contain various matching
146 # tables, one for each value the property can have. A binary property can
147 # have two values, True and False (or Y and N, which are preferred by Unicode
148 # terminology). Thus each of these properties will have a map table that
149 # takes every code point and maps it to Y or N (but having ranges cuts the
150 # number of entries in that table way down), and two match tables, one
151 # which has a list of all the code points that map to Y, and one for all the
152 # code points that map to N. (For each binary property, a third table is also
153 # generated for the pseudo Perl property. It contains the identical code
154 # points as the Y table, but can be written in regular expressions, not in the
155 # compound form, but in a "single" form like \p{IsUppercase}.) Many
156 # properties are binary, but some properties have several possible values,
157 # some have many, and properties like Name have a different value for every
158 # named code point. Those will not, unless the controlling lists are changed,
159 # have their match tables written out. But all the ones which can be used in
160 # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
161 # a property would have either its map table or its match tables written but
162 # not both. Again, what gets written is controlled by lists which can easily
163 # be changed. Starting in 5.14, advantage was taken of this, and all the map
164 # tables needed to reconstruct the Unicode db are now written out, while
165 # suppressing the Unicode .txt files that contain the data. Our tables are
166 # much more compact than the .txt files, so a significant space savings was
167 # achieved. Also, tables are not written out that are trivially derivable
168 # from tables that do get written. So, there typically is no file containing
169 # the code points not matched by a binary property (the table for \P{} versus
170 # lowercase \p{}), since you just need to invert the True table to get the
173 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
174 # how many match tables there are and the content of the maps. This 'Type' is
175 # different than a range 'Type', so don't get confused by the two concepts
176 # having the same name.
178 # For information about the Unicode properties, see Unicode's UAX44 document:
180 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
182 # As stated earlier, this program will work on any release of Unicode so far.
183 # Most obvious problems in earlier data have NOT been corrected except when
184 # necessary to make Perl or this program work reasonably, and to keep out
185 # potential security issues. For example, no folding information was given in
186 # early releases, so this program substitutes lower case instead, just so that
187 # a regular expression with the /i option will do something that actually
188 # gives the right results in many cases. There are also a couple other
189 # corrections for version 1.1.5, commented at the point they are made. As an
190 # example of corrections that weren't made (but could be) is this statement
191 # from DerivedAge.txt: "The supplementary private use code points and the
192 # non-character code points were assigned in version 2.0, but not specifically
193 # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
194 # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
195 # further down in these introductory comments.
197 # This program works on all non-provisional properties as of the current
198 # Unicode release, though the files for some are suppressed for various
199 # reasons. You can change which are output by changing lists in this program.
201 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
202 # loose matchings rules (from Unicode TR18):
204 # The recommended names for UCD properties and property values are in
205 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
206 # [PropValue]. There are both abbreviated names and longer, more
207 # descriptive names. It is strongly recommended that both names be
208 # recognized, and that loose matching of property names be used,
209 # whereby the case distinctions, whitespace, hyphens, and underbar
212 # The program still allows Fuzzy to override its determination of if loose
213 # matching should be used, but it isn't currently used, as it is no longer
214 # needed; the calculations it makes are good enough.
216 # SUMMARY OF HOW IT WORKS:
220 # A list is constructed containing each input file that is to be processed
222 # Each file on the list is processed in a loop, using the associated handler
224 # The PropertyAliases.txt and PropValueAliases.txt files are processed
225 # first. These files name the properties and property values.
226 # Objects are created of all the property and property value names
227 # that the rest of the input should expect, including all synonyms.
228 # The other input files give mappings from properties to property
229 # values. That is, they list code points and say what the mapping
230 # is under the given property. Some files give the mappings for
231 # just one property; and some for many. This program goes through
232 # each file and populates the properties and their map tables from
233 # them. Some properties are listed in more than one file, and
234 # Unicode has set up a precedence as to which has priority if there
235 # is a conflict. Thus the order of processing matters, and this
236 # program handles the conflict possibility by processing the
237 # overriding input files last, so that if necessary they replace
239 # After this is all done, the program creates the property mappings not
240 # furnished by Unicode, but derivable from what it does give.
241 # The tables of code points that match each property value in each
242 # property that is accessible by regular expressions are created.
243 # The Perl-defined properties are created and populated. Many of these
244 # require data determined from the earlier steps
245 # Any Perl-defined synonyms are created, and name clashes between Perl
246 # and Unicode are reconciled and warned about.
247 # All the properties are written to files
248 # Any other files are written, and final warnings issued.
250 # For clarity, a number of operators have been overloaded to work on tables:
251 # ~ means invert (take all characters not in the set). The more
252 # conventional '!' is not used because of the possibility of confusing
253 # it with the actual boolean operation.
255 # - means subtraction
256 # & means intersection
257 # The precedence of these is the order listed. Parentheses should be
258 # copiously used. These are not a general scheme. The operations aren't
259 # defined for a number of things, deliberately, to avoid getting into trouble.
260 # Operations are done on references and affect the underlying structures, so
261 # that the copy constructors for them have been overloaded to not return a new
262 # clone, but the input object itself.
264 # The bool operator is deliberately not overloaded to avoid confusion with
265 # "should it mean if the object merely exists, or also is non-empty?".
267 # WHY CERTAIN DESIGN DECISIONS WERE MADE
269 # This program needs to be able to run under miniperl. Therefore, it uses a
270 # minimum of other modules, and hence implements some things itself that could
271 # be gotten from CPAN
273 # This program uses inputs published by the Unicode Consortium. These can
274 # change incompatibly between releases without the Perl maintainers realizing
275 # it. Therefore this program is now designed to try to flag these. It looks
276 # at the directories where the inputs are, and flags any unrecognized files.
277 # It keeps track of all the properties in the files it handles, and flags any
278 # that it doesn't know how to handle. It also flags any input lines that
279 # don't match the expected syntax, among other checks.
281 # It is also designed so if a new input file matches one of the known
282 # templates, one hopefully just needs to add it to a list to have it
285 # As mentioned earlier, some properties are given in more than one file. In
286 # particular, the files in the extracted directory are supposedly just
287 # reformattings of the others. But they contain information not easily
288 # derivable from the other files, including results for Unihan (which isn't
289 # usually available to this program) and for unassigned code points. They
290 # also have historically had errors or been incomplete. In an attempt to
291 # create the best possible data, this program thus processes them first to
292 # glean information missing from the other files; then processes those other
293 # files to override any errors in the extracted ones. Much of the design was
294 # driven by this need to store things and then possibly override them.
296 # It tries to keep fatal errors to a minimum, to generate something usable for
297 # testing purposes. It always looks for files that could be inputs, and will
298 # warn about any that it doesn't know how to handle (the -q option suppresses
301 # Why is there more than one type of range?
302 # This simplified things. There are some very specialized code points that
303 # have to be handled specially for output, such as Hangul syllable names.
304 # By creating a range type (done late in the development process), it
305 # allowed this to be stored with the range, and overridden by other input.
306 # Originally these were stored in another data structure, and it became a
307 # mess trying to decide if a second file that was for the same property was
308 # overriding the earlier one or not.
310 # Why are there two kinds of tables, match and map?
311 # (And there is a base class shared by the two as well.) As stated above,
312 # they actually are for different things. Development proceeded much more
313 # smoothly when I (khw) realized the distinction. Map tables are used to
314 # give the property value for every code point (actually every code point
315 # that doesn't map to a default value). Match tables are used for regular
316 # expression matches, and are essentially the inverse mapping. Separating
317 # the two allows more specialized methods, and error checks so that one
318 # can't just take the intersection of two map tables, for example, as that
321 # What about 'fate' and 'status'. The concept of a table's fate was created
322 # late when it became clear that something more was needed. The difference
323 # between this and 'status' is unclean, and could be improved if someone
324 # wanted to spend the effort.
328 # This program is written so it will run under miniperl. Occasionally changes
329 # will cause an error where the backtrace doesn't work well under miniperl.
330 # To diagnose the problem, you can instead run it under regular perl, if you
333 # There is a good trace facility. To enable it, first sub DEBUG must be set
334 # to return true. Then a line like
336 # local $to_trace = 1 if main::DEBUG;
338 # can be added to enable tracing in its lexical scope (plus dynamic) or until
339 # you insert another line:
341 # local $to_trace = 0 if main::DEBUG;
343 # To actually trace, use a line like "trace $a, @b, %c, ...;
345 # Some of the more complex subroutines already have trace statements in them.
346 # Permanent trace statements should be like:
348 # trace ... if main::DEBUG && $to_trace;
350 # If there is just one or a few files that you're debugging, you can easily
351 # cause most everything else to be skipped. Change the line
353 # my $debug_skip = 0;
355 # to 1, and every file whose object is in @input_file_objects and doesn't have
356 # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
357 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
359 # To compare the output tables, it may be useful to specify the -annotate
360 # flag. (As of this writing, this can't be done on a clean workspace, due to
361 # requirements in Text::Tabs used in this option; so first run mktables
362 # without this option.) This option adds comment lines to each table, one for
363 # each non-algorithmically named character giving, currently its code point,
364 # name, and graphic representation if printable (and you have a font that
365 # knows about it). This makes it easier to see what the particular code
366 # points are in each output table. Non-named code points are annotated with a
367 # description of their status, and contiguous ones with the same description
368 # will be output as a range rather than individually. Algorithmically named
369 # characters are also output as ranges, except when there are just a few
374 # The program would break if Unicode were to change its names so that
375 # interior white space, underscores, or dashes differences were significant
376 # within property and property value names.
378 # It might be easier to use the xml versions of the UCD if this program ever
379 # would need heavy revision, and the ability to handle old versions was not
382 # There is the potential for name collisions, in that Perl has chosen names
383 # that Unicode could decide it also likes. There have been such collisions in
384 # the past, with mostly Perl deciding to adopt the Unicode definition of the
385 # name. However in the 5.2 Unicode beta testing, there were a number of such
386 # collisions, which were withdrawn before the final release, because of Perl's
387 # and other's protests. These all involved new properties which began with
388 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
389 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
390 # Unicode document, so they are unlikely to be used by Unicode for another
391 # purpose. However, they might try something beginning with 'In', or use any
392 # of the other Perl-defined properties. This program will warn you of name
393 # collisions, and refuse to generate tables with them, but manual intervention
394 # will be required in this event. One scheme that could be implemented, if
395 # necessary, would be to have this program generate another file, or add a
396 # field to mktables.lst that gives the date of first definition of a property.
397 # Each new release of Unicode would use that file as a basis for the next
398 # iteration. And the Perl synonym addition code could sort based on the age
399 # of the property, so older properties get priority, and newer ones that clash
400 # would be refused; hence existing code would not be impacted, and some other
401 # synonym would have to be used for the new property. This is ugly, and
402 # manual intervention would certainly be easier to do in the short run; lets
403 # hope it never comes to this.
407 # This program can generate tables from the Unihan database. But that db
408 # isn't normally available, so it is marked as optional. Prior to version
409 # 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
410 # was split into 8 different files, all beginning with the letters 'Unihan'.
411 # If you plunk those files down into the directory mktables ($0) is in, this
412 # program will read them and automatically create tables for the properties
413 # from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
414 # plus any you add to the @cjk_properties array and the @cjk_property_values
415 # array, being sure to add necessary '# @missings' lines to the latter. For
416 # Unicode versions earlier than 5.2, most of the Unihan properties are not
417 # listed at all in PropertyAliases nor PropValueAliases. This program assumes
418 # for these early releases that you want the properties that are specified in
421 # You may need to adjust the entries to suit your purposes. setup_unihan(),
422 # and filter_unihan_line() are the functions where this is done. This program
423 # already does some adjusting to make the lines look more like the rest of the
424 # Unicode DB; You can see what that is in filter_unihan_line()
426 # There is a bug in the 3.2 data file in which some values for the
427 # kPrimaryNumeric property have commas and an unexpected comment. A filter
428 # could be added to correct these; or for a particular installation, the
429 # Unihan.txt file could be edited to fix them.
431 # HOW TO ADD A FILE TO BE PROCESSED
433 # A new file from Unicode needs to have an object constructed for it in
434 # @input_file_objects, probably at the end or at the end of the extracted
435 # ones. The program should warn you if its name will clash with others on
436 # restrictive file systems, like DOS. If so, figure out a better name, and
437 # add lines to the README.perl file giving that. If the file is a character
438 # property, it should be in the format that Unicode has implicitly
439 # standardized for such files for the more recently introduced ones.
440 # If so, the Input_file constructor for @input_file_objects can just be the
441 # file name and release it first appeared in. If not, then it should be
442 # possible to construct an each_line_handler() to massage the line into the
445 # For non-character properties, more code will be needed. You can look at
446 # the existing entries for clues.
448 # UNICODE VERSIONS NOTES
450 # The Unicode UCD has had a number of errors in it over the versions. And
451 # these remain, by policy, in the standard for that version. Therefore it is
452 # risky to correct them, because code may be expecting the error. So this
453 # program doesn't generally make changes, unless the error breaks the Perl
454 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
455 # for U+1105, which causes real problems for the algorithms for Jamo
456 # calculations, so it is changed here.
458 # But it isn't so clear cut as to what to do about concepts that are
459 # introduced in a later release; should they extend back to earlier releases
460 # where the concept just didn't exist? It was easier to do this than to not,
461 # so that's what was done. For example, the default value for code points not
462 # in the files for various properties was probably undefined until changed by
463 # some version. No_Block for blocks is such an example. This program will
464 # assign No_Block even in Unicode versions that didn't have it. This has the
465 # benefit that code being written doesn't have to special case earlier
466 # versions; and the detriment that it doesn't match the Standard precisely for
467 # the affected versions.
469 # Here are some observations about some of the issues in early versions:
471 # Prior to version 3.0, there were 3 character decompositions. These are not
472 # handled by Unicode::Normalize, nor will it compile when presented a version
473 # that has them. However, you can trivially get it to compile by simply
474 # ignoring those decompositions, by changing the croak to a carp. At the time
475 # of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
476 # cpan/Unicode-Normalize/mkheader) reads
478 # croak("Weird Canonical Decomposition of U+$h");
480 # Simply comment it out. It will compile, but will not know about any three
481 # character decompositions.
483 # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
484 # that the reason is that the CJK block starting at 4E00 was removed from
485 # PropList, and was not put back in until 3.1.0. The Perl extension (the
486 # single property name \p{alpha}) has the correct values. But the compound
487 # form is simply not generated until 3.1, as it can be argued that prior to
488 # this release, this was not an official property. The comments for
489 # filter_old_style_proplist() give more details.
491 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
492 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
493 # reason is that 3.2 introduced U+205F=medium math space, which was not
494 # classed as white space, but Perl figured out that it should have been. 4.0
495 # reclassified it correctly.
497 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
498 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
499 # became 202, and ATBL was left with no code points, as all the ones that
500 # mapped to 202 stayed mapped to 202. Thus if your program used the numeric
501 # name for the class, it would not have been affected, but if it used the
502 # mnemonic, it would have been.
504 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
505 # points which eventually came to have this script property value, instead
506 # mapped to "Unknown". But in the next release all these code points were
507 # moved to \p{sc=common} instead.
509 # The tests furnished by Unicode for testing WordBreak and SentenceBreak
510 # generate errors in 5.0 and earlier.
512 # The default for missing code points for BidiClass is complicated. Starting
513 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
514 # tries to do the best it can for earlier releases. It is done in
515 # process_PropertyAliases()
517 # In version 2.1.2, the entry in UnicodeData.txt:
518 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
520 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
521 # Without this change, there are casing problems for this character.
523 # Search for $string_compare_versions to see how to compare changes to
524 # properties between Unicode versions
526 ##############################################################################
528 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
530 my $MAX_LINE_WIDTH = 78;
532 # Debugging aid to skip most files so as to not be distracted by them when
533 # concentrating on the ones being debugged. Add
535 # to the constructor for those files you want processed when you set this.
536 # Files with a first version number of 0 are special: they are always
537 # processed regardless of the state of this flag. Generally, Jamo.txt and
538 # UnicodeData.txt must not be skipped if you want this program to not die
539 # before normal completion.
543 # Normally these are suppressed.
544 my $write_Unicode_deprecated_tables = 0;
546 # Set to 1 to enable tracing.
549 { # Closure for trace: debugging aid
550 my $print_caller = 1; # ? Include calling subroutine name
551 my $main_with_colon = 'main::';
552 my $main_colon_length = length($main_with_colon);
555 return unless $to_trace; # Do nothing if global flag not set
559 local $DB::trace = 0;
560 $DB::trace = 0; # Quiet 'used only once' message
564 # Loop looking up the stack to get the first non-trace caller
569 $line_number = $caller_line;
570 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
571 $caller = $main_with_colon unless defined $caller;
573 $caller_name = $caller;
576 $caller_name =~ s/.*:://;
577 if (substr($caller_name, 0, $main_colon_length)
580 $caller_name = substr($caller_name, $main_colon_length);
583 } until ($caller_name ne 'trace');
585 # If the stack was empty, we were called from the top level
586 $caller_name = 'main' if ($caller_name eq ""
587 || $caller_name eq 'trace');
590 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
591 foreach my $string (@input) {
592 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
593 $output .= simple_dumper($string);
596 $string = "$string" if ref $string;
597 $string = $UNDEF unless defined $string;
599 $string = '""' if $string eq "";
600 $output .= " " if $output ne ""
602 && substr($output, -1, 1) ne " "
603 && substr($string, 0, 1) ne " ";
608 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
609 print STDERR "$caller_name: " if $print_caller;
610 print STDERR $output, "\n";
615 # This is for a rarely used development feature that allows you to compare two
616 # versions of the Unicode standard without having to deal with changes caused
617 # by the code points introduced in the later version. You probably also want
618 # to use the -annotate option when using this. Run this program on a unicore
619 # containing the starting release you want to compare. Save that output
620 # structrue. Then, switching to a unicore with the ending release, change the
621 # 0 in the $string_compare_versions definition just below to a string
622 # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
623 # to the starting release. This program will then compile, but throw away all
624 # code points introduced after the starting release. Finally use a diff tool
625 # to compare the two directory structures. They include only the code points
626 # common to both releases, and you can see the changes caused just by the
627 # underlying release semantic changes. For versions earlier than 3.2, you
628 # must copy a version of DAge.txt into the directory.
629 my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
630 my $compare_versions = DEBUG
631 && $string_compare_versions
632 && pack "C*", split /\./, $string_compare_versions;
635 # Returns non-duplicated input values. From "Perl Best Practices:
636 # Encapsulated Cleverness". p. 455 in first edition.
639 # Arguably this breaks encapsulation, if the goal is to permit multiple
640 # distinct objects to stringify to the same value, and be interchangeable.
641 # However, for this program, no two objects stringify identically, and all
642 # lists passed to this function are either objects or strings. So this
643 # doesn't affect correctness, but it does give a couple of percent speedup.
645 return grep { ! $seen{$_}++ } @_;
648 $0 = File::Spec->canonpath($0);
650 my $make_test_script = 0; # ? Should we output a test script
651 my $make_norm_test_script = 0; # ? Should we output a normalization test script
652 my $write_unchanged_files = 0; # ? Should we update the output files even if
653 # we don't think they have changed
654 my $use_directory = ""; # ? Should we chdir somewhere.
655 my $pod_directory; # input directory to store the pod file.
656 my $pod_file = 'perluniprops';
657 my $t_path; # Path to the .t test file
658 my $file_list = 'mktables.lst'; # File to store input and output file names.
659 # This is used to speed up the build, by not
660 # executing the main body of the program if
661 # nothing on the list has changed since the
663 my $make_list = 1; # ? Should we write $file_list. Set to always
664 # make a list so that when the pumpking is
665 # preparing a release, s/he won't have to do
667 my $glob_list = 0; # ? Should we try to include unknown .txt files
669 my $output_range_counts = $debugging_build; # ? Should we include the number
670 # of code points in ranges in
672 my $annotate = 0; # ? Should character names be in the output
674 # Verbosity levels; 0 is quiet
675 my $NORMAL_VERBOSITY = 1;
679 my $verbosity = $NORMAL_VERBOSITY;
681 # Stored in mktables.lst so that if this program is called with different
682 # options, will regenerate even if the files otherwise look like they're
684 my $command_line_arguments = join " ", @ARGV;
688 my $arg = shift @ARGV;
690 $verbosity = $VERBOSE;
692 elsif ($arg eq '-p') {
693 $verbosity = $PROGRESS;
694 $| = 1; # Flush buffers as we go.
696 elsif ($arg eq '-q') {
699 elsif ($arg eq '-w') {
700 $write_unchanged_files = 1; # update the files even if havent changed
702 elsif ($arg eq '-check') {
703 my $this = shift @ARGV;
704 my $ok = shift @ARGV;
706 print "Skipping as check params are not the same.\n";
710 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
711 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
713 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
715 $make_test_script = 1;
717 elsif ($arg eq '-makenormtest')
719 $make_norm_test_script = 1;
721 elsif ($arg eq '-makelist') {
724 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
725 -d $use_directory or croak "Unknown directory '$use_directory'";
727 elsif ($arg eq '-L') {
729 # Existence not tested until have chdir'd
732 elsif ($arg eq '-globlist') {
735 elsif ($arg eq '-c') {
736 $output_range_counts = ! $output_range_counts
738 elsif ($arg eq '-annotate') {
740 $debugging_build = 1;
741 $output_range_counts = 1;
745 $with_c .= 'out' if $output_range_counts; # Complements the state
747 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
748 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
750 -c : Output comments $with_c number of code points in ranges
751 -q : Quiet Mode: Only output serious warnings.
752 -p : Set verbosity level to normal plus show progress.
753 -v : Set Verbosity level high: Show progress and non-serious
755 -w : Write files regardless
756 -C dir : Change to this directory before proceeding. All relative paths
757 except those specified by the -P and -T options will be done
758 with respect to this directory.
759 -P dir : Output $pod_file file to directory 'dir'.
760 -T path : Create a test script as 'path'; overrides -maketest
761 -L filelist : Use alternate 'filelist' instead of standard one
762 -globlist : Take as input all non-Test *.txt files in current and sub
764 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
766 -makelist : Rewrite the file list $file_list based on current setup
767 -annotate : Output an annotation for each character in the table files;
768 useful for debugging mktables, looking at diffs; but is slow
770 -check A B : Executes $0 only if A and B are the same
775 # Stores the most-recently changed file. If none have changed, can skip the
777 my $most_recent = (stat $0)[9]; # Do this before the chdir!
779 # Change directories now, because need to read 'version' early.
780 if ($use_directory) {
781 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
782 $pod_directory = File::Spec->rel2abs($pod_directory);
784 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
785 $t_path = File::Spec->rel2abs($t_path);
787 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
788 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
789 $pod_directory = File::Spec->abs2rel($pod_directory);
791 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
792 $t_path = File::Spec->abs2rel($t_path);
796 # Get Unicode version into regular and v-string. This is done now because
797 # various tables below get populated based on it. These tables are populated
798 # here to be near the top of the file, and so easily seeable by those needing
800 open my $VERSION, "<", "version"
801 or croak "$0: can't open required file 'version': $!\n";
802 my $string_version = <$VERSION>;
804 chomp $string_version;
805 my $v_version = pack "C*", split /\./, $string_version; # v string
807 my $unicode_version = ($compare_versions)
808 ? ( "$string_compare_versions (using "
809 . "$string_version rules)")
812 # The following are the complete names of properties with property values that
813 # are known to not match any code points in some versions of Unicode, but that
814 # may change in the future so they should be matchable, hence an empty file is
815 # generated for them.
816 my @tables_that_may_be_empty;
817 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
818 if $v_version lt v6.3.0;
819 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
820 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
821 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
822 if $v_version ge v4.1.0;
823 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
824 if $v_version ge v6.0.0;
825 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
826 if $v_version ge v6.1.0;
827 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
828 if $v_version ge v6.2.0;
830 # The lists below are hashes, so the key is the item in the list, and the
831 # value is the reason why it is in the list. This makes generation of
832 # documentation easier.
834 my %why_suppressed; # No file generated for these.
836 # Files aren't generated for empty extraneous properties. This is arguable.
837 # Extraneous properties generally come about because a property is no longer
838 # used in a newer version of Unicode. If we generated a file without code
839 # points, programs that used to work on that property will still execute
840 # without errors. It just won't ever match (or will always match, with \P{}).
841 # This means that the logic is now likely wrong. I (khw) think its better to
842 # find this out by getting an error message. Just move them to the table
843 # above to change this behavior
844 my %why_suppress_if_empty_warn_if_not = (
846 # It is the only property that has ever officially been removed from the
847 # Standard. The database never contained any code points for it.
848 'Special_Case_Condition' => 'Obsolete',
850 # Apparently never official, but there were code points in some versions of
851 # old-style PropList.txt
852 'Non_Break' => 'Obsolete',
855 # These would normally go in the warn table just above, but they were changed
856 # a long time before this program was written, so warnings about them are
858 if ($v_version gt v3.2.0) {
859 push @tables_that_may_be_empty,
860 'Canonical_Combining_Class=Attached_Below_Left'
863 # Enum values for to_output_map() method in the Map_Table package. (0 is don't
865 my $EXTERNAL_MAP = 1;
866 my $INTERNAL_MAP = 2;
867 my $OUTPUT_ADJUSTED = 3;
869 # To override computed values for writing the map tables for these properties.
870 # The default for enum map tables is to write them out, so that the Unicode
871 # .txt files can be removed, but all the data to compute any property value
872 # for any code point is available in a more compact form.
873 my %global_to_output_map = (
874 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
875 # get stuck supporting it if things change. Since it is a STRING
876 # property, it normally would be listed in the pod, but INTERNAL_MAP
878 Unicode_1_Name => $INTERNAL_MAP,
880 Present_In => 0, # Suppress, as easily computed from Age
881 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is
882 # retained, but needed for
885 # Suppress, as mapping can be found instead from the
886 # Perl_Decomposition_Mapping file
887 Decomposition_Type => 0,
890 # There are several types of obsolete properties defined by Unicode. These
891 # must be hand-edited for every new Unicode release.
892 my %why_deprecated; # Generates a deprecated warning message if used.
893 my %why_stabilized; # Documentation only
894 my %why_obsolete; # Documentation only
897 my $simple = 'Perl uses the more complete version';
898 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
900 my $other_properties = 'other properties';
901 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
902 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
905 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
906 'Jamo_Short_Name' => $contributory,
907 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
908 'Other_Alphabetic' => $contributory,
909 'Other_Default_Ignorable_Code_Point' => $contributory,
910 'Other_Grapheme_Extend' => $contributory,
911 'Other_ID_Continue' => $contributory,
912 'Other_ID_Start' => $contributory,
913 'Other_Lowercase' => $contributory,
914 'Other_Math' => $contributory,
915 'Other_Uppercase' => $contributory,
916 'Expands_On_NFC' => $why_no_expand,
917 'Expands_On_NFD' => $why_no_expand,
918 'Expands_On_NFKC' => $why_no_expand,
919 'Expands_On_NFKD' => $why_no_expand,
923 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
924 # contains the same information, but without the algorithmically
925 # determinable Hangul syllables'. This file is not published, so it's
926 # existence is not noted in the comment.
927 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
929 # Don't suppress ISO_Comment, as otherwise special handling is needed
930 # to differentiate between it and gc=c, which can be written as 'isc',
931 # which is the same characters as ISO_Comment's short name.
933 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
935 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
936 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
937 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
938 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
940 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
943 foreach my $property (
945 # The following are suppressed because they were made contributory
946 # or deprecated by Unicode before Perl ever thought about
955 # The following are suppressed because they have been marked
956 # as deprecated for a sufficient amount of time
958 'Other_Default_Ignorable_Code_Point',
959 'Other_Grapheme_Extend',
966 $why_suppressed{$property} = $why_deprecated{$property};
969 # Customize the message for all the 'Other_' properties
970 foreach my $property (keys %why_deprecated) {
971 next if (my $main_property = $property) !~ s/^Other_//;
972 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
976 if ($write_Unicode_deprecated_tables) {
977 foreach my $property (keys %why_suppressed) {
978 delete $why_suppressed{$property} if $property =~
979 / ^ Other | Grapheme /x;
983 if ($v_version ge 4.0.0) {
984 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
985 if ($v_version ge 6.0.0) {
986 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
989 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
990 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
991 if ($v_version ge 6.0.0) {
992 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
996 # Probably obsolete forever
997 if ($v_version ge v4.1.0) {
998 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1000 if ($v_version ge v6.0.0) {
1001 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1002 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1005 # This program can create files for enumerated-like properties, such as
1006 # 'Numeric_Type'. This file would be the same format as for a string
1007 # property, with a mapping from code point to its value, so you could look up,
1008 # for example, the script a code point is in. But no one so far wants this
1009 # mapping, or they have found another way to get it since this is a new
1010 # feature. So no file is generated except if it is in this list.
1011 my @output_mapped_properties = split "\n", <<END;
1014 # If you want more Unihan properties than the default, you need to add them to
1015 # these arrays. Depending on the property type, @missing lines might have to
1016 # be added to the second array. A sample entry would be (including the '#'):
1017 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1018 my @cjk_properties = split "\n", <<'END';
1020 my @cjk_property_values = split "\n", <<'END';
1023 # The input files don't list every code point. Those not listed are to be
1024 # defaulted to some value. Below are hard-coded what those values are for
1025 # non-binary properties as of 5.1. Starting in 5.0, there are
1026 # machine-parsable comment lines in the files that give the defaults; so this
1027 # list shouldn't have to be extended. The claim is that all missing entries
1028 # for binary properties will default to 'N'. Unicode tried to change that in
1029 # 5.2, but the beta period produced enough protest that they backed off.
1031 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1032 # be in the form that it expects. The others may be synonyms.
1033 my $CODE_POINT = '<code point>';
1034 my %default_mapping = (
1035 Age => "Unassigned",
1036 # Bidi_Class => Complicated; set in code
1037 Bidi_Mirroring_Glyph => "",
1038 Block => 'No_Block',
1039 Canonical_Combining_Class => 0,
1040 Case_Folding => $CODE_POINT,
1041 Decomposition_Mapping => $CODE_POINT,
1042 Decomposition_Type => 'None',
1043 East_Asian_Width => "Neutral",
1044 FC_NFKC_Closure => $CODE_POINT,
1045 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1046 Grapheme_Cluster_Break => 'Other',
1047 Hangul_Syllable_Type => 'NA',
1049 Jamo_Short_Name => "",
1050 Joining_Group => "No_Joining_Group",
1051 # Joining_Type => Complicated; set in code
1052 kIICore => 'N', # Is converted to binary
1053 #Line_Break => Complicated; set in code
1054 Lowercase_Mapping => $CODE_POINT,
1061 Numeric_Type => 'None',
1062 Numeric_Value => 'NaN',
1063 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1064 Sentence_Break => 'Other',
1065 Simple_Case_Folding => $CODE_POINT,
1066 Simple_Lowercase_Mapping => $CODE_POINT,
1067 Simple_Titlecase_Mapping => $CODE_POINT,
1068 Simple_Uppercase_Mapping => $CODE_POINT,
1069 Titlecase_Mapping => $CODE_POINT,
1070 Unicode_1_Name => "",
1071 Unicode_Radical_Stroke => "",
1072 Uppercase_Mapping => $CODE_POINT,
1073 Word_Break => 'Other',
1076 ### End of externally interesting definitions, except for @input_file_objects
1079 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1080 # This file is machine-generated by $0 from the Unicode
1081 # database, Version $unicode_version. Any changes made here will be lost!
1084 my $INTERNAL_ONLY_HEADER = <<"EOF";
1086 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1087 # This file is for internal use by core Perl only. The format and even the
1088 # name or existence of this file are subject to change without notice. Don't
1089 # use it directly. Use Unicode::UCD to access the Unicode character data
1093 my $DEVELOPMENT_ONLY=<<"EOF";
1094 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1095 # This file contains information artificially constrained to code points
1096 # present in Unicode release $string_compare_versions.
1097 # IT CANNOT BE RELIED ON. It is for use during development only and should
1098 # not be used for production.
1102 my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1105 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1106 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1108 # We work with above-Unicode code points, up to UV_MAX. But when you get
1109 # that high, above IV_MAX, some operations don't work, and you can easily get
1110 # overflow. Therefore for internal use, we use a much smaller number,
1111 # translating it to UV_MAX only for output. The exact number is immaterial
1112 # (all Unicode code points are treated exactly the same), but the algorithm
1113 # requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
1114 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1115 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1116 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1118 my $MAX_PLATFORM_CODEPOINT = ~0;
1120 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1121 # two must be 10; if there are 5, the first must not be a 0. Written this way
1122 # to decrease backtracking. The first regex allows the code point to be at
1123 # the end of a word, but to work properly, the word shouldn't end with a valid
1124 # hex character. The second one won't match a code point at the end of a
1125 # word, and doesn't have the run-on issue
1126 my $run_on_code_point_re =
1127 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1128 my $code_point_re = qr/\b$run_on_code_point_re/;
1130 # This matches the beginning of the line in the Unicode db files that give the
1131 # defaults for code points not listed (i.e., missing) in the file. The code
1132 # depends on this ending with a semi-colon, so it can assume it is a valid
1133 # field when the line is split() by semi-colons
1134 my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1136 # Property types. Unicode has more types, but these are sufficient for our
1138 my $UNKNOWN = -1; # initialized to illegal value
1139 my $NON_STRING = 1; # Either binary or enum
1141 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1142 # tables, additional true and false tables are
1143 # generated so that false is anything matching the
1144 # default value, and true is everything else.
1145 my $ENUM = 4; # Include catalog
1146 my $STRING = 5; # Anything else: string or misc
1148 # Some input files have lines that give default values for code points not
1149 # contained in the file. Sometimes these should be ignored.
1150 my $NO_DEFAULTS = 0; # Must evaluate to false
1151 my $NOT_IGNORED = 1;
1154 # Range types. Each range has a type. Most ranges are type 0, for normal,
1155 # and will appear in the main body of the tables in the output files, but
1156 # there are other types of ranges as well, listed below, that are specially
1157 # handled. There are pseudo-types as well that will never be stored as a
1158 # type, but will affect the calculation of the type.
1160 # 0 is for normal, non-specials
1161 my $MULTI_CP = 1; # Sequence of more than code point
1162 my $HANGUL_SYLLABLE = 2;
1163 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1164 my $NULL = 4; # The map is to the null string; utf8.c can't
1165 # handle these, nor is there an accepted syntax
1166 # for them in \p{} constructs
1167 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1168 # otherwise be $MULTI_CP type are instead type 0
1170 # process_generic_property_file() can accept certain overrides in its input.
1171 # Each of these must begin AND end with $CMD_DELIM.
1172 my $CMD_DELIM = "\a";
1173 my $REPLACE_CMD = 'replace'; # Override the Replace
1174 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1179 # Values for the Replace argument to add_range.
1180 # $NO # Don't replace; add only the code points not
1182 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1183 # the comments at the subroutine definition.
1184 my $UNCONDITIONALLY = 2; # Replace without conditions.
1185 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1187 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1189 my $CROAK = 6; # Die with an error if is already there
1191 # Flags to give property statuses. The phrases are to remind maintainers that
1192 # if the flag is changed, the indefinite article referring to it in the
1193 # documentation may need to be as well.
1195 my $DEPRECATED = 'D';
1196 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1197 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1198 my $DISCOURAGED = 'X';
1199 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1200 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1202 my $a_bold_stricter = "a 'B<$STRICTER>'";
1203 my $A_bold_stricter = "A 'B<$STRICTER>'";
1204 my $STABILIZED = 'S';
1205 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1206 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1208 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1209 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1211 # Aliases can also have an extra status:
1212 my $INTERNAL_ALIAS = 'P';
1214 my %status_past_participles = (
1215 $DISCOURAGED => 'discouraged',
1216 $STABILIZED => 'stabilized',
1217 $OBSOLETE => 'obsolete',
1218 $DEPRECATED => 'deprecated',
1219 $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1222 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1223 # externally documented.
1224 my $ORDINARY = 0; # The normal fate.
1225 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1226 # but there is a file written that can be used to
1227 # reconstruct this table
1228 my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
1229 # for Perl's internal use only
1230 my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl.
1231 # Is for backwards compatibility for applications that
1232 # read the file directly, so it's format is
1234 my $SUPPRESSED = 4; # The file for this table is not written out, and as a
1235 # result, we don't bother to do many computations on
1237 my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the
1238 # computations anyway, as the values are needed for
1239 # things to work. This happens when we have Perl
1240 # extensions that depend on Unicode tables that
1241 # wouldn't normally be in a given Unicode version.
1243 # The format of the values of the tables:
1244 my $EMPTY_FORMAT = "";
1245 my $BINARY_FORMAT = 'b';
1246 my $DECIMAL_FORMAT = 'd';
1247 my $FLOAT_FORMAT = 'f';
1248 my $INTEGER_FORMAT = 'i';
1249 my $HEX_FORMAT = 'x';
1250 my $RATIONAL_FORMAT = 'r';
1251 my $STRING_FORMAT = 's';
1252 my $ADJUST_FORMAT = 'a';
1253 my $HEX_ADJUST_FORMAT = 'ax';
1254 my $DECOMP_STRING_FORMAT = 'c';
1255 my $STRING_WHITE_SPACE_LIST = 'sw';
1257 my %map_table_formats = (
1258 $BINARY_FORMAT => 'binary',
1259 $DECIMAL_FORMAT => 'single decimal digit',
1260 $FLOAT_FORMAT => 'floating point number',
1261 $INTEGER_FORMAT => 'integer',
1262 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1263 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1264 $STRING_FORMAT => 'string',
1265 $ADJUST_FORMAT => 'some entries need adjustment',
1266 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1267 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1268 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1271 # Unicode didn't put such derived files in a separate directory at first.
1272 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1273 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1274 my $AUXILIARY = 'auxiliary';
1276 # Hashes and arrays that will eventually go into Heavy.pl for the use of
1277 # utf8_heavy.pl and into UCD.pl for the use of UCD.pm
1278 my %loose_to_file_of; # loosely maps table names to their respective
1280 my %stricter_to_file_of; # same; but for stricter mapping.
1281 my %loose_property_to_file_of; # Maps a loose property name to its map file
1282 my %strict_property_to_file_of; # Same, but strict
1283 my @inline_definitions = "V0"; # Each element gives a definition of a unique
1284 # inversion list. When a definition is inlined,
1285 # its value in the hash it's in (one of the two
1286 # defined just above) will include an index into
1287 # this array. The 0th element is initialized to
1288 # the definition for a zero length inversion list
1289 my %file_to_swash_name; # Maps the file name to its corresponding key name
1290 # in the hash %utf8::SwashInfo
1291 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1292 # their rational equivalent
1293 my %loose_property_name_of; # Loosely maps (non_string) property names to
1295 my %strict_property_name_of; # Strictly maps (non_string) property names to
1297 my %string_property_loose_to_name; # Same, for string properties.
1298 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1299 # the property name in standard loose form, and
1300 # 'value' is the default value for that property,
1301 # also in standard loose form.
1302 my %loose_to_standard_value; # loosely maps table names to the canonical
1304 my %ambiguous_names; # keys are alias names (in standard form) that
1305 # have more than one possible meaning.
1306 my %combination_property; # keys are alias names (in standard form) that
1307 # have both a map table, and a binary one that
1308 # yields true for all non-null maps.
1309 my %prop_aliases; # Keys are standard property name; values are each
1311 my %prop_value_aliases; # Keys of top level are standard property name;
1312 # values are keys to another hash, Each one is
1313 # one of the property's values, in standard form.
1314 # The values are that prop-val's aliases.
1315 my %skipped_files; # List of files that we skip
1316 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1318 # Most properties are immune to caseless matching, otherwise you would get
1319 # nonsensical results, as properties are a function of a code point, not
1320 # everything that is caselessly equivalent to that code point. For example,
1321 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1322 # be true because 's' and 'S' are equivalent caselessly. However,
1323 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1324 # extend that concept to those very few properties that are like this. Each
1325 # such property will match the full range caselessly. They are hard-coded in
1326 # the program; it's not worth trying to make it general as it's extremely
1327 # unlikely that they will ever change.
1328 my %caseless_equivalent_to;
1330 # This is the range of characters that were in Release 1 of Unicode, and
1331 # removed in Release 2 (replaced with the current Hangul syllables starting at
1332 # U+AC00). The range was reused starting in Release 3 for other purposes.
1333 my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1334 my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1336 # These constants names and values were taken from the Unicode standard,
1337 # version 5.1, section 3.12. They are used in conjunction with Hangul
1338 # syllables. The '_string' versions are so generated tables can retain the
1339 # hex format, which is the more familiar value
1340 my $SBase_string = "0xAC00";
1341 my $SBase = CORE::hex $SBase_string;
1342 my $LBase_string = "0x1100";
1343 my $LBase = CORE::hex $LBase_string;
1344 my $VBase_string = "0x1161";
1345 my $VBase = CORE::hex $VBase_string;
1346 my $TBase_string = "0x11A7";
1347 my $TBase = CORE::hex $TBase_string;
1352 my $NCount = $VCount * $TCount;
1354 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1355 # with the above published constants.
1357 my %Jamo_L; # Leading consonants
1358 my %Jamo_V; # Vowels
1359 my %Jamo_T; # Trailing consonants
1361 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1362 # The key is the base name of the code point, and the value is an
1363 # array giving all the ranges that use this base name. Each range
1364 # is actually a hash giving the 'low' and 'high' values of it.
1365 my %names_ending_in_code_point;
1366 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1367 # removed from the names
1368 # Inverse mapping. The list of ranges that have these kinds of
1369 # names. Each element contains the low, high, and base names in an
1371 my @code_points_ending_in_code_point;
1373 # To hold Unicode's normalization test suite
1374 my @normalization_tests;
1376 # Boolean: does this Unicode version have the hangul syllables, and are we
1377 # writing out a table for them?
1378 my $has_hangul_syllables = 0;
1380 # Does this Unicode version have code points whose names end in their
1381 # respective code points, and are we writing out a table for them? 0 for no;
1382 # otherwise points to first property that a table is needed for them, so that
1383 # if multiple tables are needed, we don't create duplicates
1384 my $needing_code_points_ending_in_code_point = 0;
1386 my @backslash_X_tests; # List of tests read in for testing \X
1387 my @SB_tests; # List of tests read in for testing \b{sb}
1388 my @WB_tests; # List of tests read in for testing \b{wb}
1389 my @unhandled_properties; # Will contain a list of properties found in
1390 # the input that we didn't process.
1391 my @match_properties; # Properties that have match tables, to be
1393 my @map_properties; # Properties that get map files written
1394 my @named_sequences; # NamedSequences.txt contents.
1395 my %potential_files; # Generated list of all .txt files in the directory
1396 # structure so we can warn if something is being
1398 my @missing_early_files; # Generated list of absent files that we need to
1399 # proceed in compiling this early Unicode version
1400 my @files_actually_output; # List of files we generated.
1401 my @more_Names; # Some code point names are compound; this is used
1402 # to store the extra components of them.
1403 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1404 # the minimum before we consider it equivalent to a
1405 # candidate rational
1406 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1408 # These store references to certain commonly used property objects
1417 my $Assigned; # All assigned characters in this Unicode release
1418 my $DI; # Default_Ignorable_Code_Point property
1419 my $NChar; # Noncharacter_Code_Point property
1422 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1423 my $has_In_conflicts = 0;
1424 my $has_Is_conflicts = 0;
1426 sub internal_file_to_platform ($) {
1427 # Convert our file paths which have '/' separators to those of the
1431 return undef unless defined $file;
1433 return File::Spec->join(split '/', $file);
1436 sub file_exists ($) { # platform independent '-e'. This program internally
1437 # uses slash as a path separator.
1439 return 0 if ! defined $file;
1440 return -e internal_file_to_platform($file);
1444 # Returns the address of the blessed input object.
1445 # It doesn't check for blessedness because that would do a string eval
1446 # every call, and the program is structured so that this is never called
1447 # for a non-blessed object.
1449 no overloading; # If overloaded, numifying below won't work.
1451 # Numifying a ref gives its address.
1452 return pack 'J', $_[0];
1455 # These are used only if $annotate is true.
1456 # The entire range of Unicode characters is examined to populate these
1457 # after all the input has been processed. But most can be skipped, as they
1458 # have the same descriptive phrases, such as being unassigned
1459 my @viacode; # Contains the 1 million character names
1460 my @age; # And their ages ("" if none)
1461 my @printable; # boolean: And are those characters printable?
1462 my @annotate_char_type; # Contains a type of those characters, specifically
1463 # for the purposes of annotation.
1464 my $annotate_ranges; # A map of ranges of code points that have the same
1465 # name for the purposes of annotation. They map to the
1466 # upper edge of the range, so that the end point can
1467 # be immediately found. This is used to skip ahead to
1468 # the end of a range, and avoid processing each
1469 # individual code point in it.
1470 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1471 # characters, but excluding those which are
1472 # also noncharacter code points
1474 # The annotation types are an extension of the regular range types, though
1475 # some of the latter are folded into one. Make the new types negative to
1476 # avoid conflicting with the regular types
1477 my $SURROGATE_TYPE = -1;
1478 my $UNASSIGNED_TYPE = -2;
1479 my $PRIVATE_USE_TYPE = -3;
1480 my $NONCHARACTER_TYPE = -4;
1481 my $CONTROL_TYPE = -5;
1482 my $ABOVE_UNICODE_TYPE = -6;
1483 my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
1485 sub populate_char_info ($) {
1486 # Used only with the $annotate option. Populates the arrays with the
1487 # input code point's info that are needed for outputting more detailed
1488 # comments. If calling context wants a return, it is the end point of
1489 # any contiguous range of characters that share essentially the same info
1492 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1494 $viacode[$i] = $perl_charname->value_of($i) || "";
1495 $age[$i] = (defined $age)
1496 ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
1497 ? $age->value_of($i)
1501 # A character is generally printable if Unicode says it is,
1502 # but below we make sure that most Unicode general category 'C' types
1504 $printable[$i] = $print->contains($i);
1506 # But the characters in this range were removed in v2.0 and replaced by
1507 # different ones later. Modern fonts will be for the replacement
1508 # characters, so suppress printing them.
1509 if (($v_version lt v2.0
1510 || ($compare_versions && $compare_versions lt v2.0))
1511 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1512 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1517 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1519 # Only these two regular types are treated specially for annotations
1521 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1522 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1524 # Give a generic name to all code points that don't have a real name.
1525 # We output ranges, if applicable, for these. Also calculate the end
1526 # point of the range.
1528 if (! $viacode[$i]) {
1529 if ($i > $MAX_UNICODE_CODEPOINT) {
1530 $viacode[$i] = 'Above-Unicode';
1531 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1533 $end = $MAX_WORKING_CODEPOINT;
1535 elsif ($gc-> table('Private_use')->contains($i)) {
1536 $viacode[$i] = 'Private Use';
1537 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1539 $end = $gc->table('Private_Use')->containing_range($i)->end;
1541 elsif ($NChar->contains($i)) {
1542 $viacode[$i] = 'Noncharacter';
1543 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1545 $end = $NChar->containing_range($i)->end;
1547 elsif ($gc-> table('Control')->contains($i)) {
1548 my $name_ref = property_ref('Name_Alias');
1549 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1550 $viacode[$i] = (defined $name_ref)
1551 ? $name_ref->value_of($i)
1553 $annotate_char_type[$i] = $CONTROL_TYPE;
1556 elsif ($gc-> table('Unassigned')->contains($i)) {
1557 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1559 $viacode[$i] = 'Unassigned';
1561 if (defined $block) { # No blocks in earliest releases
1562 $viacode[$i] .= ', block=' . $block-> value_of($i);
1563 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1565 # Because we name the unassigned by the blocks they are in, it
1566 # can't go past the end of that block, and it also can't go
1567 # past the unassigned range it is in. The special table makes
1568 # sure that the non-characters, which are unassigned, are
1570 $end = min($block->containing_range($i)->end,
1571 $unassigned_sans_noncharacters->
1572 containing_range($i)->end);
1576 while ($unassigned_sans_noncharacters->contains($end)) {
1582 elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1583 $viacode[$i] = 'Surrogate';
1584 $annotate_char_type[$i] = $SURROGATE_TYPE;
1586 $end = $gc->table('Surrogate')->containing_range($i)->end;
1589 Carp::my_carp_bug("Can't figure out how to annotate "
1590 . sprintf("U+%04X", $i)
1591 . ". Proceeding anyway.");
1592 $viacode[$i] = 'UNKNOWN';
1593 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1598 # Here, has a name, but if it's one in which the code point number is
1599 # appended to the name, do that.
1600 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1601 $viacode[$i] .= sprintf("-%04X", $i);
1603 my $limit = $perl_charname->containing_range($i)->end;
1605 # Do all these as groups of the same age, instead of individually,
1606 # because their names are so meaningless, and there are typically
1607 # large quantities of them.
1609 while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1619 # And here, has a name, but if it's a hangul syllable one, replace it with
1620 # the correct name from the Unicode algorithm
1621 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1623 my $SIndex = $i - $SBase;
1624 my $L = $LBase + $SIndex / $NCount;
1625 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1626 my $T = $TBase + $SIndex % $TCount;
1627 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1628 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1629 $end = $perl_charname->containing_range($i)->end;
1632 return if ! defined wantarray;
1633 return $i if ! defined $end; # If not a range, return the input
1635 # Save this whole range so can find the end point quickly
1636 $annotate_ranges->add_map($i, $end, $end);
1641 # Commented code below should work on Perl 5.8.
1642 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1643 ## the native perl version of it (which is what would operate under miniperl)
1644 ## is extremely slow, as it does a string eval every call.
1645 #my $has_fast_scalar_util = $^X !~ /miniperl/
1646 # && defined eval "require Scalar::Util";
1649 # # Returns the address of the blessed input object. Uses the XS version if
1650 # # available. It doesn't check for blessedness because that would do a
1651 # # string eval every call, and the program is structured so that this is
1652 # # never called for a non-blessed object.
1654 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1656 # # Check at least that is a ref.
1657 # my $pkg = ref($_[0]) or return undef;
1659 # # Change to a fake package to defeat any overloaded stringify
1660 # bless $_[0], 'main::Fake';
1662 # # Numifying a ref gives its address.
1663 # my $addr = pack 'J', $_[0];
1665 # # Return to original class
1666 # bless $_[0], $pkg;
1673 return $a if $a >= $b;
1680 return $a if $a <= $b;
1684 sub clarify_number ($) {
1685 # This returns the input number with underscores inserted every 3 digits
1686 # in large (5 digits or more) numbers. Input must be entirely digits, not
1690 my $pos = length($number) - 3;
1691 return $number if $pos <= 1;
1693 substr($number, $pos, 0) = '_';
1699 sub clarify_code_point_count ($) {
1700 # This is like clarify_number(), but the input is assumed to be a count of
1701 # code points, rather than a generic number.
1706 if ($number > $MAX_UNICODE_CODEPOINTS) {
1707 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1708 return "All above-Unicode code points" if $number == 0;
1709 $append = " + all above-Unicode code points";
1711 return clarify_number($number) . $append;
1716 # These routines give a uniform treatment of messages in this program. They
1717 # are placed in the Carp package to cause the stack trace to not include them,
1718 # although an alternative would be to use another package and set @CARP_NOT
1721 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1723 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1724 # and overload trying to load Scalar:Util under miniperl. See
1725 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1726 undef $overload::VERSION;
1729 my $message = shift || "";
1730 my $nofold = shift || 0;
1733 $message = main::join_lines($message);
1734 $message =~ s/^$0: *//; # Remove initial program name
1735 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1736 $message = "\n$0: $message;";
1738 # Fold the message with program name, semi-colon end punctuation
1739 # (which looks good with the message that carp appends to it), and a
1740 # hanging indent for continuation lines.
1741 $message = main::simple_fold($message, "", 4) unless $nofold;
1742 $message =~ s/\n$//; # Remove the trailing nl so what carp
1743 # appends is to the same line
1746 return $message if defined wantarray; # If a caller just wants the msg
1753 # This is called when it is clear that the problem is caused by a bug in
1756 my $message = shift;
1757 $message =~ s/^$0: *//;
1758 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1763 sub carp_too_few_args {
1765 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1769 my $args_ref = shift;
1772 my_carp_bug("Need at least $count arguments to "
1774 . ". Instead got: '"
1775 . join ', ', @$args_ref
1776 . "'. No action taken.");
1780 sub carp_extra_args {
1781 my $args_ref = shift;
1782 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1784 unless (ref $args_ref) {
1785 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1788 my ($package, $file, $line) = caller;
1789 my $subroutine = (caller 1)[3];
1792 if (ref $args_ref eq 'HASH') {
1793 foreach my $key (keys %$args_ref) {
1794 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1796 $list = join ', ', each %{$args_ref};
1798 elsif (ref $args_ref eq 'ARRAY') {
1799 foreach my $arg (@$args_ref) {
1800 $arg = $UNDEF unless defined $arg;
1802 $list = join ', ', @$args_ref;
1805 my_carp_bug("Can't cope with ref "
1807 . " . argument to 'carp_extra_args'. Not checking arguments.");
1811 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1819 # This program uses the inside-out method for objects, as recommended in
1820 # "Perl Best Practices". (This is the best solution still, since this has
1821 # to run under miniperl.) This closure aids in generating those. There
1822 # are two routines. setup_package() is called once per package to set
1823 # things up, and then set_access() is called for each hash representing a
1824 # field in the object. These routines arrange for the object to be
1825 # properly destroyed when no longer used, and for standard accessor
1826 # functions to be generated. If you need more complex accessors, just
1827 # write your own and leave those accesses out of the call to set_access().
1828 # More details below.
1830 my %constructor_fields; # fields that are to be used in constructors; see
1833 # The values of this hash will be the package names as keys to other
1834 # hashes containing the name of each field in the package as keys, and
1835 # references to their respective hashes as values.
1839 # Sets up the package, creating standard DESTROY and dump methods
1840 # (unless already defined). The dump method is used in debugging by
1842 # The optional parameters are:
1843 # a) a reference to a hash, that gets populated by later
1844 # set_access() calls with one of the accesses being
1845 # 'constructor'. The caller can then refer to this, but it is
1846 # not otherwise used by these two routines.
1847 # b) a reference to a callback routine to call during destruction
1848 # of the object, before any fields are actually destroyed
1851 my $constructor_ref = delete $args{'Constructor_Fields'};
1852 my $destroy_callback = delete $args{'Destroy_Callback'};
1853 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1856 my $package = (caller)[0];
1858 $package_fields{$package} = \%fields;
1859 $constructor_fields{$package} = $constructor_ref;
1861 unless ($package->can('DESTROY')) {
1862 my $destroy_name = "${package}::DESTROY";
1865 # Use typeglob to give the anonymous subroutine the name we want
1866 *$destroy_name = sub {
1868 my $addr = do { no overloading; pack 'J', $self; };
1870 $self->$destroy_callback if $destroy_callback;
1871 foreach my $field (keys %{$package_fields{$package}}) {
1872 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1873 delete $package_fields{$package}{$field}{$addr};
1879 unless ($package->can('dump')) {
1880 my $dump_name = "${package}::dump";
1884 return dump_inside_out($self, $package_fields{$package}, @_);
1891 # Arrange for the input field to be garbage collected when no longer
1892 # needed. Also, creates standard accessor functions for the field
1893 # based on the optional parameters-- none if none of these parameters:
1894 # 'addable' creates an 'add_NAME()' accessor function.
1895 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1897 # 'settable' creates a 'set_NAME()' accessor function.
1898 # 'constructor' doesn't create an accessor function, but adds the
1899 # field to the hash that was previously passed to
1901 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1902 # 'add' etc. all mean 'addable'.
1903 # The read accessor function will work on both array and scalar
1904 # values. If another accessor in the parameter list is 'a', the read
1905 # access assumes an array. You can also force it to be array access
1906 # by specifying 'readable_array' instead of 'readable'
1908 # A sort-of 'protected' access can be set-up by preceding the addable,
1909 # readable or settable with some initial portion of 'protected_' (but,
1910 # the underscore is required), like 'p_a', 'pro_set', etc. The
1911 # "protection" is only by convention. All that happens is that the
1912 # accessor functions' names begin with an underscore. So instead of
1913 # calling set_foo, the call is _set_foo. (Real protection could be
1914 # accomplished by having a new subroutine, end_package, called at the
1915 # end of each package, and then storing the __LINE__ ranges and
1916 # checking them on every accessor. But that is way overkill.)
1918 # We create anonymous subroutines as the accessors and then use
1919 # typeglobs to assign them to the proper package and name
1921 my $name = shift; # Name of the field
1922 my $field = shift; # Reference to the inside-out hash containing the
1925 my $package = (caller)[0];
1927 if (! exists $package_fields{$package}) {
1928 croak "$0: Must call 'setup_package' before 'set_access'";
1931 # Stash the field so DESTROY can get it.
1932 $package_fields{$package}{$name} = $field;
1934 # Remaining arguments are the accessors. For each...
1935 foreach my $access (@_) {
1936 my $access = lc $access;
1940 # Match the input as far as it goes.
1941 if ($access =~ /^(p[^_]*)_/) {
1943 if (substr('protected_', 0, length $protected)
1947 # Add 1 for the underscore not included in $protected
1948 $access = substr($access, length($protected) + 1);
1956 if (substr('addable', 0, length $access) eq $access) {
1957 my $subname = "${package}::${protected}add_$name";
1960 # add_ accessor. Don't add if already there, which we
1961 # determine using 'eq' for scalars and '==' otherwise.
1964 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1967 my $addr = do { no overloading; pack 'J', $self; };
1968 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1970 return if grep { $value == $_ } @{$field->{$addr}};
1973 return if grep { $value eq $_ } @{$field->{$addr}};
1975 push @{$field->{$addr}}, $value;
1979 elsif (substr('constructor', 0, length $access) eq $access) {
1981 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1984 $constructor_fields{$package}{$name} = $field;
1987 elsif (substr('readable_array', 0, length $access) eq $access) {
1989 # Here has read access. If one of the other parameters for
1990 # access is array, or this one specifies array (by being more
1991 # than just 'readable_'), then create a subroutine that
1992 # assumes the data is an array. Otherwise just a scalar
1993 my $subname = "${package}::${protected}$name";
1994 if (grep { /^a/i } @_
1995 or length($access) > length('readable_'))
2000 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2001 my $addr = do { no overloading; pack 'J', $_[0]; };
2002 if (ref $field->{$addr} ne 'ARRAY') {
2003 my $type = ref $field->{$addr};
2004 $type = 'scalar' unless $type;
2005 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
2008 return scalar @{$field->{$addr}} unless wantarray;
2010 # Make a copy; had problems with caller modifying the
2011 # original otherwise
2012 my @return = @{$field->{$addr}};
2018 # Here not an array value, a simpler function.
2022 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2024 return $field->{pack 'J', $_[0]};
2028 elsif (substr('settable', 0, length $access) eq $access) {
2029 my $subname = "${package}::${protected}set_$name";
2034 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2035 Carp::carp_extra_args(\@_) if @_ > 2;
2037 # $self is $_[0]; $value is $_[1]
2039 $field->{pack 'J', $_[0]} = $_[1];
2044 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2053 # All input files use this object, which stores various attributes about them,
2054 # and provides for convenient, uniform handling. The run method wraps the
2055 # processing. It handles all the bookkeeping of opening, reading, and closing
2056 # the file, returning only significant input lines.
2058 # Each object gets a handler which processes the body of the file, and is
2059 # called by run(). All character property files must use the generic,
2060 # default handler, which has code scrubbed to handle things you might not
2061 # expect, including automatic EBCDIC handling. For files that don't deal with
2062 # mapping code points to a property value, such as test files,
2063 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2064 # handler to be a custom one. Such a handler should basically be a
2065 # while(next_line()) {...} loop.
2067 # You can also set up handlers to
2068 # 0) call during object construction time, after everything else is done
2069 # 1) call before the first line is read, for pre processing
2070 # 2) call to adjust each line of the input before the main handler gets
2071 # them. This can be automatically generated, if appropriately simple
2072 # enough, by specifiying a Properties parameter in the constructor.
2073 # 3) call upon EOF before the main handler exits its loop
2074 # 4) call at the end, for post processing
2076 # $_ is used to store the input line, and is to be filtered by the
2077 # each_line_handler()s. So, if the format of the line is not in the desired
2078 # format for the main handler, these are used to do that adjusting. They can
2079 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2080 # so the $_ output of one is used as the input to the next. The eof handler
2081 # is also stackable, but none of the others are, but could easily be changed
2084 # Some properties are used by the Perl core but aren't defined until later
2085 # Unicode releases. The perl interpreter would have problems working when
2086 # compiled with an earlier Unicode version that doesn't have them, so we need
2087 # to define them somehow for those releases. The 'Early' constructor
2088 # parameter can be used to automatically handle this. It is essentially
2089 # ignored if the Unicode version being compiled has a data file for this
2090 # property. Either code to execute or a file to read can be specified.
2091 # Details are at the %early definition.
2093 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2094 # which insert the parameters as lines to be processed before the next input
2095 # file line is read. This allows the EOF handler(s) to flush buffers, for
2096 # example. The difference between the two routines is that the lines inserted
2097 # by insert_lines() are subjected to the each_line_handler()s. (So if you
2098 # called it from such a handler, you would get infinite recursion without some
2099 # mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
2100 # directly to the main handler without any adjustments. If the
2101 # post-processing handler calls any of these, there will be no effect. Some
2102 # error checking for these conditions could be added, but it hasn't been done.
2104 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2105 # to prevent further processing of the line. This routine will output the
2106 # message as a warning once, and then keep a count of the lines that have the
2107 # same message, and output that count at the end of the file's processing.
2108 # This keeps the number of messages down to a manageable amount.
2110 # get_missings() should be called to retrieve any @missing input lines.
2111 # Messages will be raised if this isn't done if the options aren't to ignore
2114 sub trace { return main::trace(@_); }
2117 # Keep track of fields that are to be put into the constructor.
2118 my %constructor_fields;
2120 main::setup_package(Constructor_Fields => \%constructor_fields);
2122 my %file; # Input file name, required
2123 main::set_access('file', \%file, qw{ c r });
2125 my %first_released; # Unicode version file was first released in, required
2126 main::set_access('first_released', \%first_released, qw{ c r });
2128 my %handler; # Subroutine to process the input file, defaults to
2129 # 'process_generic_property_file'
2130 main::set_access('handler', \%handler, qw{ c });
2133 # name of property this file is for. defaults to none, meaning not
2134 # applicable, or is otherwise determinable, for example, from each line.
2135 main::set_access('property', \%property, qw{ c r });
2138 # This is either an unsigned number, or a list of property names. In the
2139 # former case, if it is non-zero, it means the file is optional, so if the
2140 # file is absent, no warning about that is output. In the latter case, it
2141 # is a list of properties that the file (exclusively) defines. If the
2142 # file is present, tables for those properties will be produced; if
2143 # absent, none will, even if they are listed elsewhere (namely
2144 # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2145 # and no warnings will be raised about them not being available. (And no
2146 # warning about the file itself will be raised.)
2147 main::set_access('optional', \%optional, qw{ c readable_array } );
2150 # This is used for debugging, to skip processing of all but a few input
2151 # files. Add 'non_skip => 1' to the constructor for those files you want
2152 # processed when you set the $debug_skip global.
2153 main::set_access('non_skip', \%non_skip, 'c');
2156 # This is used to skip processing of this input file (semi-) permanently.
2157 # The value should be the reason the file is being skipped. It is used
2158 # for files that we aren't planning to process anytime soon, but want to
2159 # allow to be in the directory and be checked for their names not
2160 # conflicting with any other files on a DOS 8.3 name filesystem, but to
2161 # not otherwise be processed, and to not raise a warning about not being
2162 # handled. In the constructor call, any value that evaluates to a numeric
2163 # 0 or undef means don't skip. Any other value is a string giving the
2164 # reason it is being skippped, and this will appear in generated pod.
2165 # However, an empty string reason will suppress the pod entry.
2166 # Internally, calls that evaluate to numeric 0 are changed into undef to
2167 # distinguish them from an empty string call.
2168 main::set_access('skip', \%skip, 'c', 'r');
2170 my %each_line_handler;
2171 # list of subroutines to look at and filter each non-comment line in the
2172 # file. defaults to none. The subroutines are called in order, each is
2173 # to adjust $_ for the next one, and the final one adjusts it for
2175 main::set_access('each_line_handler', \%each_line_handler, 'c');
2177 my %properties; # Optional ordered list of the properties that occur in each
2178 # meaningful line of the input file. If present, an appropriate
2179 # each_line_handler() is automatically generated and pushed onto the stack
2180 # of such handlers. This is useful when a file contains multiple
2181 # proerties per line, but no other special considerations are necessary.
2182 # The special value "<ignored>" means to discard the corresponding input
2184 # Any @missing lines in the file should also match this syntax; no such
2185 # files exist as of 6.3. But if it happens in a future release, the code
2186 # could be expanded to properly parse them.
2187 main::set_access('properties', \%properties, qw{ c r });
2189 my %has_missings_defaults;
2190 # ? Are there lines in the file giving default values for code points
2191 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2192 # the norm, but IGNORED means it has such lines, but the handler doesn't
2193 # use them. Having these three states allows us to catch changes to the
2194 # UCD that this program should track. XXX This could be expanded to
2195 # specify the syntax for such lines, like %properties above.
2196 main::set_access('has_missings_defaults',
2197 \%has_missings_defaults, qw{ c r });
2199 my %construction_time_handler;
2200 # Subroutine to call at the end of the new method. If undef, no such
2201 # handler is called.
2202 main::set_access('construction_time_handler',
2203 \%construction_time_handler, qw{ c });
2206 # Subroutine to call before doing anything else in the file. If undef, no
2207 # such handler is called.
2208 main::set_access('pre_handler', \%pre_handler, qw{ c });
2211 # Subroutines to call upon getting an EOF on the input file, but before
2212 # that is returned to the main handler. This is to allow buffers to be
2213 # flushed. The handler is expected to call insert_lines() or
2214 # insert_adjusted() with the buffered material
2215 main::set_access('eof_handler', \%eof_handler, qw{ c });
2218 # Subroutine to call after all the lines of the file are read in and
2219 # processed. If undef, no such handler is called. Note that this cannot
2220 # add lines to be processed; instead use eof_handler
2221 main::set_access('post_handler', \%post_handler, qw{ c });
2223 my %progress_message;
2224 # Message to print to display progress in lieu of the standard one
2225 main::set_access('progress_message', \%progress_message, qw{ c });
2228 # cache open file handle, internal. Is undef if file hasn't been
2229 # processed at all, empty if has;
2230 main::set_access('handle', \%handle);
2233 # cache of lines added virtually to the file, internal
2234 main::set_access('added_lines', \%added_lines);
2237 # cache of lines added virtually to the file, internal
2238 main::set_access('remapped_lines', \%remapped_lines);
2241 # cache of errors found, internal
2242 main::set_access('errors', \%errors);
2245 # storage of '@missing' defaults lines
2246 main::set_access('missings', \%missings);
2249 # Used for properties that must be defined (for Perl's purposes) on
2250 # versions of Unicode earlier than Unicode itself defines them. The
2251 # parameter is an array (it would be better to be a hash, but not worth
2252 # bothering about due to its rare use).
2254 # The first element is either a code reference to call when in a release
2255 # earlier than the Unicode file is available in, or it is an alternate
2256 # file to use instead of the non-existent one. This file must have been
2257 # plunked down in the same directory as mktables. Should you be compiling
2258 # on a release that needs such a file, mktables will abort the
2259 # compilation, and tell you where to get the necessary file(s), and what
2260 # name(s) to use to store them as.
2261 # In the case of specifying an alternate file, the array must contain two
2264 # [1] is the name of the property that will be generated by this file.
2265 # The class automatically takes the input file and excludes any code
2266 # points in it that were not assigned in the Unicode version being
2267 # compiled. It then uses this result to define the property in the given
2268 # version. Since the property doesn't actually exist in the Unicode
2269 # version being compiled, this should be a name accessible only by core
2270 # perl. If it is the same name as the regular property, the constructor
2271 # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2272 # get output, and so will be unusable by non-core code. Otherwise it gets
2273 # marked as $INTERNAL_ONLY.
2275 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2276 # the Hangul syllables in that release (which were ripped out in version
2277 # 2) for the given property . (Hence it is ignored except when compiling
2278 # version 1. You only get one value that applies to all of them, which
2279 # may not be the actual reality, but probably nobody cares anyway for
2280 # these obsolete characters.)
2282 # Not all files can be handled in the above way, and so the code ref
2283 # alternative is available. It can do whatever it needs to. The other
2284 # array elements are optional in this case, and the code is free to use or
2285 # ignore them if they are present.
2287 # Internally, the constructor unshifts a 0 or 1 onto this array to
2288 # indicate if an early alternative is actually being used or not. This
2289 # makes for easier testing later on.
2290 main::set_access('early', \%early, 'c');
2292 my %required_even_in_debug_skip;
2293 # debug_skip is used to speed up compilation during debugging by skipping
2294 # processing files that are not needed for the task at hand. However,
2295 # some files pretty much can never be skipped, and this is used to specify
2296 # that this is one of them. In order to skip this file, the call to the
2297 # constructor must be edited to comment out this parameter.
2298 main::set_access('required_even_in_debug_skip',
2299 \%required_even_in_debug_skip, 'c');
2302 # Some files get removed from the Unicode DB. This is a version object
2303 # giving the first release without this file.
2304 main::set_access('withdrawn', \%withdrawn, 'c');
2306 my %in_this_release;
2307 # Calculated value from %first_released and %withdrawn. Are we compiling
2308 # a Unicode release which includes this file?
2309 main::set_access('in_this_release', \%in_this_release);
2312 sub _next_line_with_remapped_range;
2317 my $self = bless \do{ my $anonymous_scalar }, $class;
2318 my $addr = do { no overloading; pack 'J', $self; };
2321 $handler{$addr} = \&main::process_generic_property_file;
2322 $non_skip{$addr} = 0;
2323 $skip{$addr} = undef;
2324 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2325 $handle{$addr} = undef;
2326 $added_lines{$addr} = [ ];
2327 $remapped_lines{$addr} = [ ];
2328 $each_line_handler{$addr} = [ ];
2329 $eof_handler{$addr} = [ ];
2330 $errors{$addr} = { };
2331 $missings{$addr} = [ ];
2332 $early{$addr} = [ ];
2333 $optional{$addr} = [ ];
2335 # Two positional parameters.
2336 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2337 $file{$addr} = main::internal_file_to_platform(shift);
2338 $first_released{$addr} = shift;
2340 # The rest of the arguments are key => value pairs
2341 # %constructor_fields has been set up earlier to list all possible
2342 # ones. Either set or push, depending on how the default has been set
2345 foreach my $key (keys %args) {
2346 my $argument = $args{$key};
2348 # Note that the fields are the lower case of the constructor keys
2349 my $hash = $constructor_fields{lc $key};
2350 if (! defined $hash) {
2351 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2354 if (ref $hash->{$addr} eq 'ARRAY') {
2355 if (ref $argument eq 'ARRAY') {
2356 foreach my $argument (@{$argument}) {
2357 next if ! defined $argument;
2358 push @{$hash->{$addr}}, $argument;
2362 push @{$hash->{$addr}}, $argument if defined $argument;
2366 $hash->{$addr} = $argument;
2371 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2373 # Convert 0 (meaning don't skip) to undef
2374 undef $skip{$addr} unless $skip{$addr};
2376 # Handle the case where this file is optional
2377 my $pod_message_for_non_existent_optional = "";
2378 if ($optional{$addr}->@*) {
2380 # First element is the pod message
2381 $pod_message_for_non_existent_optional
2382 = shift $optional{$addr}->@*;
2383 # Convert a 0 'Optional' argument to an empty list to make later
2384 # code more concise.
2385 if ( $optional{$addr}->@*
2386 && $optional{$addr}->@* == 1
2387 && $optional{$addr}[0] ne ""
2388 && $optional{$addr}[0] !~ /\D/
2389 && $optional{$addr}[0] == 0)
2391 $optional{$addr} = [ ];
2393 else { # But if the only element doesn't evaluate to 0, make sure
2394 # that this file is indeed considered optional below.
2395 unshift $optional{$addr}->@*, 1;
2400 my $function_instead_of_file = 0;
2402 # If we are compiling a Unicode release earlier than the file became
2403 # available, the constructor may have supplied a substitute
2404 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2406 # Yes, we have a substitute, that we will use; mark it so
2407 unshift $early{$addr}->@*, 1;
2409 # See the definition of %early for what the array elements mean.
2410 # If we have a property this defines, create a table and default
2411 # map for it now (at essentially compile time), so that it will be
2412 # available for the whole of run time. (We will want to add this
2413 # name as an alias when we are using the official property name;
2414 # but this must be deferred until run(), because at construction
2415 # time the official names have yet to be defined.)
2416 if ($early{$addr}[2]) {
2417 my $fate = ($property{$addr}
2418 && $property{$addr} eq $early{$addr}[2])
2421 my $prop_object = Property->new($early{$addr}[2],
2423 Perl_Extension => 1,
2426 # Use the default mapping for the regular property for this
2428 if ( defined $property{$addr}
2429 && defined $default_mapping{$property{$addr}})
2432 ->set_default_map($default_mapping{$property{$addr}});
2436 if (ref $early{$addr}[1] eq 'CODE') {
2437 $function_instead_of_file = 1;
2439 # If the first element of the array is a code ref, the others
2441 $handler{$addr} = $early{$addr}[1];
2442 $property{$addr} = $early{$addr}[2]
2443 if defined $early{$addr}[2];
2444 $progress = "substitute $file{$addr}";
2448 else { # Specifying a substitute file
2450 if (! main::file_exists($early{$addr}[1])) {
2452 # If we don't see the substitute file, generate an error
2453 # message giving the needed things, and add it to the list
2454 # of such to output before actual processing happens
2455 # (hence the user finds out all of them in one run).
2456 # Instead of creating a general method for NameAliases,
2457 # hard-code it here, as there is unlikely to ever be a
2458 # second one which needs special handling.
2459 my $string_version = ($file{$addr} eq "NameAliases.txt")
2460 ? 'at least 6.1 (the later, the better)'
2461 : sprintf "%vd", $first_released{$addr};
2462 push @missing_early_files, <<END;
2463 '$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2468 $progress = $early{$addr}[1];
2469 $progress .= ", substituting for $file{$addr}" if $file{$addr};
2470 $file{$addr} = $early{$addr}[1];
2471 $property{$addr} = $early{$addr}[2];
2473 # Ignore code points not in the version being compiled
2474 push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2476 if ( $v_version lt v2.0 # Hanguls in this release ...
2477 && defined $early{$addr}[3]) # ... need special treatment
2479 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2483 # And this substitute is valid for all releases.
2484 $first_released{$addr} = v0;
2486 else { # Normal behavior
2487 $progress = $file{$addr};
2488 unshift $early{$addr}->@*, 0; # No substitute
2491 my $file = $file{$addr};
2492 $progress_message{$addr} = "Processing $progress"
2493 unless $progress_message{$addr};
2495 # A file should be there if it is within the window of versions for
2496 # which Unicode supplies it
2497 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2498 $in_this_release{$addr} = 0;
2502 $in_this_release{$addr} = $first_released{$addr} le $v_version;
2504 # Check that the file for this object (possibly using a substitute
2505 # for early releases) exists or we have a function alternative
2506 if ( ! $function_instead_of_file
2507 && ! main::file_exists($file))
2509 # Here there is nothing available for this release. This is
2510 # fine if we aren't expecting anything in this release.
2511 if (! $in_this_release{$addr}) {
2512 $skip{$addr} = ""; # Don't remark since we expected
2513 # nothing and got nothing
2515 elsif ($optional{$addr}->@*) {
2517 # Here the file is optional in this release; Use the
2518 # passed in text to document this case in the pod.
2519 $skip{$addr} = $pod_message_for_non_existent_optional;
2521 elsif ( $in_this_release{$addr}
2522 && ! defined $skip{$addr}
2524 { # Doesn't exist but should.
2525 $skip{$addr} = "'$file' not found. Possibly Big problems";
2526 Carp::my_carp($skip{$addr});
2529 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2532 # The file exists; if not skipped for another reason, and we are
2533 # skipping most everything during debugging builds, use that as
2535 $skip{$addr} = '$debug_skip is on'
2541 && ! $required_even_in_debug_skip{$addr}
2544 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2547 # Here, we have figured out if we will be skipping this file or not.
2548 # If so, we add any single property it defines to any passed in
2549 # optional property list. These will be dealt with at run time.
2550 if (defined $skip{$addr}) {
2551 if ($property{$addr}) {
2552 push $optional{$addr}->@*, $property{$addr};
2554 } # Otherwise, are going to process the file.
2555 elsif ($property{$addr}) {
2557 # If the file has a property defined in the constructor for it, it
2558 # means that the property is not listed in the file's entries. So
2559 # add a handler (to the list of line handlers) to insert the
2560 # property name into the lines, to provide a uniform interface to
2561 # the final processing subroutine.
2562 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2564 elsif ($properties{$addr}) {
2566 # Similarly, there may be more than one property represented on
2567 # each line, with no clue but the constructor input what those
2568 # might be. Add a handler for each line in the input so that it
2569 # creates a separate input line for each property in those input
2570 # lines, thus making them suitable to handle generically.
2572 push @{$each_line_handler{$addr}},
2575 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2577 my @fields = split /\s*;\s*/, $_, -1;
2579 if (@fields - 1 > @{$properties{$addr}}) {
2580 $file->carp_bad_line('Extra fields');
2584 my $range = shift @fields; # 0th element is always the
2587 # The next fields in the input line correspond
2588 # respectively to the stored properties.
2589 for my $i (0 .. @{$properties{$addr}} - 1) {
2590 my $property_name = $properties{$addr}[$i];
2591 next if $property_name eq '<ignored>';
2592 $file->insert_adjusted_lines(
2593 "$range; $property_name; $fields[$i]");
2601 { # On non-ascii platforms, we use a special pre-handler
2604 *next_line = (main::NON_ASCII_PLATFORM)
2605 ? *_next_line_with_remapped_range
2609 &{$construction_time_handler{$addr}}($self)
2610 if $construction_time_handler{$addr};
2618 qw("") => "_operator_stringify",
2619 "." => \&main::_operator_dot,
2620 ".=" => \&main::_operator_dot_equal,
2623 sub _operator_stringify {
2626 return __PACKAGE__ . " object for " . $self->file;
2630 # Process the input object $self. This opens and closes the file and
2631 # calls all the handlers for it. Currently, this can only be called
2632 # once per file, as it destroy's the EOF handlers
2634 # flag to make sure extracted files are processed early
2635 state $seen_non_extracted = 0;
2638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2640 my $addr = do { no overloading; pack 'J', $self; };
2642 my $file = $file{$addr};
2645 $handle{$addr} = 'pretend_is_open';
2648 if ($seen_non_extracted) {
2649 if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2650 # case of the file's name
2652 Carp::my_carp_bug(main::join_lines(<<END
2653 $file should be processed just after the 'Prop...Alias' files, and before
2654 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2655 have subtle problems
2660 elsif ($EXTRACTED_DIR
2662 # We only do this check for generic property files
2663 && $handler{$addr} == \&main::process_generic_property_file
2665 && $file !~ /$EXTRACTED/i)
2667 # We don't set this (by the 'if' above) if we have no
2668 # extracted directory, so if running on an early version,
2669 # this test won't work. Not worth worrying about.
2670 $seen_non_extracted = 1;
2673 # Mark the file as having being processed, and warn if it
2674 # isn't a file we are expecting. As we process the files,
2675 # they are deleted from the hash, so any that remain at the
2676 # end of the program are files that we didn't process.
2677 my $fkey = File::Spec->rel2abs($file);
2678 my $exists = delete $potential_files{lc($fkey)};
2680 Carp::my_carp("Was not expecting '$file'.")
2681 if $exists && ! $in_this_release{$addr};
2683 # If there is special handling for compiling Unicode releases
2684 # earlier than the first one in which Unicode defines this
2686 if ($early{$addr}->@* > 1) {
2688 # Mark as processed any substitute file that would be used in
2690 $fkey = File::Spec->rel2abs($early{$addr}[1]);
2691 delete $potential_files{lc($fkey)};
2693 # As commented in the constructor code, when using the
2694 # official property, we still have to allow the publicly
2695 # inaccessible early name so that the core code which uses it
2696 # will work regardless.
2697 if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
2698 my $early_property_name = $early{$addr}[2];
2699 if ($property{$addr} ne $early_property_name) {
2700 main::property_ref($property{$addr})
2701 ->add_alias($early_property_name);
2706 # We may be skipping this file ...
2707 if (defined $skip{$addr}) {
2709 # If the file isn't supposed to be in this release, there is
2711 if ($in_this_release{$addr}) {
2713 # But otherwise, we may print a message
2715 print STDERR "Skipping input file '$file'",
2716 " because '$skip{$addr}'\n";
2719 # And add it to the list of skipped files, which is later
2720 # used to make the pod
2721 $skipped_files{$file} = $skip{$addr};
2723 # The 'optional' list contains properties that are also to
2724 # be skipped along with the file. (There may also be
2725 # digits which are just placeholders to make sure it isn't
2727 foreach my $property ($optional{$addr}->@*) {
2728 next unless $property =~ /\D/;
2729 my $prop_object = main::property_ref($property);
2730 next unless defined $prop_object;
2731 $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2738 # Here, we are going to process the file. Open it, converting the
2739 # slashes used in this program into the proper form for the OS
2741 if (not open $file_handle, "<", $file) {
2742 Carp::my_carp("Can't open $file. Skipping: $!");
2745 $handle{$addr} = $file_handle; # Cache the open file handle
2747 # If possible, make sure that the file is the correct version.
2748 # (This data isn't available on early Unicode releases or in
2749 # UnicodeData.txt.) We don't do this check if we are using a
2750 # substitute file instead of the official one (though the code
2751 # could be extended to do so).
2752 if ($in_this_release{$addr}
2753 && ! $early{$addr}[0]
2754 && lc($file) ne 'unicodedata.txt')
2756 if ($file !~ /^Unihan/i) {
2758 # The non-Unihan files started getting version numbers in
2759 # 3.2, but some files in 4.0 are unchanged from 3.2, and
2760 # marked as 3.2. 4.0.1 is the first version where there
2761 # are no files marked as being from less than 4.0, though
2762 # some are marked as 4.0. In versions after that, the
2763 # numbers are correct.
2764 if ($v_version ge v4.0.1) {
2765 $_ = <$file_handle>; # The version number is in the
2767 if ($_ !~ / - $string_version \. /x) {
2771 # 4.0.1 had some valid files that weren't updated.
2772 if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
2773 die Carp::my_carp("File '$file' is version "
2774 . "'$_'. It should be "
2775 . "version $string_version");
2780 elsif ($v_version ge v6.0.0) { # Unihan
2782 # Unihan files didn't get accurate version numbers until
2783 # 6.0. The version is somewhere in the first comment
2785 while (<$file_handle>) {
2787 Carp::my_carp_bug("Could not find the expected "
2788 . "version info in file '$file'");
2793 next if $_ !~ / version: /x;
2794 last if $_ =~ /$string_version/;
2795 die Carp::my_carp("File '$file' is version "
2796 . "'$_'. It should be "
2797 . "version $string_version");
2803 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2805 # Call any special handler for before the file.
2806 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2808 # Then the main handler
2809 &{$handler{$addr}}($self);
2811 # Then any special post-file handler.
2812 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2814 # If any errors have been accumulated, output the counts (as the first
2815 # error message in each class was output when it was encountered).
2816 if ($errors{$addr}) {
2819 foreach my $error (keys %{$errors{$addr}}) {
2820 $total += $errors{$addr}->{$error};
2821 delete $errors{$addr}->{$error};
2826 = "A total of $total lines had errors in $file. ";
2828 $message .= ($types == 1)
2829 ? '(Only the first one was displayed.)'
2830 : '(Only the first of each type was displayed.)';
2831 Carp::my_carp($message);
2835 if (@{$missings{$addr}}) {
2836 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2839 # If a real file handle, close it.
2840 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2842 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2843 # the file, as opposed to undef
2848 # Sets $_ to be the next logical input line, if any. Returns non-zero
2849 # if such a line exists. 'logical' means that any lines that have
2850 # been added via insert_lines() will be returned in $_ before the file
2854 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2856 my $addr = do { no overloading; pack 'J', $self; };
2858 # Here the file is open (or if the handle is not a ref, is an open
2859 # 'virtual' file). Get the next line; any inserted lines get priority
2860 # over the file itself.
2864 while (1) { # Loop until find non-comment, non-empty line
2865 #local $to_trace = 1 if main::DEBUG;
2866 my $inserted_ref = shift @{$added_lines{$addr}};
2867 if (defined $inserted_ref) {
2868 ($adjusted, $_) = @{$inserted_ref};
2869 trace $adjusted, $_ if main::DEBUG && $to_trace;
2870 return 1 if $adjusted;
2873 last if ! ref $handle{$addr}; # Don't read unless is real file
2874 last if ! defined ($_ = readline $handle{$addr});
2877 trace $_ if main::DEBUG && $to_trace;
2879 # See if this line is the comment line that defines what property
2880 # value that code points that are not listed in the file should
2881 # have. The format or existence of these lines is not guaranteed
2882 # by Unicode since they are comments, but the documentation says
2883 # that this was added for machine-readability, so probably won't
2884 # change. This works starting in Unicode Version 5.0. They look
2887 # @missing: 0000..10FFFF; Not_Reordered
2888 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2889 # @missing: 0000..10FFFF; ; NaN
2891 # Save the line for a later get_missings() call.
2892 if (/$missing_defaults_prefix/) {
2893 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2894 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2896 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2897 my @defaults = split /\s* ; \s*/x, $_;
2899 # The first field is the @missing, which ends in a
2900 # semi-colon, so can safely shift.
2903 # Some of these lines may have empty field placeholders
2904 # which get in the way. An example is:
2905 # @missing: 0000..10FFFF; ; NaN
2906 # Remove them. Process starting from the top so the
2907 # splice doesn't affect things still to be looked at.
2908 for (my $i = @defaults - 1; $i >= 0; $i--) {
2909 next if $defaults[$i] ne "";
2910 splice @defaults, $i, 1;
2913 # What's left should be just the property (maybe) and the
2914 # default. Having only one element means it doesn't have
2918 if (@defaults >= 1) {
2919 if (@defaults == 1) {
2920 $default = $defaults[0];
2923 $property = $defaults[0];
2924 $default = $defaults[1];
2930 || ($default =~ /^</
2931 && $default !~ /^<code *point>$/i
2932 && $default !~ /^<none>$/i
2933 && $default !~ /^<script>$/i))
2935 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2939 # If the property is missing from the line, it should
2940 # be the one for the whole file
2941 $property = $property{$addr} if ! defined $property;
2943 # Change <none> to the null string, which is what it
2944 # really means. If the default is the code point
2945 # itself, set it to <code point>, which is what
2946 # Unicode uses (but sometimes they've forgotten the
2948 if ($default =~ /^<none>$/i) {
2951 elsif ($default =~ /^<code *point>$/i) {
2952 $default = $CODE_POINT;
2954 elsif ($default =~ /^<script>$/i) {
2956 # Special case this one. Currently is from
2957 # ScriptExtensions.txt, and means for all unlisted
2958 # code points, use their Script property values.
2959 # For the code points not listed in that file, the
2960 # default value is 'Unknown'.
2961 $default = "Unknown";
2964 # Store them as a sub-arrays with both components.
2965 push @{$missings{$addr}}, [ $default, $property ];
2969 # There is nothing for the caller to process on this comment
2974 # Remove comments and trailing space, and skip this line if the
2980 # Call any handlers for this line, and skip further processing of
2981 # the line if the handler sets the line to null.
2982 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2987 # Here the line is ok. return success.
2989 } # End of looping through lines.
2991 # If there are EOF handlers, call each (only once) and if it generates
2992 # more lines to process go back in the loop to handle them.
2993 while ($eof_handler{$addr}->@*) {
2994 &{$eof_handler{$addr}[0]}($self);
2995 shift $eof_handler{$addr}->@*; # Currently only get one shot at it.
2996 goto LINE if $added_lines{$addr};
2999 # Return failure -- no more lines.
3004 sub _next_line_with_remapped_range {
3006 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3008 # like _next_line(), but for use on non-ASCII platforms. It sets $_
3009 # to be the next logical input line, if any. Returns non-zero if such
3010 # a line exists. 'logical' means that any lines that have been added
3011 # via insert_lines() will be returned in $_ before the file is read
3014 # The difference from _next_line() is that this remaps the Unicode
3015 # code points in the input to those of the native platform. Each
3016 # input line contains a single code point, or a single contiguous
3017 # range of them This routine splits each range into its individual
3018 # code points and caches them. It returns the cached values,
3019 # translated into their native equivalents, one at a time, for each
3020 # call, before reading the next line. Since native values can only be
3021 # a single byte wide, no translation is needed for code points above
3022 # 0xFF, and ranges that are entirely above that number are not split.
3023 # If an input line contains the range 254-1000, it would be split into
3024 # three elements: 254, 255, and 256-1000. (The downstream table
3025 # insertion code will sort and coalesce the individual code points
3026 # into appropriate ranges.)
3028 my $addr = do { no overloading; pack 'J', $self; };
3032 # Look in cache before reading the next line. Return any cached
3034 my $inserted = shift @{$remapped_lines{$addr}};
3035 if (defined $inserted) {
3036 trace $inserted if main::DEBUG && $to_trace;
3037 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3038 trace $_ if main::DEBUG && $to_trace;
3042 # Get the next line.
3043 return 0 unless _next_line($self);
3045 # If there is a special handler for it, return the line,
3046 # untranslated. This should happen only for files that are
3047 # special, not being code-point related, such as property names.
3048 return 1 if $handler{$addr}
3049 != \&main::process_generic_property_file;
3051 my ($range, $property_name, $map, @remainder)
3052 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3055 || ! defined $property_name
3056 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3058 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored");
3062 my $high = (defined $2) ? hex $2 : $low;
3064 # If the input maps the range to another code point, remap the
3065 # target if it is between 0 and 255.
3068 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3069 $tail = "$property_name; $map";
3070 $_ = "$range; $tail";
3073 $tail = $property_name;
3076 # If entire range is above 255, just return it, unchanged (except
3077 # any mapped-to code point, already changed above)
3078 return 1 if $low > 255;
3080 # Cache an entry for every code point < 255. For those in the
3081 # range above 255, return a dummy entry for just that portion of
3082 # the range. Note that this will be out-of-order, but that is not
3084 foreach my $code_point ($low .. $high) {
3085 if ($code_point > 255) {
3086 $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3089 push @{$remapped_lines{$addr}}, "$code_point; $tail";
3091 } # End of looping through lines.
3096 # Not currently used, not fully tested.
3098 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
3099 # # record. Not callable from an each_line_handler(), nor does it call
3100 # # an each_line_handler() on the line.
3103 # my $addr = do { no overloading; pack 'J', $self; };
3105 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
3106 # my ($adjusted, $line) = @{$inserted_ref};
3107 # next if $adjusted;
3109 # # Remove comments and trailing space, and return a non-empty
3112 # $line =~ s/\s+$//;
3113 # return $line if $line ne "";
3116 # return if ! ref $handle{$addr}; # Don't read unless is real file
3117 # while (1) { # Loop until find non-comment, non-empty line
3118 # local $to_trace = 1 if main::DEBUG;
3119 # trace $_ if main::DEBUG && $to_trace;
3120 # return if ! defined (my $line = readline $handle{$addr});
3122 # push @{$added_lines{$addr}}, [ 0, $line ];
3125 # $line =~ s/\s+$//;
3126 # return $line if $line ne "";
3134 # Lines can be inserted so that it looks like they were in the input
3135 # file at the place it was when this routine is called. See also
3136 # insert_adjusted_lines(). Lines inserted via this routine go through
3137 # any each_line_handler()
3141 # Each inserted line is an array, with the first element being 0 to
3142 # indicate that this line hasn't been adjusted, and needs to be
3145 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
3149 sub insert_adjusted_lines {
3150 # Lines can be inserted so that it looks like they were in the input
3151 # file at the place it was when this routine is called. See also
3152 # insert_lines(). Lines inserted via this routine are already fully
3153 # adjusted, ready to be processed; each_line_handler()s handlers will
3154 # not be called. This means this is not a completely general
3155 # facility, as only the last each_line_handler on the stack should
3156 # call this. It could be made more general, by passing to each of the
3157 # line_handlers their position on the stack, which they would pass on
3158 # to this routine, and that would replace the boolean first element in
3159 # the anonymous array pushed here, so that the next_line routine could
3160 # use that to call only those handlers whose index is after it on the
3161 # stack. But this is overkill for what is needed now.
3164 trace $_[0] if main::DEBUG && $to_trace;
3166 # Each inserted line is an array, with the first element being 1 to
3167 # indicate that this line has been adjusted
3169 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
3174 # Returns the stored up @missings lines' values, and clears the list.
3175 # The values are in an array, consisting of the default in the first
3176 # element, and the property in the 2nd. However, since these lines
3177 # can be stacked up, the return is an array of all these arrays.
3180 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3182 my $addr = do { no overloading; pack 'J', $self; };
3184 # If not accepting a list return, just return the first one.
3185 return shift @{$missings{$addr}} unless wantarray;
3187 my @return = @{$missings{$addr}};
3188 undef @{$missings{$addr}};
3192 sub _exclude_unassigned {
3194 # Takes the range in $_ and excludes code points that aren't assigned
3197 state $skip_inserted_count = 0;
3199 # Ignore recursive calls.
3200 if ($skip_inserted_count) {
3201 $skip_inserted_count--;
3205 # Find what code points are assigned in this release
3206 main::calculate_Assigned() if ! defined $Assigned;
3209 my $addr = do { no overloading; pack 'J', $self; };
3210 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3212 my ($range, @remainder)
3213 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3215 # Examine the range.
3216 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3219 my $high = (defined $2) ? hex $2 : $low;
3221 # Split the range into subranges of just those code points in it
3222 # that are assigned.
3223 my @ranges = (Range_List->new(Initialize
3224 => Range->new($low, $high)) & $Assigned)->ranges;
3226 # Do nothing if nothing in the original range is assigned in this
3227 # release; handle normally if everything is in this release.
3231 elsif (@ranges != 1) {
3233 # Here, some code points in the original range aren't in this
3234 # release; @ranges gives the ones that are. Create fake input
3235 # lines for each of the ranges, and set things up so that when
3236 # this routine is called on that fake input, it will do
3238 $skip_inserted_count = @ranges;
3239 my $remainder = join ";", @remainder;
3240 for my $range (@ranges) {
3241 $self->insert_lines(sprintf("%04X..%04X;%s",
3242 $range->start, $range->end, $remainder));
3244 $_ = ""; # The original range is now defunct.
3251 sub _fixup_obsolete_hanguls {
3253 # This is called only when compiling Unicode version 1. All Unicode
3254 # data for subsequent releases assumes that the code points that were
3255 # Hangul syllables in this release only are something else, so if
3256 # using such data, we have to override it
3259 my $addr = do { no overloading; pack 'J', $self; };
3260 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3262 my $object = main::property_ref($property{$addr});
3263 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3264 $FINAL_REMOVED_HANGUL_SYLLABLE,
3265 $early{$addr}[3], # Passed-in value for these
3266 Replace => $UNCONDITIONALLY);
3269 sub _insert_property_into_line {
3270 # Add a property field to $_, if this file requires it.
3273 my $addr = do { no overloading; pack 'J', $self; };
3274 my $property = $property{$addr};
3275 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3277 $_ =~ s/(;|$)/; $property$1/;
3282 # Output consistent error messages, using either a generic one, or the
3283 # one given by the optional parameter. To avoid gazillions of the
3284 # same message in case the syntax of a file is way off, this routine
3285 # only outputs the first instance of each message, incrementing a
3286 # count so the totals can be output at the end of the file.
3289 my $message = shift;
3290 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3292 my $addr = do { no overloading; pack 'J', $self; };
3294 $message = 'Unexpected line' unless $message;
3296 # No trailing punctuation so as to fit with our addenda.
3297 $message =~ s/[.:;,]$//;
3299 # If haven't seen this exact message before, output it now. Otherwise
3300 # increment the count of how many times it has occurred
3301 unless ($errors{$addr}->{$message}) {
3302 Carp::my_carp("$message in '$_' in "
3304 . " at line $.. Skipping this line;");
3305 $errors{$addr}->{$message} = 1;
3308 $errors{$addr}->{$message}++;
3311 # Clear the line to prevent any further (meaningful) processing of it.
3318 package Multi_Default;
3320 # Certain properties in early versions of Unicode had more than one possible
3321 # default for code points missing from the files. In these cases, one
3322 # default applies to everything left over after all the others are applied,
3323 # and for each of the others, there is a description of which class of code
3324 # points applies to it. This object helps implement this by storing the
3325 # defaults, and for all but that final default, an eval string that generates
3326 # the class that it applies to.
3331 main::setup_package();
3334 # The defaults structure for the classes
3335 main::set_access('class_defaults', \%class_defaults);
3338 # The default that applies to everything left over.
3339 main::set_access('other_default', \%other_default, 'r');
3343 # The constructor is called with default => eval pairs, terminated by
3344 # the left-over default. e.g.
3345 # Multi_Default->new(
3346 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3348 # 'R' => 'some other expression that evaluates to code points',
3353 # It is best to leave the final value be the one that matches the
3354 # above-Unicode code points.
3358 my $self = bless \do{my $anonymous_scalar}, $class;
3359 my $addr = do { no overloading; pack 'J', $self; };
3362 my $default = shift;
3364 $class_defaults{$addr}->{$default} = $eval;
3367 $other_default{$addr} = shift;
3372 sub get_next_defaults {
3373 # Iterates and returns the next class of defaults.
3375 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3377 my $addr = do { no overloading; pack 'J', $self; };
3379 return each %{$class_defaults{$addr}};
3385 # An alias is one of the names that a table goes by. This class defines them
3386 # including some attributes. Everything is currently setup in the
3392 main::setup_package();
3395 main::set_access('name', \%name, 'r');
3398 # Should this name match loosely or not.
3399 main::set_access('loose_match', \%loose_match, 'r');
3401 my %make_re_pod_entry;
3402 # Some aliases should not get their own entries in the re section of the
3403 # pod, because they are covered by a wild-card, and some we want to
3404 # discourage use of. Binary
3405 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3408 # Is this documented to be accessible via Unicode::UCD
3409 main::set_access('ucd', \%ucd, 'r', 's');
3412 # Aliases have a status, like deprecated, or even suppressed (which means
3413 # they don't appear in documentation). Enum
3414 main::set_access('status', \%status, 'r');
3417 # Similarly, some aliases should not be considered as usable ones for
3418 # external use, such as file names, or we don't want documentation to
3419 # recommend them. Boolean
3420 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3425 my $self = bless \do { my $anonymous_scalar }, $class;
3426 my $addr = do { no overloading; pack 'J', $self; };
3428 $name{$addr} = shift;
3429 $loose_match{$addr} = shift;
3430 $make_re_pod_entry{$addr} = shift;
3431 $ok_as_filename{$addr} = shift;
3432 $status{$addr} = shift;
3433 $ucd{$addr} = shift;
3435 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3437 # Null names are never ok externally
3438 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3446 # A range is the basic unit for storing code points, and is described in the
3447 # comments at the beginning of the program. Each range has a starting code
3448 # point; an ending code point (not less than the starting one); a value
3449 # that applies to every code point in between the two end-points, inclusive;
3450 # and an enum type that applies to the value. The type is for the user's
3451 # convenience, and has no meaning here, except that a non-zero type is
3452 # considered to not obey the normal Unicode rules for having standard forms.
3454 # The same structure is used for both map and match tables, even though in the
3455 # latter, the value (and hence type) is irrelevant and could be used as a
3456 # comment. In map tables, the value is what all the code points in the range
3457 # map to. Type 0 values have the standardized version of the value stored as
3458 # well, so as to not have to recalculate it a lot.
3460 sub trace { return main::trace(@_); }
3464 main::setup_package();
3467 main::set_access('start', \%start, 'r', 's');
3470 main::set_access('end', \%end, 'r', 's');
3473 main::set_access('value', \%value, 'r');
3476 main::set_access('type', \%type, 'r');
3479 # The value in internal standard form. Defined only if the type is 0.
3480 main::set_access('standard_form', \%standard_form);
3482 # Note that if these fields change, the dump() method should as well
3485 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3488 my $self = bless \do { my $anonymous_scalar }, $class;
3489 my $addr = do { no overloading; pack 'J', $self; };
3491 $start{$addr} = shift;
3492 $end{$addr} = shift;
3496 my $value = delete $args{'Value'}; # Can be 0
3497 $value = "" unless defined $value;
3498 $value{$addr} = $value;
3500 $type{$addr} = delete $args{'Type'} || 0;
3502 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3509 qw("") => "_operator_stringify",
3510 "." => \&main::_operator_dot,
3511 ".=" => \&main::_operator_dot_equal,
3514 sub _operator_stringify {
3516 my $addr = do { no overloading; pack 'J', $self; };
3518 # Output it like '0041..0065 (value)'
3519 my $return = sprintf("%04X", $start{$addr})
3521 . sprintf("%04X", $end{$addr});
3522 my $value = $value{$addr};
3523 my $type = $type{$addr};
3525 $return .= "$value";
3526 $return .= ", Type=$type" if $type != 0;
3533 # Calculate the standard form only if needed, and cache the result.
3534 # The standard form is the value itself if the type is special.
3535 # This represents a considerable CPU and memory saving - at the time
3536 # of writing there are 368676 non-special objects, but the standard
3537 # form is only requested for 22047 of them - ie about 6%.
3540 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3542 my $addr = do { no overloading; pack 'J', $self; };
3544 return $standard_form{$addr} if defined $standard_form{$addr};
3546 my $value = $value{$addr};
3547 return $value if $type{$addr};
3548 return $standard_form{$addr} = main::standardize($value);
3552 # Human, not machine readable. For machine readable, comment out this
3553 # entire routine and let the standard one take effect.
3556 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3558 my $addr = do { no overloading; pack 'J', $self; };
3560 my $return = $indent
3561 . sprintf("%04X", $start{$addr})
3563 . sprintf("%04X", $end{$addr})
3564 . " '$value{$addr}';";
3565 if (! defined $standard_form{$addr}) {
3566 $return .= "(type=$type{$addr})";
3568 elsif ($standard_form{$addr} ne $value{$addr}) {
3569 $return .= "(standard '$standard_form{$addr}')";
3575 package _Range_List_Base;
3577 # Base class for range lists. A range list is simply an ordered list of
3578 # ranges, so that the ranges with the lowest starting numbers are first in it.
3580 # When a new range is added that is adjacent to an existing range that has the
3581 # same value and type, it merges with it to form a larger range.
3583 # Ranges generally do not overlap, except that there can be multiple entries
3584 # of single code point ranges. This is because of NameAliases.txt.
3586 # In this program, there is a standard value such that if two different
3587 # values, have the same standard value, they are considered equivalent. This
3588 # value was chosen so that it gives correct results on Unicode data
3590 # There are a number of methods to manipulate range lists, and some operators
3591 # are overloaded to handle them.
3593 sub trace { return main::trace(@_); }
3599 # Max is initialized to a negative value that isn't adjacent to 0, for
3603 main::setup_package();
3606 # The list of ranges
3607 main::set_access('ranges', \%ranges, 'readable_array');
3610 # The highest code point in the list. This was originally a method, but
3611 # actual measurements said it was used a lot.
3612 main::set_access('max', \%max, 'r');
3614 my %each_range_iterator;
3615 # Iterator position for each_range()
3616 main::set_access('each_range_iterator', \%each_range_iterator);
3619 # Name of parent this is attached to, if any. Solely for better error
3621 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3623 my %_search_ranges_cache;
3624 # A cache of the previous result from _search_ranges(), for better
3626 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3632 # Optional initialization data for the range list.
3633 my $initialize = delete $args{'Initialize'};
3637 # Use _union() to initialize. _union() returns an object of this
3638 # class, which means that it will call this constructor recursively.
3639 # But it won't have this $initialize parameter so that it won't
3640 # infinitely loop on this.
3641 return _union($class, $initialize, %args) if defined $initialize;
3643 $self = bless \do { my $anonymous_scalar }, $class;
3644 my $addr = do { no overloading; pack 'J', $self; };
3646 # Optional parent object, only for debug info.
3647 $owner_name_of{$addr} = delete $args{'Owner'};
3648 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3650 # Stringify, in case it is an object.
3651 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3653 # This is used only for error messages, and so a colon is added
3654 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3656 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3658 $max{$addr} = $max_init;
3660 $_search_ranges_cache{$addr} = 0;
3661 $ranges{$addr} = [];
3668 qw("") => "_operator_stringify",
3669 "." => \&main::_operator_dot,
3670 ".=" => \&main::_operator_dot_equal,
3673 sub _operator_stringify {
3675 my $addr = do { no overloading; pack 'J', $self; };
3677 return "Range_List attached to '$owner_name_of{$addr}'"
3678 if $owner_name_of{$addr};
3679 return "anonymous Range_List " . \$self;
3683 # Returns the union of the input code points. It can be called as
3684 # either a constructor or a method. If called as a method, the result
3685 # will be a new() instance of the calling object, containing the union
3686 # of that object with the other parameter's code points; if called as
3687 # a constructor, the first parameter gives the class that the new object
3688 # should be, and the second parameter gives the code points to go into
3690 # In either case, there are two parameters looked at by this routine;
3691 # any additional parameters are passed to the new() constructor.
3693 # The code points can come in the form of some object that contains
3694 # ranges, and has a conventionally named method to access them; or
3695 # they can be an array of individual code points (as integers); or
3696 # just a single code point.
3698 # If they are ranges, this routine doesn't make any effort to preserve
3699 # the range values and types of one input over the other. Therefore
3700 # this base class should not allow _union to be called from other than
3701 # initialization code, so as to prevent two tables from being added
3702 # together where the range values matter. The general form of this
3703 # routine therefore belongs in a derived class, but it was moved here
3704 # to avoid duplication of code. The failure to overload this in this
3705 # class keeps it safe.
3707 # It does make the effort during initialization to accept tables with
3708 # multiple values for the same code point, and to preserve the order
3709 # of these. If there is only one input range or range set, it doesn't
3710 # sort (as it should already be sorted to the desired order), and will
3711 # accept multiple values per code point. Otherwise it will merge
3712 # multiple values into a single one.
3715 my @args; # Arguments to pass to the constructor
3719 # If a method call, will start the union with the object itself, and
3720 # the class of the new object will be the same as self.
3727 # Add the other required parameter.
3729 # Rest of parameters are passed on to the constructor
3731 # Accumulate all records from both lists.
3733 my $input_count = 0;
3734 for my $arg (@args) {
3735 #local $to_trace = 0 if main::DEBUG;
3736 trace "argument = $arg" if main::DEBUG && $to_trace;
3737 if (! defined $arg) {
3739 if (defined $self) {
3741 $message .= $owner_name_of{pack 'J', $self};
3743 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
3747 $arg = [ $arg ] if ! ref $arg;
3748 my $type = ref $arg;
3749 if ($type eq 'ARRAY') {
3750 foreach my $element (@$arg) {
3751 push @records, Range->new($element, $element);
3755 elsif ($arg->isa('Range')) {
3756 push @records, $arg;
3759 elsif ($arg->can('ranges')) {
3760 push @records, $arg->ranges;
3765 if (defined $self) {
3767 $message .= $owner_name_of{pack 'J', $self};
3769 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3774 # Sort with the range containing the lowest ordinal first, but if
3775 # two ranges start at the same code point, sort with the bigger range
3776 # of the two first, because it takes fewer cycles.
3777 if ($input_count > 1) {
3778 @records = sort { ($a->start <=> $b->start)
3780 # if b is shorter than a, b->end will be
3781 # less than a->end, and we want to select
3782 # a, so want to return -1
3783 ($b->end <=> $a->end)
3787 my $new = $class->new(@_);
3789 # Fold in records so long as they add new information.
3790 for my $set (@records) {
3791 my $start = $set->start;
3792 my $end = $set->end;
3793 my $value = $set->value;
3794 my $type = $set->type;
3795 if ($start > $new->max) {
3796 $new->_add_delete('+', $start, $end, $value, Type => $type);
3798 elsif ($end > $new->max) {
3799 $new->_add_delete('+', $new->max +1, $end, $value,
3802 elsif ($input_count == 1) {
3803 # Here, overlaps existing range, but is from a single input,
3804 # so preserve the multiple values from that input.
3805 $new->_add_delete('+', $start, $end, $value, Type => $type,
3806 Replace => $MULTIPLE_AFTER);
3813 sub range_count { # Return the number of ranges in the range list
3815 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3818 return scalar @{$ranges{pack 'J', $self}};
3822 # Returns the minimum code point currently in the range list, or if
3823 # the range list is empty, 2 beyond the max possible. This is a
3824 # method because used so rarely, that not worth saving between calls,
3825 # and having to worry about changing it as ranges are added and
3829 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3831 my $addr = do { no overloading; pack 'J', $self; };
3833 # If the range list is empty, return a large value that isn't adjacent
3834 # to any that could be in the range list, for simpler tests
3835 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3836 return $ranges{$addr}->[0]->start;
3840 # Boolean: Is argument in the range list? If so returns $i such that:
3841 # range[$i]->end < $codepoint <= range[$i+1]->end
3842 # which is one beyond what you want; this is so that the 0th range
3843 # doesn't return false
3845 my $codepoint = shift;
3846 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3848 my $i = $self->_search_ranges($codepoint);
3849 return 0 unless defined $i;
3851 # The search returns $i, such that
3852 # range[$i-1]->end < $codepoint <= range[$i]->end
3853 # So is in the table if and only iff it is at least the start position
3856 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3860 sub containing_range {
3861 # Returns the range object that contains the code point, undef if none
3864 my $codepoint = shift;
3865 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3867 my $i = $self->contains($codepoint);
3870 # contains() returns 1 beyond where we should look
3872 return $ranges{pack 'J', $self}->[$i-1];
3876 # Returns the value associated with the code point, undef if none
3879 my $codepoint = shift;
3880 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3882 my $range = $self->containing_range($codepoint);
3883 return unless defined $range;
3885 return $range->value;
3889 # Returns the type of the range containing the code point, undef if
3890 # the code point is not in the table
3893 my $codepoint = shift;
3894 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3896 my $range = $self->containing_range($codepoint);
3897 return unless defined $range;
3899 return $range->type;
3902 sub _search_ranges {
3903 # Find the range in the list which contains a code point, or where it
3904 # should go if were to add it. That is, it returns $i, such that:
3905 # range[$i-1]->end < $codepoint <= range[$i]->end
3906 # Returns undef if no such $i is possible (e.g. at end of table), or
3907 # if there is an error.
3910 my $code_point = shift;
3911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3913 my $addr = do { no overloading; pack 'J', $self; };
3915 return if $code_point > $max{$addr};
3916 my $r = $ranges{$addr}; # The current list of ranges
3917 my $range_list_size = scalar @$r;
3920 use integer; # want integer division
3922 # Use the cached result as the starting guess for this one, because,
3923 # an experiment on 5.1 showed that 90% of the time the cache was the
3924 # same as the result on the next call (and 7% it was one less).
3925 $i = $_search_ranges_cache{$addr};
3926 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3927 # from an intervening deletion
3928 #local $to_trace = 1 if main::DEBUG;
3929 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3930 return $i if $code_point <= $r->[$i]->end
3931 && ($i == 0 || $r->[$i-1]->end < $code_point);
3933 # Here the cache doesn't yield the correct $i. Try adding 1.
3934 if ($i < $range_list_size - 1
3935 && $r->[$i]->end < $code_point &&
3936 $code_point <= $r->[$i+1]->end)
3939 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3940 $_search_ranges_cache{$addr} = $i;
3944 # Here, adding 1 also didn't work. We do a binary search to
3945 # find the correct position, starting with current $i
3947 my $upper = $range_list_size - 1;
3949 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3951 if ($code_point <= $r->[$i]->end) {
3953 # Here we have met the upper constraint. We can quit if we
3954 # also meet the lower one.
3955 last if $i == 0 || $r->[$i-1]->end < $code_point;
3957 $upper = $i; # Still too high.
3962 # Here, $r[$i]->end < $code_point, so look higher up.
3966 # Split search domain in half to try again.
3967 my $temp = ($upper + $lower) / 2;
3969 # No point in continuing unless $i changes for next time
3973 # We can't reach the highest element because of the averaging.
3974 # So if one below the upper edge, force it there and try one
3976 if ($i == $range_list_size - 2) {
3978 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3979 $i = $range_list_size - 1;
3981 # Change $lower as well so if fails next time through,
3982 # taking the average will yield the same $i, and we will
3983 # quit with the error message just below.
3987 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3991 } # End of while loop
3993 if (main::DEBUG && $to_trace) {
3994 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3995 trace "i= [ $i ]", $r->[$i];
3996 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3999 # Here we have found the offset. Cache it as a starting point for the
4001 $_search_ranges_cache{$addr} = $i;
4006 # Add, replace or delete ranges to or from a list. The $type
4007 # parameter gives which:
4008 # '+' => insert or replace a range, returning a list of any changed
4010 # '-' => delete a range, returning a list of any deleted ranges.
4012 # The next three parameters give respectively the start, end, and
4013 # value associated with the range. 'value' should be null unless the
4016 # The range list is kept sorted so that the range with the lowest
4017 # starting position is first in the list, and generally, adjacent
4018 # ranges with the same values are merged into a single larger one (see
4019 # exceptions below).
4021 # There are more parameters; all are key => value pairs:
4022 # Type gives the type of the value. It is only valid for '+'.
4023 # All ranges have types; if this parameter is omitted, 0 is
4024 # assumed. Ranges with type 0 are assumed to obey the
4025 # Unicode rules for casing, etc; ranges with other types are
4026 # not. Otherwise, the type is arbitrary, for the caller's
4027 # convenience, and looked at only by this routine to keep
4028 # adjacent ranges of different types from being merged into
4029 # a single larger range, and when Replace =>
4030 # $IF_NOT_EQUIVALENT is specified (see just below).
4031 # Replace determines what to do if the range list already contains
4032 # ranges which coincide with all or portions of the input
4033 # range. It is only valid for '+':
4034 # => $NO means that the new value is not to replace
4035 # any existing ones, but any empty gaps of the
4036 # range list coinciding with the input range
4037 # will be filled in with the new value.
4038 # => $UNCONDITIONALLY means to replace the existing values with
4039 # this one unconditionally. However, if the
4040 # new and old values are identical, the
4041 # replacement is skipped to save cycles
4042 # => $IF_NOT_EQUIVALENT means to replace the existing values
4043 # (the default) with this one if they are not equivalent.
4044 # Ranges are equivalent if their types are the
4045 # same, and they are the same string; or if
4046 # both are type 0 ranges, if their Unicode
4047 # standard forms are identical. In this last
4048 # case, the routine chooses the more "modern"
4049 # one to use. This is because some of the
4050 # older files are formatted with values that
4051 # are, for example, ALL CAPs, whereas the
4052 # derived files have a more modern style,
4053 # which looks better. By looking for this
4054 # style when the pre-existing and replacement
4055 # standard forms are the same, we can move to
4057 # => $MULTIPLE_BEFORE means that if this range duplicates an
4058 # existing one, but has a different value,
4059 # don't replace the existing one, but insert
4060 # this one so that the same range can occur
4061 # multiple times. They are stored LIFO, so
4062 # that the final one inserted is the first one
4063 # returned in an ordered search of the table.
4064 # If this is an exact duplicate, including the
4065 # value, the original will be moved to be
4066 # first, before any other duplicate ranges
4067 # with different values.
4068 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4069 # FIFO, so that this one is inserted after all
4070 # others that currently exist. If this is an
4071 # exact duplicate, including value, of an
4072 # existing range, this one is discarded
4073 # (leaving the existing one in its original,
4074 # higher priority position
4075 # => $CROAK Die with an error if is already there
4076 # => anything else is the same as => $IF_NOT_EQUIVALENT
4078 # "same value" means identical for non-type-0 ranges, and it means
4079 # having the same standard forms for type-0 ranges.
4081 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4084 my $operation = shift; # '+' for add/replace; '-' for delete;
4091 $value = "" if not defined $value; # warning: $value can be "0"
4093 my $replace = delete $args{'Replace'};
4094 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4096 my $type = delete $args{'Type'};
4097 $type = 0 unless defined $type;
4099 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4101 my $addr = do { no overloading; pack 'J', $self; };
4103 if ($operation ne '+' && $operation ne '-') {
4104 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
4107 unless (defined $start && defined $end) {
4108 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
4111 unless ($end >= $start) {
4112 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
4115 #local $to_trace = 1 if main::DEBUG;
4117 if ($operation eq '-') {
4118 if ($replace != $IF_NOT_EQUIVALENT) {
4119 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
4120 $replace = $IF_NOT_EQUIVALENT;
4123 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
4127 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
4132 my $r = $ranges{$addr}; # The current list of ranges
4133 my $range_list_size = scalar @$r; # And its size
4134 my $max = $max{$addr}; # The current high code point in
4135 # the list of ranges
4137 # Do a special case requiring fewer machine cycles when the new range
4138 # starts after the current highest point. The Unicode input data is
4139 # structured so this is common.
4140 if ($start > $max) {
4142 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4143 return if $operation eq '-'; # Deleting a non-existing range is a
4146 # If the new range doesn't logically extend the current final one
4147 # in the range list, create a new range at the end of the range
4148 # list. (max cleverly is initialized to a negative number not
4149 # adjacent to 0 if the range list is empty, so even adding a range
4150 # to an empty range list starting at 0 will have this 'if'
4152 if ($start > $max + 1 # non-adjacent means can't extend.
4153 || @{$r}[-1]->value ne $value # values differ, can't extend.
4154 || @{$r}[-1]->type != $type # types differ, can't extend.
4156 push @$r, Range->new($start, $end,
4162 # Here, the new range starts just after the current highest in
4163 # the range list, and they have the same type and value.
4164 # Extend the existing range to incorporate the new one.
4165 @{$r}[-1]->set_end($end);
4168 # This becomes the new maximum.
4173 #local $to_trace = 0 if main::DEBUG;
4175 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4177 # Here, the input range isn't after the whole rest of the range list.
4178 # Most likely 'splice' will be needed. The rest of the routine finds
4179 # the needed splice parameters, and if necessary, does the splice.
4180 # First, find the offset parameter needed by the splice function for
4181 # the input range. Note that the input range may span multiple
4182 # existing ones, but we'll worry about that later. For now, just find
4183 # the beginning. If the input range is to be inserted starting in a
4184 # position not currently in the range list, it must (obviously) come
4185 # just after the range below it, and just before the range above it.
4186 # Slightly less obviously, it will occupy the position currently
4187 # occupied by the range that is to come after it. More formally, we
4188 # are looking for the position, $i, in the array of ranges, such that:
4190 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4192 # (The ordered relationships within existing ranges are also shown in
4193 # the equation above). However, if the start of the input range is
4194 # within an existing range, the splice offset should point to that
4195 # existing range's position in the list; that is $i satisfies a
4196 # somewhat different equation, namely:
4198 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4200 # More briefly, $start can come before or after r[$i]->start, and at
4201 # this point, we don't know which it will be. However, these
4202 # two equations share these constraints:
4204 # r[$i-1]->end < $start <= r[$i]->end
4206 # And that is good enough to find $i.
4208 my $i = $self->_search_ranges($start);
4210 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
4214 # The search function returns $i such that:
4216 # r[$i-1]->end < $start <= r[$i]->end
4218 # That means that $i points to the first range in the range list
4219 # that could possibly be affected by this operation. We still don't
4220 # know if the start of the input range is within r[$i], or if it
4221 # points to empty space between r[$i-1] and r[$i].
4222 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4224 # Special case the insertion of data that is not to replace any
4226 if ($replace == $NO) { # If $NO, has to be operation '+'
4227 #local $to_trace = 1 if main::DEBUG;
4228 trace "Doesn't replace" if main::DEBUG && $to_trace;
4230 # Here, the new range is to take effect only on those code points
4231 # that aren't already in an existing range. This can be done by
4232 # looking through the existing range list and finding the gaps in
4233 # the ranges that this new range affects, and then calling this
4234 # function recursively on each of those gaps, leaving untouched
4235 # anything already in the list. Gather up a list of the changed
4236 # gaps first so that changes to the internal state as new ranges
4237 # are added won't be a problem.
4240 # First, if the starting point of the input range is outside an
4241 # existing one, there is a gap from there to the beginning of the
4242 # existing range -- add a span to fill the part that this new
4244 if ($start < $r->[$i]->start) {
4245 push @gap_list, Range->new($start,
4247 $r->[$i]->start - 1),
4249 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4252 # Then look through the range list for other gaps until we reach
4253 # the highest range affected by the input one.
4255 for ($j = $i+1; $j < $range_list_size; $j++) {
4256 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4257 last if $end < $r->[$j]->start;
4259 # If there is a gap between when this range starts and the
4260 # previous one ends, add a span to fill it. Note that just
4261 # because there are two ranges doesn't mean there is a
4262 # non-zero gap between them. It could be that they have
4263 # different values or types
4264 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4266 Range->new($r->[$j-1]->end + 1,
4267 $r->[$j]->start - 1,
4269 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4273 # Here, we have either found an existing range in the range list,
4274 # beyond the area affected by the input one, or we fell off the
4275 # end of the loop because the input range affects the whole rest
4276 # of the range list. In either case, $j is 1 higher than the
4277 # highest affected range. If $j == $i, it means that there are no
4278 # affected ranges, that the entire insertion is in the gap between
4279 # r[$i-1], and r[$i], which we already have taken care of before
4281 # On the other hand, if there are affected ranges, it might be
4282 # that there is a gap that needs filling after the final such
4283 # range to the end of the input range
4284 if ($r->[$j-1]->end < $end) {
4285 push @gap_list, Range->new(main::max($start,
4286 $r->[$j-1]->end + 1),
4289 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4292 # Call recursively to fill in all the gaps.
4293 foreach my $gap (@gap_list) {
4294 $self->_add_delete($operation,
4304 # Here, we have taken care of the case where $replace is $NO.
4305 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4306 # If inserting a multiple record, this is where it goes, before the
4307 # first (if any) existing one if inserting LIFO. (If this is to go
4308 # afterwards, FIFO, we below move the pointer to there.) These imply
4309 # an insertion, and no change to any existing ranges. Note that $i
4310 # can be -1 if this new range doesn't actually duplicate any existing,
4311 # and comes at the beginning of the list.
4312 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4314 if ($start != $end) {
4315 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
4319 # If the new code point is within a current range ...
4320 if ($end >= $r->[$i]->start) {
4322 # Don't add an exact duplicate, as it isn't really a multiple
4323 my $existing_value = $r->[$i]->value;
4324 my $existing_type = $r->[$i]->type;
4325 return if $value eq $existing_value && $type eq $existing_type;
4327 # If the multiple value is part of an existing range, we want
4328 # to split up that range, so that only the single code point
4329 # is affected. To do this, we first call ourselves
4330 # recursively to delete that code point from the table, having
4331 # preserved its current data above. Then we call ourselves
4332 # recursively again to add the new multiple, which we know by
4333 # the test just above is different than the current code
4334 # point's value, so it will become a range containing a single
4335 # code point: just itself. Finally, we add back in the
4336 # pre-existing code point, which will again be a single code
4337 # point range. Because 'i' likely will have changed as a
4338 # result of these operations, we can't just continue on, but
4339 # do this operation recursively as well. If we are inserting
4340 # LIFO, the pre-existing code point needs to go after the new
4341 # one, so use MULTIPLE_AFTER; and vice versa.
4342 if ($r->[$i]->start != $r->[$i]->end) {
4343 $self->_add_delete('-', $start, $end, "");
4344 $self->_add_delete('+', $start, $end, $value, Type => $type);
4345 return $self->_add_delete('+',
4348 Type => $existing_type,
4349 Replace => ($replace == $MULTIPLE_BEFORE)
4351 : $MULTIPLE_BEFORE);
4355 # If to place this new record after, move to beyond all existing
4356 # ones; but don't add this one if identical to any of them, as it
4357 # isn't really a multiple. This leaves the original order, so
4358 # that the current request is ignored. The reasoning is that the
4359 # previous request that wanted this record to have high priority
4360 # should have precedence.
4361 if ($replace == $MULTIPLE_AFTER) {
4362 while ($i < @$r && $r->[$i]->start == $start) {
4363 return if $value eq $r->[$i]->value
4364 && $type eq $r->[$i]->type;
4369 # If instead we are to place this new record before any
4370 # existing ones, remove any identical ones that come after it.
4371 # This changes the existing order so that the new one is
4372 # first, as is being requested.
4373 for (my $j = $i + 1;
4374 $j < @$r && $r->[$j]->start == $start;
4377 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4379 last; # There should only be one instance, so no
4380 # need to keep looking
4385 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4386 my @return = splice @$r,
4393 if (main::DEBUG && $to_trace) {
4394 trace "After splice:";
4395 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4396 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4397 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
4398 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4399 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4400 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4405 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
4406 # leaves delete, insert, and replace either unconditionally or if not
4407 # equivalent. $i still points to the first potential affected range.
4408 # Now find the highest range affected, which will determine the length
4409 # parameter to splice. (The input range can span multiple existing
4410 # ones.) If this isn't a deletion, while we are looking through the
4411 # range list, see also if this is a replacement rather than a clean
4412 # insertion; that is if it will change the values of at least one
4413 # existing range. Start off assuming it is an insert, until find it
4415 my $clean_insert = $operation eq '+';
4416 my $j; # This will point to the highest affected range
4418 # For non-zero types, the standard form is the value itself;
4419 my $standard_form = ($type) ? $value : main::standardize($value);
4421 for ($j = $i; $j < $range_list_size; $j++) {
4422 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4424 # If find a range that it doesn't overlap into, we can stop
4426 last if $end < $r->[$j]->start;
4428 # Here, overlaps the range at $j. If the values don't match,
4429 # and so far we think this is a clean insertion, it becomes a
4430 # non-clean insertion, i.e., a 'change' or 'replace' instead.
4431 if ($clean_insert) {
4432 if ($r->[$j]->standard_form ne $standard_form) {
4434 if ($replace == $CROAK) {
4435 main::croak("The range to add "
4436 . sprintf("%04X", $start)
4438 . sprintf("%04X", $end)
4439 . " with value '$value' overlaps an existing range $r->[$j]");
4444 # Here, the two values are essentially the same. If the
4445 # two are actually identical, replacing wouldn't change
4446 # anything so skip it.
4447 my $pre_existing = $r->[$j]->value;
4448 if ($pre_existing ne $value) {
4450 # Here the new and old standardized values are the
4451 # same, but the non-standardized values aren't. If
4452 # replacing unconditionally, then replace
4453 if( $replace == $UNCONDITIONALLY) {
4458 # Here, are replacing conditionally. Decide to
4459 # replace or not based on which appears to look
4460 # the "nicest". If one is mixed case and the
4461 # other isn't, choose the mixed case one.
4462 my $new_mixed = $value =~ /[A-Z]/
4463 && $value =~ /[a-z]/;
4464 my $old_mixed = $pre_existing =~ /[A-Z]/
4465 && $pre_existing =~ /[a-z]/;
4467 if ($old_mixed != $new_mixed) {
4468 $clean_insert = 0 if $new_mixed;
4469 if (main::DEBUG && $to_trace) {
4470 if ($clean_insert) {
4471 trace "Retaining $pre_existing over $value";
4474 trace "Replacing $pre_existing with $value";
4480 # Here casing wasn't different between the two.
4481 # If one has hyphens or underscores and the
4482 # other doesn't, choose the one with the
4484 my $new_punct = $value =~ /[-_]/;
4485 my $old_punct = $pre_existing =~ /[-_]/;
4487 if ($old_punct != $new_punct) {
4488 $clean_insert = 0 if $new_punct;
4489 if (main::DEBUG && $to_trace) {
4490 if ($clean_insert) {
4491 trace "Retaining $pre_existing over $value";
4494 trace "Replacing $pre_existing with $value";
4497 } # else existing one is just as "good";
4498 # retain it to save cycles.
4504 } # End of loop looking for highest affected range.
4506 # Here, $j points to one beyond the highest range that this insertion
4507 # affects (hence to beyond the range list if that range is the final
4508 # one in the range list).
4510 # The splice length is all the affected ranges. Get it before
4511 # subtracting, for efficiency, so we don't have to later add 1.
4512 my $length = $j - $i;
4514 $j--; # $j now points to the highest affected range.
4515 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4517 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4518 # $j points to the highest affected range. But it can be < $i or even
4519 # -1. These happen only if the insertion is entirely in the gap
4520 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
4521 # above exited first time through with $end < $r->[$i]->start. (And
4522 # then we subtracted one from j) This implies also that $start <
4523 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4524 # $start, so the entire input range is in the gap.
4527 # Here the entire input range is in the gap before $i.
4529 if (main::DEBUG && $to_trace) {
4531 trace "Entire range is between $r->[$i-1] and $r->[$i]";
4534 trace "Entire range is before $r->[$i]";
4537 return if $operation ne '+'; # Deletion of a non-existent range is
4542 # Here part of the input range is not in the gap before $i. Thus,
4543 # there is at least one affected one, and $j points to the highest
4546 # At this point, here is the situation:
4547 # This is not an insertion of a multiple, nor of tentative ($NO)
4549 # $i points to the first element in the current range list that
4550 # may be affected by this operation. In fact, we know
4551 # that the range at $i is affected because we are in
4552 # the else branch of this 'if'
4553 # $j points to the highest affected range.
4555 # r[$i-1]->end < $start <= r[$i]->end
4557 # r[$i-1]->end < $start <= $end < r[$j+1]->start
4560 # $clean_insert is a boolean which is set true if and only if
4561 # this is a "clean insertion", i.e., not a change nor a
4562 # deletion (multiple was handled above).
4564 # We now have enough information to decide if this call is a no-op
4565 # or not. It is a no-op if this is an insertion of already
4566 # existing data. To be so, it must be contained entirely in one
4569 if (main::DEBUG && $to_trace && $clean_insert
4570 && $start >= $r->[$i]->start
4571 && $end <= $r->[$i]->end)
4575 return if $clean_insert
4576 && $start >= $r->[$i]->start
4577 && $end <= $r->[$i]->end;
4580 # Here, we know that some action will have to be taken. We have
4581 # calculated the offset and length (though adjustments may be needed)
4582 # for the splice. Now start constructing the replacement list.
4584 my $splice_start = $i;
4589 # See if should extend any adjacent ranges.
4590 if ($operation eq '-') { # Don't extend deletions
4591 $extends_below = $extends_above = 0;
4593 else { # Here, should extend any adjacent ranges. See if there are
4595 $extends_below = ($i > 0
4596 # can't extend unless adjacent
4597 && $r->[$i-1]->end == $start -1
4598 # can't extend unless are same standard value
4599 && $r->[$i-1]->standard_form eq $standard_form
4600 # can't extend unless share type
4601 && $r->[$i-1]->type == $type);
4602 $extends_above = ($j+1 < $range_list_size
4603 && $r->[$j+1]->start == $end +1
4604 && $r->[$j+1]->standard_form eq $standard_form
4605 && $r->[$j+1]->type == $type);
4607 if ($extends_below && $extends_above) { # Adds to both
4608 $splice_start--; # start replace at element below
4609 $length += 2; # will replace on both sides
4610 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4612 # The result will fill in any gap, replacing both sides, and
4613 # create one large range.
4614 @replacement = Range->new($r->[$i-1]->start,
4621 # Here we know that the result won't just be the conglomeration of
4622 # a new range with both its adjacent neighbors. But it could
4623 # extend one of them.
4625 if ($extends_below) {
4627 # Here the new element adds to the one below, but not to the
4628 # one above. If inserting, and only to that one range, can
4629 # just change its ending to include the new one.
4630 if ($length == 0 && $clean_insert) {
4631 $r->[$i-1]->set_end($end);
4632 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4636 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4637 $splice_start--; # start replace at element below
4638 $length++; # will replace the element below
4639 $start = $r->[$i-1]->start;
4642 elsif ($extends_above) {
4644 # Here the new element adds to the one above, but not below.
4645 # Mirror the code above
4646 if ($length == 0 && $clean_insert) {
4647 $r->[$j+1]->set_start($start);
4648 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4652 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4653 $length++; # will replace the element above
4654 $end = $r->[$j+1]->end;
4658 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4660 # Finally, here we know there will have to be a splice.
4661 # If the change or delete affects only the highest portion of the
4662 # first affected range, the range will have to be split. The
4663 # splice will remove the whole range, but will replace it by a new
4664 # range containing just the unaffected part. So, in this case,
4665 # add to the replacement list just this unaffected portion.
4666 if (! $extends_below
4667 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4670 Range->new($r->[$i]->start,
4672 Value => $r->[$i]->value,
4673 Type => $r->[$i]->type);
4676 # In the case of an insert or change, but not a delete, we have to
4677 # put in the new stuff; this comes next.
4678 if ($operation eq '+') {
4679 push @replacement, Range->new($start,
4685 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4686 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4688 # And finally, if we're changing or deleting only a portion of the
4689 # highest affected range, it must be split, as the lowest one was.
4690 if (! $extends_above
4691 && $j >= 0 # Remember that j can be -1 if before first
4693 && $end >= $r->[$j]->start
4694 && $end < $r->[$j]->end)
4697 Range->new($end + 1,
4699 Value => $r->[$j]->value,
4700 Type => $r->[$j]->type);
4704 # And do the splice, as calculated above
4705 if (main::DEBUG && $to_trace) {
4706 trace "replacing $length element(s) at $i with ";
4707 foreach my $replacement (@replacement) {
4708 trace " $replacement";
4710 trace "Before splice:";
4711 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4712 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4713 trace "i =[", $i, "]", $r->[$i];
4714 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4715 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4718 my @return = splice @$r, $splice_start, $length, @replacement;
4720 if (main::DEBUG && $to_trace) {
4721 trace "After splice:";
4722 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4723 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4724 trace "i =[", $i, "]", $r->[$i];
4725 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4726 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4727 trace "removed ", @return if @return;
4730 # An actual deletion could have changed the maximum in the list.
4731 # There was no deletion if the splice didn't return something, but
4732 # otherwise recalculate it. This is done too rarely to worry about
4734 if ($operation eq '-' && @return) {
4736 $max{$addr} = $r->[-1]->end;
4739 $max{$addr} = $max_init;
4745 sub reset_each_range { # reset the iterator for each_range();
4747 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4750 undef $each_range_iterator{pack 'J', $self};
4755 # Iterate over each range in a range list. Results are undefined if
4756 # the range list is changed during the iteration.
4759 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4761 my $addr = do { no overloading; pack 'J', $self; };
4763 return if $self->is_empty;
4765 $each_range_iterator{$addr} = -1
4766 if ! defined $each_range_iterator{$addr};
4767 $each_range_iterator{$addr}++;
4768 return $ranges{$addr}->[$each_range_iterator{$addr}]
4769 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4770 undef $each_range_iterator{$addr};
4774 sub count { # Returns count of code points in range list
4776 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4778 my $addr = do { no overloading; pack 'J', $self; };
4781 foreach my $range (@{$ranges{$addr}}) {
4782 $count += $range->end - $range->start + 1;
4787 sub delete_range { # Delete a range
4792 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4794 return $self->_add_delete('-', $start, $end, "");
4797 sub is_empty { # Returns boolean as to if a range list is empty
4799 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4802 return scalar @{$ranges{pack 'J', $self}} == 0;
4806 # Quickly returns a scalar suitable for separating tables into
4807 # buckets, i.e. it is a hash function of the contents of a table, so
4808 # there are relatively few conflicts.
4811 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4813 my $addr = do { no overloading; pack 'J', $self; };
4815 # These are quickly computable. Return looks like 'min..max;count'
4816 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4818 } # End closure for _Range_List_Base
4821 use parent '-norequire', '_Range_List_Base';
4823 # A Range_List is a range list for match tables; i.e. the range values are
4824 # not significant. Thus a number of operations can be safely added to it,
4825 # such as inversion, intersection. Note that union is also an unsafe
4826 # operation when range values are cared about, and that method is in the base
4827 # class, not here. But things are set up so that that method is callable only
4828 # during initialization. Only in this derived class, is there an operation
4829 # that combines two tables. A Range_Map can thus be used to initialize a
4830 # Range_List, and its mappings will be in the list, but are not significant to
4833 sub trace { return main::trace(@_); }
4839 '+' => sub { my $self = shift;
4842 return $self->_union($other)
4844 '+=' => sub { my $self = shift;
4846 my $reversed = shift;
4849 Carp::my_carp_bug("Bad news. Can't cope with '"
4853 . "'. undef returned.");
4857 return $self->_union($other)
4859 '&' => sub { my $self = shift;
4862 return $self->_intersect($other, 0);
4864 '&=' => sub { my $self = shift;
4866 my $reversed = shift;
4869 Carp::my_carp_bug("Bad news. Can't cope with '"
4873 . "'. undef returned.");
4877 return $self->_intersect($other, 0);
4884 # Returns a new Range_List that gives all code points not in $self.
4888 my $new = Range_List->new;
4890 # Go through each range in the table, finding the gaps between them
4891 my $max = -1; # Set so no gap before range beginning at 0
4892 for my $range ($self->ranges) {
4893 my $start = $range->start;
4894 my $end = $range->end;
4896 # If there is a gap before this range, the inverse will contain
4898 if ($start > $max + 1) {
4899 $new->add_range($max + 1, $start - 1);
4904 # And finally, add the gap from the end of the table to the max
4905 # possible code point
4906 if ($max < $MAX_WORKING_CODEPOINT) {
4907 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4913 # Returns a new Range_List with the argument deleted from it. The
4914 # argument can be a single code point, a range, or something that has
4915 # a range, with the _range_list() method on it returning them
4919 my $reversed = shift;
4920 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4923 Carp::my_carp_bug("Bad news. Can't cope with '"
4927 . "'. undef returned.");
4931 my $new = Range_List->new(Initialize => $self);
4933 if (! ref $other) { # Single code point
4934 $new->delete_range($other, $other);
4936 elsif ($other->isa('Range')) {
4937 $new->delete_range($other->start, $other->end);
4939 elsif ($other->can('_range_list')) {
4940 foreach my $range ($other->_range_list->ranges) {
4941 $new->delete_range($range->start, $range->end);
4945 Carp::my_carp_bug("Can't cope with a "
4947 . " argument to '-'. Subtraction ignored."
4956 # Returns either a boolean giving whether the two inputs' range lists
4957 # intersect (overlap), or a new Range_List containing the intersection
4958 # of the two lists. The optional final parameter being true indicates
4959 # to do the check instead of the intersection.
4961 my $a_object = shift;
4962 my $b_object = shift;
4963 my $check_if_overlapping = shift;
4964 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4965 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4967 if (! defined $b_object) {
4969 $message .= $a_object->_owner_name_of if defined $a_object;
4970 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4974 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4975 # Thus the intersection could be much more simply be written:
4976 # return ~(~$a_object + ~$b_object);
4977 # But, this is slower, and when taking the inverse of a large
4978 # range_size_1 table, back when such tables were always stored that
4979 # way, it became prohibitively slow, hence the code was changed to the
4982 if ($b_object->isa('Range')) {
4983 $b_object = Range_List->new(Initialize => $b_object,
4984 Owner => $a_object->_owner_name_of);
4986 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4988 my @a_ranges = $a_object->ranges;
4989 my @b_ranges = $b_object->ranges;
4991 #local $to_trace = 1 if main::DEBUG;
4992 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4994 # Start with the first range in each list
4996 my $range_a = $a_ranges[$a_i];
4998 my $range_b = $b_ranges[$b_i];
5000 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
5001 if ! $check_if_overlapping;
5003 # If either list is empty, there is no intersection and no overlap
5004 if (! defined $range_a || ! defined $range_b) {
5005 return $check_if_overlapping ? 0 : $new;
5007 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5009 # Otherwise, must calculate the intersection/overlap. Start with the
5010 # very first code point in each list
5011 my $a = $range_a->start;
5012 my $b = $range_b->start;
5014 # Loop through all the ranges of each list; in each iteration, $a and
5015 # $b are the current code points in their respective lists
5018 # If $a and $b are the same code point, ...
5021 # it means the lists overlap. If just checking for overlap
5022 # know the answer now,
5023 return 1 if $check_if_overlapping;
5025 # The intersection includes this code point plus anything else
5026 # common to both current ranges.
5028 my $end = main::min($range_a->end, $range_b->end);
5029 if (! $check_if_overlapping) {
5030 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5031 $new->add_range($start, $end);
5034 # Skip ahead to the end of the current intersect
5037 # If the current intersect ends at the end of either range (as
5038 # it must for at least one of them), the next possible one
5039 # will be the beginning code point in it's list's next range.
5040 if ($a == $range_a->end) {
5041 $range_a = $a_ranges[++$a_i];
5042 last unless defined $range_a;
5043 $a = $range_a->start;
5045 if ($b == $range_b->end) {
5046 $range_b = $b_ranges[++$b_i];
5047 last unless defined $range_b;
5048 $b = $range_b->start;
5051 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5055 # Not equal, but if the range containing $a encompasses $b,
5056 # change $a to be the middle of the range where it does equal
5057 # $b, so the next iteration will get the intersection
5058 if ($range_a->end >= $b) {
5063 # Here, the current range containing $a is entirely below
5064 # $b. Go try to find a range that could contain $b.
5065 $a_i = $a_object->_search_ranges($b);
5067 # If no range found, quit.
5068 last unless defined $a_i;
5070 # The search returns $a_i, such that
5071 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5072 # Set $a to the beginning of this new range, and repeat.
5073 $range_a = $a_ranges[$a_i];
5074 $a = $range_a->start;
5077 else { # Here, $b < $a.
5079 # Mirror image code to the leg just above
5080 if ($range_b->end >= $a) {
5084 $b_i = $b_object->_search_ranges($a);
5085 last unless defined $b_i;
5086 $range_b = $b_ranges[$b_i];
5087 $b = $range_b->start;
5090 } # End of looping through ranges.
5092 # Intersection fully computed, or now know that there is no overlap
5093 return $check_if_overlapping ? 0 : $new;
5097 # Returns boolean giving whether the two arguments overlap somewhere
5101 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5103 return $self->_intersect($other, 1);
5107 # Add a range to the list.
5112 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5114 return $self->_add_delete('+', $start, $end, "");
5117 sub matches_identically_to {
5118 # Return a boolean as to whether or not two Range_Lists match identical
5119 # sets of code points.
5123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5125 # These are ordered in increasing real time to figure out (at least
5126 # until a patch changes that and doesn't change this)
5127 return 0 if $self->max != $other->max;
5128 return 0 if $self->min != $other->min;
5129 return 0 if $self->range_count != $other->range_count;
5130 return 0 if $self->count != $other->count;
5132 # Here they could be identical because all the tests above passed.
5133 # The loop below is somewhat simpler since we know they have the same
5134 # number of elements. Compare range by range, until reach the end or
5135 # find something that differs.
5136 my @a_ranges = $self->ranges;
5137 my @b_ranges = $other->ranges;
5138 for my $i (0 .. @a_ranges - 1) {
5139 my $a = $a_ranges[$i];
5140 my $b = $b_ranges[$i];
5141 trace "self $a; other $b" if main::DEBUG && $to_trace;
5142 return 0 if ! defined $b
5143 || $a->start != $b->start
5144 || $a->end != $b->end;
5149 sub is_code_point_usable {
5150 # This used only for making the test script. See if the input
5151 # proposed trial code point is one that Perl will handle. If second
5152 # parameter is 0, it won't select some code points for various
5153 # reasons, noted below.
5156 my $try_hard = shift;
5157 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5159 return 0 if $code < 0; # Never use a negative
5161 # shun null. I'm (khw) not sure why this was done, but NULL would be
5162 # the character very frequently used.
5163 return $try_hard if $code == 0x0000;
5165 # shun non-character code points.
5166 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5167 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5169 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
5170 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5175 sub get_valid_code_point {
5176 # Return a code point that's part of the range list. Returns nothing
5177 # if the table is empty or we can't find a suitable code point. This
5178 # used only for making the test script.
5181 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5183 my $addr = do { no overloading; pack 'J', $self; };
5185 # On first pass, don't choose less desirable code points; if no good
5186 # one is found, repeat, allowing a less desirable one to be selected.
5187 for my $try_hard (0, 1) {
5189 # Look through all the ranges for a usable code point.
5190 for my $set (reverse $self->ranges) {
5192 # Try the edge cases first, starting with the end point of the
5194 my $end = $set->end;
5195 return $end if is_code_point_usable($end, $try_hard);
5196 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5198 # End point didn't, work. Start at the beginning and try
5199 # every one until find one that does work.
5200 for my $trial ($set->start .. $end - 1) {
5201 return $trial if is_code_point_usable($trial, $try_hard);
5205 return (); # If none found, give up.
5208 sub get_invalid_code_point {
5209 # Return a code point that's not part of the table. Returns nothing
5210 # if the table covers all code points or a suitable code point can't
5211 # be found. This used only for making the test script.
5214 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5216 # Just find a valid code point of the inverse, if any.
5217 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5219 } # end closure for Range_List
5222 use parent '-norequire', '_Range_List_Base';
5224 # A Range_Map is a range list in which the range values (called maps) are
5225 # significant, and hence shouldn't be manipulated by our other code, which
5226 # could be ambiguous or lose things. For example, in taking the union of two
5227 # lists, which share code points, but which have differing values, which one
5228 # has precedence in the union?
5229 # It turns out that these operations aren't really necessary for map tables,
5230 # and so this class was created to make sure they aren't accidentally
5236 # Add a range containing a mapping value to the list
5239 # Rest of parameters passed on
5241 return $self->_add_delete('+', @_);
5245 # Adds entry to a range list which can duplicate an existing entry
5248 my $code_point = shift;
5251 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5252 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5254 return $self->add_map($code_point, $code_point,
5255 $value, Replace => $replace);
5257 } # End of closure for package Range_Map
5259 package _Base_Table;
5261 # A table is the basic data structure that gets written out into a file for
5262 # use by the Perl core. This is the abstract base class implementing the
5263 # common elements from the derived ones. A list of the methods to be
5264 # furnished by an implementing class is just after the constructor.
5266 sub standardize { return main::standardize($_[0]); }
5267 sub trace { return main::trace(@_); }
5271 main::setup_package();
5274 # Object containing the ranges of the table.
5275 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5278 # The full table name.
5279 main::set_access('full_name', \%full_name, 'r');
5282 # The table name, almost always shorter
5283 main::set_access('name', \%name, 'r');
5286 # The shortest of all the aliases for this table, with underscores removed
5287 main::set_access('short_name', \%short_name);
5289 my %nominal_short_name_length;
5290 # The length of short_name before removing underscores
5291 main::set_access('nominal_short_name_length',
5292 \%nominal_short_name_length);
5295 # The complete name, including property.
5296 main::set_access('complete_name', \%complete_name, 'r');
5299 # Parent property this table is attached to.
5300 main::set_access('property', \%property, 'r');
5303 # Ordered list of alias objects of the table's name. The first ones in
5304 # the list are output first in comments
5305 main::set_access('aliases', \%aliases, 'readable_array');
5308 # A comment associated with the table for human readers of the files
5309 main::set_access('comment', \%comment, 's');
5312 # A comment giving a short description of the table's meaning for human
5313 # readers of the files.
5314 main::set_access('description', \%description, 'readable_array');
5317 # A comment giving a short note about the table for human readers of the
5319 main::set_access('note', \%note, 'readable_array');
5322 # Enum; there are a number of possibilities for what happens to this
5323 # table: it could be normal, or suppressed, or not for external use. See
5324 # values at definition for $SUPPRESSED.
5325 main::set_access('fate', \%fate, 'r');
5327 my %find_table_from_alias;
5328 # The parent property passes this pointer to a hash which this class adds
5329 # all its aliases to, so that the parent can quickly take an alias and
5331 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5334 # After this table is made equivalent to another one; we shouldn't go
5335 # changing the contents because that could mean it's no longer equivalent
5336 main::set_access('locked', \%locked, 'r');
5339 # This gives the final path to the file containing the table. Each
5340 # directory in the path is an element in the array
5341 main::set_access('file_path', \%file_path, 'readable_array');
5344 # What is the table's status, normal, $OBSOLETE, etc. Enum
5345 main::set_access('status', \%status, 'r');
5348 # A comment about its being obsolete, or whatever non normal status it has
5349 main::set_access('status_info', \%status_info, 'r');
5351 my %caseless_equivalent;
5352 # The table this is equivalent to under /i matching, if any.
5353 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5356 # Is the table to be output with each range only a single code point?
5357 # This is done to avoid breaking existing code that may have come to rely
5358 # on this behavior in previous versions of this program.)
5359 main::set_access('range_size_1', \%range_size_1, 'r', 's');
5362 # A boolean set iff this table is a Perl extension to the Unicode
5364 main::set_access('perl_extension', \%perl_extension, 'r');
5366 my %output_range_counts;
5367 # A boolean set iff this table is to have comments written in the
5368 # output file that contain the number of code points in the range.
5369 # The constructor can override the global flag of the same name.
5370 main::set_access('output_range_counts', \%output_range_counts, 'r');
5372 my %write_as_invlist;
5373 # A boolean set iff the output file for this table is to be in the form of
5374 # an inversion list/map.
5375 main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5378 # The format of the entries of the table. This is calculated from the
5379 # data in the table (or passed in the constructor). This is an enum e.g.,
5380 # $STRING_FORMAT. It is marked protected as it should not be generally
5381 # used to override calculations.
5382 main::set_access('format', \%format, 'r', 'p_s');
5385 # All arguments are key => value pairs, which you can see below, most
5386 # of which match fields documented above. Otherwise: Re_Pod_Entry,
5387 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5388 # documented in the Alias package
5390 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5394 my $self = bless \do { my $anonymous_scalar }, $class;
5395 my $addr = do { no overloading; pack 'J', $self; };
5399 $name{$addr} = delete $args{'Name'};
5400 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5401 $full_name{$addr} = delete $args{'Full_Name'};
5402 my $complete_name = $complete_name{$addr}
5403 = delete $args{'Complete_Name'};
5404 $format{$addr} = delete $args{'Format'};
5405 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5406 $property{$addr} = delete $args{'_Property'};
5407 $range_list{$addr} = delete $args{'_Range_List'};
5408 $status{$addr} = delete $args{'Status'} || $NORMAL;
5409 $status_info{$addr} = delete $args{'_Status_Info'} || "";
5410 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5411 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5412 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5413 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5414 my $ucd = delete $args{'UCD'};
5416 my $description = delete $args{'Description'};
5417 my $ok_as_filename = delete $args{'OK_as_Filename'};
5418 my $loose_match = delete $args{'Fuzzy'};
5419 my $note = delete $args{'Note'};
5420 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5421 my $perl_extension = delete $args{'Perl_Extension'};
5422 my $suppression_reason = delete $args{'Suppression_Reason'};
5424 # Shouldn't have any left over
5425 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5427 # Can't use || above because conceivably the name could be 0, and
5428 # can't use // operator in case this program gets used in Perl 5.8
5429 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5430 $output_range_counts{$addr} = $output_range_counts if
5431 ! defined $output_range_counts{$addr};
5433 $aliases{$addr} = [ ];
5434 $comment{$addr} = [ ];
5435 $description{$addr} = [ ];
5437 $file_path{$addr} = [ ];
5438 $locked{$addr} = "";
5440 push @{$description{$addr}}, $description if $description;
5441 push @{$note{$addr}}, $note if $note;
5443 if ($fate{$addr} == $PLACEHOLDER) {
5445 # A placeholder table doesn't get documented, is a perl extension,
5446 # and quite likely will be empty
5447 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5448 $perl_extension = 1 if ! defined $perl_extension;
5449 $ucd = 0 if ! defined $ucd;
5450 push @tables_that_may_be_empty, $complete_name{$addr};
5451 $self->add_comment(<<END);
5452 This is a placeholder because it is not in Version $string_version of Unicode,
5453 but is needed by the Perl core to work gracefully. Because it is not in this
5454 version of Unicode, it will not be listed in $pod_file.pod
5457 elsif (exists $why_suppressed{$complete_name}
5458 # Don't suppress if overridden
5459 && ! grep { $_ eq $complete_name{$addr} }
5460 @output_mapped_properties)
5462 $fate{$addr} = $SUPPRESSED;
5464 elsif ($fate{$addr} == $SUPPRESSED) {
5465 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5466 # Though currently unused
5468 elsif ($suppression_reason) {
5469 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5472 # If hasn't set its status already, see if it is on one of the
5473 # lists of properties or tables that have particular statuses; if
5474 # not, is normal. The lists are prioritized so the most serious
5475 # ones are checked first
5476 if (! $status{$addr}) {
5477 if (exists $why_deprecated{$complete_name}) {
5478 $status{$addr} = $DEPRECATED;
5480 elsif (exists $why_stabilized{$complete_name}) {
5481 $status{$addr} = $STABILIZED;
5483 elsif (exists $why_obsolete{$complete_name}) {
5484 $status{$addr} = $OBSOLETE;
5487 # Existence above doesn't necessarily mean there is a message
5488 # associated with it. Use the most serious message.
5489 if ($status{$addr}) {
5490 if ($why_deprecated{$complete_name}) {
5492 = $why_deprecated{$complete_name};
5494 elsif ($why_stabilized{$complete_name}) {
5496 = $why_stabilized{$complete_name};
5498 elsif ($why_obsolete{$complete_name}) {
5500 = $why_obsolete{$complete_name};
5505 $perl_extension{$addr} = $perl_extension || 0;
5507 # Don't list a property by default that is internal only
5508 if ($fate{$addr} > $MAP_PROXIED) {
5509 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5510 $ucd = 0 if ! defined $ucd;
5513 $ucd = 1 if ! defined $ucd;
5516 # By convention what typically gets printed only or first is what's
5517 # first in the list, so put the full name there for good output
5518 # clarity. Other routines rely on the full name being first on the
5520 $self->add_alias($full_name{$addr},
5521 OK_as_Filename => $ok_as_filename,
5522 Fuzzy => $loose_match,
5523 Re_Pod_Entry => $make_re_pod_entry,
5524 Status => $status{$addr},
5528 # Then comes the other name, if meaningfully different.
5529 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5530 $self->add_alias($name{$addr},
5531 OK_as_Filename => $ok_as_filename,
5532 Fuzzy => $loose_match,
5533 Re_Pod_Entry => $make_re_pod_entry,
5534 Status => $status{$addr},
5542 # Here are the methods that are required to be defined by any derived
5545 handle_special_range
5549 # write() knows how to write out normal ranges, but it calls
5550 # handle_special_range() when it encounters a non-normal one.
5551 # append_to_body() is called by it after it has handled all
5552 # ranges to add anything after the main portion of the table.
5553 # And finally, pre_body() is called after all this to build up
5554 # anything that should appear before the main portion of the
5555 # table. Doing it this way allows things in the middle to
5556 # affect what should appear before the main portion of the
5561 Carp::my_carp_bug( __LINE__
5562 . ": Must create method '$sub()' for "
5570 "." => \&main::_operator_dot,
5571 ".=" => \&main::_operator_dot_equal,
5572 '!=' => \&main::_operator_not_equal,
5573 '==' => \&main::_operator_equal,
5577 # Returns the array of ranges associated with this table.
5580 return $range_list{pack 'J', shift}->ranges;
5584 # Add a synonym for this table.
5586 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5589 my $name = shift; # The name to add.
5590 my $pointer = shift; # What the alias hash should point to. For
5591 # map tables, this is the parent property;
5592 # for match tables, it is the table itself.
5595 my $loose_match = delete $args{'Fuzzy'};
5597 my $ok_as_filename = delete $args{'OK_as_Filename'};
5598 $ok_as_filename = 1 unless defined $ok_as_filename;
5600 # An internal name does not get documented, unless overridden by the
5601 # input; same for making tests for it.
5602 my $status = delete $args{'Status'} || (($name =~ /^_/)
5605 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5606 // (($status ne $INTERNAL_ALIAS)
5607 ? (($name =~ /^_/) ? $NO : $YES)
5609 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5611 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5613 # Capitalize the first letter of the alias unless it is one of the CJK
5614 # ones which specifically begins with a lower 'k'. Do this because
5615 # Unicode has varied whether they capitalize first letters or not, and
5616 # have later changed their minds and capitalized them, but not the
5617 # other way around. So do it always and avoid changes from release to
5619 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5621 my $addr = do { no overloading; pack 'J', $self; };
5623 # Figure out if should be loosely matched if not already specified.
5624 if (! defined $loose_match) {
5626 # Is a loose_match if isn't null, and doesn't begin with an
5627 # underscore and isn't just a number
5629 && substr($name, 0, 1) ne '_'
5630 && $name !~ qr{^[0-9_.+-/]+$})
5639 # If this alias has already been defined, do nothing.
5640 return if defined $find_table_from_alias{$addr}->{$name};
5642 # That includes if it is standardly equivalent to an existing alias,
5643 # in which case, add this name to the list, so won't have to search
5645 my $standard_name = main::standardize($name);
5646 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5647 $find_table_from_alias{$addr}->{$name}
5648 = $find_table_from_alias{$addr}->{$standard_name};
5652 # Set the index hash for this alias for future quick reference.
5653 $find_table_from_alias{$addr}->{$name} = $pointer;
5654 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5655 local $to_trace = 0 if main::DEBUG;
5656 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5657 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5660 # Put the new alias at the end of the list of aliases unless the final
5661 # element begins with an underscore (meaning it is for internal perl
5662 # use) or is all numeric, in which case, put the new one before that
5663 # one. This floats any all-numeric or underscore-beginning aliases to
5664 # the end. This is done so that they are listed last in output lists,
5665 # to encourage the user to use a better name (either more descriptive
5666 # or not an internal-only one) instead. This ordering is relied on
5667 # implicitly elsewhere in this program, like in short_name()
5668 my $list = $aliases{$addr};
5669 my $insert_position = (@$list == 0
5670 || (substr($list->[-1]->name, 0, 1) ne '_'
5671 && $list->[-1]->name =~ /\D/))
5677 Alias->new($name, $loose_match, $make_re_pod_entry,
5678 $ok_as_filename, $status, $ucd);
5680 # This name may be shorter than any existing ones, so clear the cache
5681 # of the shortest, so will have to be recalculated.
5683 undef $short_name{pack 'J', $self};
5688 # Returns a name suitable for use as the base part of a file name.
5689 # That is, shorter wins. It can return undef if there is no suitable
5690 # name. The name has all non-essential underscores removed.
5692 # The optional second parameter is a reference to a scalar in which
5693 # this routine will store the length the returned name had before the
5694 # underscores were removed, or undef if the return is undef.
5696 # The shortest name can change if new aliases are added. So using
5697 # this should be deferred until after all these are added. The code
5698 # that does that should clear this one's cache.
5699 # Any name with alphabetics is preferred over an all numeric one, even
5703 my $nominal_length_ptr = shift;
5704 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5706 my $addr = do { no overloading; pack 'J', $self; };
5708 # For efficiency, don't recalculate, but this means that adding new
5709 # aliases could change what the shortest is, so the code that does
5710 # that needs to undef this.
5711 if (defined $short_name{$addr}) {
5712 if ($nominal_length_ptr) {
5713 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5715 return $short_name{$addr};
5718 # Look at each alias
5719 foreach my $alias ($self->aliases()) {
5721 # Don't use an alias that isn't ok to use for an external name.
5722 next if ! $alias->ok_as_filename;
5724 my $name = main::Standardize($alias->name);
5725 trace $self, $name if main::DEBUG && $to_trace;
5727 # Take the first one, or a shorter one that isn't numeric. This
5728 # relies on numeric aliases always being last in the array
5729 # returned by aliases(). Any alpha one will have precedence.
5730 if (! defined $short_name{$addr}
5732 && length($name) < length($short_name{$addr})))
5734 # Remove interior underscores.
5735 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5737 $nominal_short_name_length{$addr} = length $name;
5741 # If the short name isn't a nice one, perhaps an equivalent table has
5743 if (! defined $short_name{$addr}
5744 || $short_name{$addr} eq ""
5745 || $short_name{$addr} eq "_")
5748 foreach my $follower ($self->children) { # All equivalents
5749 my $follower_name = $follower->short_name;
5750 next unless defined $follower_name;
5752 # Anything (except undefined) is better than underscore or
5754 if (! defined $return || $return eq "_") {
5755 $return = $follower_name;
5759 # If the new follower name isn't "_" and is shorter than the
5760 # current best one, prefer the new one.
5761 next if $follower_name eq "_";
5762 next if length $follower_name > length $return;
5763 $return = $follower_name;
5765 $short_name{$addr} = $return if defined $return;
5768 # If no suitable external name return undef
5769 if (! defined $short_name{$addr}) {
5770 $$nominal_length_ptr = undef if $nominal_length_ptr;
5774 # Don't allow a null short name.
5775 if ($short_name{$addr} eq "") {
5776 $short_name{$addr} = '_';
5777 $nominal_short_name_length{$addr} = 1;
5780 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5782 if ($nominal_length_ptr) {
5783 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5785 return $short_name{$addr};
5789 # Returns the external name that this table should be known by. This
5790 # is usually the short_name, but not if the short_name is undefined,
5791 # in which case the external_name is arbitrarily set to the
5795 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5797 my $short = $self->short_name;
5798 return $short if defined $short;
5803 sub add_description { # Adds the parameter as a short description.
5806 my $description = shift;
5808 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5811 push @{$description{pack 'J', $self}}, $description;
5816 sub add_note { # Adds the parameter as a short note.
5821 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5824 push @{$note{pack 'J', $self}}, $note;
5829 sub add_comment { # Adds the parameter as a comment.
5831 return unless $debugging_build;
5834 my $comment = shift;
5835 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5840 push @{$comment{pack 'J', $self}}, $comment;
5846 # Return the current comment for this table. If called in list
5847 # context, returns the array of comments. In scalar, returns a string
5848 # of each element joined together with a period ending each.
5851 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5853 my $addr = do { no overloading; pack 'J', $self; };
5854 my @list = @{$comment{$addr}};
5855 return @list if wantarray;
5857 foreach my $sentence (@list) {
5858 $return .= '. ' if $return;
5859 $return .= $sentence;
5862 $return .= '.' if $return;
5867 # Initialize the table with the argument which is any valid
5868 # initialization for range lists.
5871 my $addr = do { no overloading; pack 'J', $self; };
5872 my $initialization = shift;
5873 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5875 # Replace the current range list with a new one of the same exact
5877 my $class = ref $range_list{$addr};
5878 $range_list{$addr} = $class->new(Owner => $self,
5879 Initialize => $initialization);
5885 # The header that is output for the table in the file it is written
5889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5892 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5897 sub merge_single_annotation_line ($$$) {
5898 my ($output, $annotation, $annotation_column) = @_;
5900 # This appends an annotation comment, $annotation, to $output,
5901 # starting in or after column $annotation_column, removing any
5902 # pre-existing comment from $output.
5904 $annotation =~ s/^ \s* \# \ //x;
5905 $output =~ s/ \s* ( \# \N* )? \n //x;
5906 $output = Text::Tabs::expand($output);
5908 my $spaces = $annotation_column - length $output;
5909 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment
5911 $output = sprintf "%s%*s# %s",
5916 return Text::Tabs::unexpand $output;
5920 # Write a representation of the table to its file. It calls several
5921 # functions furnished by sub-classes of this abstract base class to
5922 # handle non-normal ranges, to add stuff before the table, and at its
5923 # end. If the table is to be written so that adjustments are
5924 # required, this does that conversion.
5927 my $use_adjustments = shift; # ? output in adjusted format or not
5928 my $suppress_value = shift; # Optional, if the value associated with
5929 # a range equals this one, don't write
5931 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5933 my $addr = do { no overloading; pack 'J', $self; };
5934 my $write_as_invlist = $write_as_invlist{$addr};
5936 # Start with the header
5937 my @HEADER = $self->header;
5940 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5943 # Things discovered processing the main body of the document may
5944 # affect what gets output before it, therefore pre_body() isn't called
5945 # until after all other processing of the table is done.
5947 # The main body looks like a 'here' document. If there are comments,
5948 # get rid of them when processing it.
5950 if ($annotate || $output_range_counts) {
5951 # Use the line below in Perls that don't have /r
5952 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5953 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5955 push @OUT, "return <<'END';\n";
5958 if ($range_list{$addr}->is_empty) {
5960 # This is a kludge for empty tables to silence a warning in
5961 # utf8.c, which can't really deal with empty tables, but it can
5962 # deal with a table that matches nothing, as the inverse of 'All'
5964 push @OUT, "!utf8::All\n";
5966 elsif ($self->name eq 'N'
5968 # To save disk space and table cache space, avoid putting out
5969 # binary N tables, but instead create a file which just inverts
5970 # the Y table. Since the file will still exist and occupy a
5971 # certain number of blocks, might as well output the whole
5972 # thing if it all will fit in one block. The number of
5973 # ranges below is an approximate number for that.
5974 && ($self->property->type == $BINARY
5975 || $self->property->type == $FORCED_BINARY)
5976 # && $self->property->tables == 2 Can't do this because the
5977 # non-binary properties, like NFDQC aren't specifiable
5979 && $range_list{$addr}->ranges > 15
5980 && ! $annotate) # Under --annotate, want to see everything
5982 push @OUT, "!utf8::" . $self->property->name . "\n";
5985 my $range_size_1 = $range_size_1{$addr};
5987 # To make it more readable, use a minimum indentation
5990 # These are used only in $annotate option
5991 my $format; # e.g. $HEX_ADJUST_FORMAT
5992 my $include_name; # ? Include the character's name in the
5994 my $include_cp; # ? Include its code point
5997 $comment_indent = ($self->isa('Map_Table'))
5999 : ($write_as_invlist)
6004 $format = $self->format;
6006 # The name of the character is output only for tables that
6007 # don't already include the name in the output.
6008 my $property = $self->property;
6010 ! ($property == $perl_charname
6011 || $property == main::property_ref('Unicode_1_Name')
6012 || $property == main::property_ref('Name')
6013 || $property == main::property_ref('Name_Alias')
6016 # Don't include the code point in the annotation where all
6017 # lines are a single code point, so it can be easily found in
6019 $include_cp = ! $range_size_1;
6021 if (! $self->isa('Map_Table')) {
6022 $comment_indent = ($write_as_invlist) ? 8 : 16;
6025 $comment_indent = 16;
6027 # There are just a few short ranges in this table, so no
6028 # need to include the code point in the annotation.
6029 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
6031 # We're trying to get this to look good, as the whole
6032 # point is to make human-readable tables. It is easier to
6033 # read if almost all the annotation comments begin in the
6034 # same column. Map tables have varying width maps, so can
6035 # create a jagged comment appearance. This code does a
6036 # preliminary pass through these tables looking for the
6037 # maximum width map in each, and causing the comments to
6038 # begin just to the right of that. However, if the
6039 # comments begin too far to the right of most lines, it's
6040 # hard to line them up horizontally with their real data.
6041 # Therefore we ignore the longest outliers
6042 my $ignore_longest_X_percent = 2; # Discard longest X%
6044 # Each key in this hash is a width of at least one of the
6045 # maps in the table. Its value is how many lines have
6049 # We won't space things further left than one tab stop
6050 # after the rest of the line; initializing it to that
6051 # number saves some work.
6052 my $max_map_width = 8;
6054 # Fill in the %widths hash
6056 for my $set ($range_list{$addr}->ranges) {
6057 my $value = $set->value;
6059 # These range types don't appear in the main table
6060 next if $set->type == 0
6061 && defined $suppress_value
6062 && $value eq $suppress_value;
6063 next if $set->type == $MULTI_CP
6064 || $set->type == $NULL;
6066 # Include 2 spaces before the beginning of the
6068 my $this_width = length($value) + 2;
6070 # Ranges of the remaining non-zero types usually
6071 # occupy just one line (maybe occasionally two, but
6072 # this doesn't have to be dead accurate). This is
6073 # because these ranges are like "unassigned code
6075 my $count = ($set->type != 0)
6077 : $set->end - $set->start + 1;
6078 $widths{$this_width} += $count;
6080 $max_map_width = $this_width
6081 if $max_map_width < $this_width;
6084 # If the widest map gives us less than two tab stops
6085 # worth, just take it as-is.
6086 if ($max_map_width > 16) {
6088 # Otherwise go through %widths until we have included
6089 # the desired percentage of lines in the whole table.
6090 my $running_total = 0;
6091 foreach my $width (sort { $a <=> $b } keys %widths)
6093 $running_total += $widths{$width};
6095 if ($running_total * 100 / $total
6096 >= 100 - $ignore_longest_X_percent)
6098 $max_map_width = $width;
6103 $comment_indent += $max_map_width;
6107 # Values for previous time through the loop. Initialize to
6108 # something that won't be adjacent to the first iteration;
6109 # only $previous_end matters for that.
6111 my $previous_end = -2;
6114 # Values for next time through the portion of the loop that splits
6115 # the range. 0 in $next_start means there is no remaining portion
6121 my $invlist_count = 0;
6123 my $output_value_in_hex = $self->isa('Map_Table')
6124 && ($self->format eq $HEX_ADJUST_FORMAT
6125 || $self->to_output_map == $EXTERNAL_MAP);
6126 # Use leading zeroes just for files whose format should not be
6127 # changed from what it has been. Otherwise, they just take up
6128 # space and time to process.
6129 my $hex_format = ($self->isa('Map_Table')
6130 && $self->to_output_map == $EXTERNAL_MAP)
6134 # The values for some of these tables are stored in mktables as
6135 # hex strings. Normally, these are just output as strings without
6136 # change, but when we are doing adjustments, we have to operate on
6137 # these numerically, so we convert those to decimal to do that,
6138 # and back to hex for output
6139 my $convert_map_to_from_hex = 0;
6140 my $output_map_in_hex = 0;
6141 if ($self->isa('Map_Table')) {
6142 $convert_map_to_from_hex
6143 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6144 || ($annotate && $self->format eq $HEX_FORMAT);
6145 $output_map_in_hex = $convert_map_to_from_hex
6146 || $self->format eq $HEX_FORMAT;
6149 # To store any annotations about the characters.
6152 # Output each range as part of the here document.
6154 for my $set ($range_list{$addr}->ranges) {
6155 if ($set->type != 0) {
6156 $self->handle_special_range($set);
6159 my $start = $set->start;
6160 my $end = $set->end;
6161 my $value = $set->value;
6163 # Don't output ranges whose value is the one to suppress
6164 next RANGE if defined $suppress_value
6165 && $value eq $suppress_value;
6167 $value = CORE::hex $value if $convert_map_to_from_hex;
6170 { # This bare block encloses the scope where we may need to
6171 # 'redo' to. Consider a table that is to be written out
6172 # using single item ranges. This is given in the
6173 # $range_size_1 boolean. To accomplish this, we split the
6174 # range each time through the loop into two portions, the
6175 # first item, and the rest. We handle that first item
6176 # this time in the loop, and 'redo' to repeat the process
6177 # for the rest of the range.
6179 # We may also have to do it, with other special handling,
6180 # if the table has adjustments. Consider the table that
6181 # contains the lowercasing maps. mktables stores the
6182 # ASCII range ones as 26 ranges:
6183 # ord('A') => ord('a'), .. ord('Z') => ord('z')
6184 # For compactness, the table that gets written has this as
6186 # ( ord('A') .. ord('Z') ) => ord('a')
6187 # and the software that reads the tables is smart enough
6188 # to "connect the dots". This change is accomplished in
6189 # this loop by looking to see if the current iteration
6190 # fits the paradigm of the previous iteration, and if so,
6191 # we merge them by replacing the final output item with
6192 # the merged data. Repeated 25 times, this gets A-Z. But
6193 # we also have to make sure we don't screw up cases where
6194 # we have internally stored
6195 # ( 0x1C4 .. 0x1C6 ) => 0x1C5
6196 # This single internal range has to be output as 3 ranges,
6197 # which is done by splitting, like we do for $range_size_1
6198 # tables. (There are very few of such ranges that need to
6199 # be split, so the gain of doing the combining of other
6200 # ranges far outweighs the splitting of these.) The
6201 # values to use for the redo at the end of this block are
6202 # set up just below in the scalars whose names begin with
6205 if (($use_adjustments || $range_size_1) && $end != $start)
6207 $next_start = $start + 1;
6209 $next_value = $value;
6213 if ($use_adjustments && ! $range_size_1) {
6215 # If this range is adjacent to the previous one, and
6216 # the values in each are integers that are also
6217 # adjacent (differ by 1), then this range really
6218 # extends the previous one that is already in element
6219 # $OUT[-1]. So we pop that element, and pretend that
6220 # the range starts with whatever it started with.
6221 # $offset is incremented by 1 each time so that it
6222 # gives the current offset from the first element in
6223 # the accumulating range, and we keep in $value the
6224 # value of that first element.
6225 if ($start == $previous_end + 1
6226 && $value =~ /^ -? \d+ $/xa
6227 && $previous_value =~ /^ -? \d+ $/xa
6228 && ($value == ($previous_value + ++$offset)))
6231 $start = $previous_start;
6232 $value = $previous_value;
6236 if (@annotation == 1) {
6237 $OUT[-1] = merge_single_annotation_line(
6238 $OUT[-1], $annotation[0], $comment_indent);
6241 push @OUT, @annotation;
6246 # Save the current values for the next time through
6248 $previous_start = $start;
6249 $previous_end = $end;
6250 $previous_value = $value;
6253 if ($write_as_invlist) {
6255 # Inversion list format has a single number per line,
6256 # the starting code point of a range that matches the
6258 push @OUT, $start, "\n";
6261 # Add a comment with the size of the range, if
6263 if ($output_range_counts{$addr}) {
6264 $OUT[-1] = merge_single_annotation_line(
6267 . main::clarify_code_point_count($end - $start + 1)
6272 elsif ($start != $end) { # If there is a range
6273 if ($end == $MAX_WORKING_CODEPOINT) {
6274 push @OUT, sprintf "$hex_format\t$hex_format",
6276 $MAX_PLATFORM_CODEPOINT;
6279 push @OUT, sprintf "$hex_format\t$hex_format",
6282 if (length $value) {
6283 if ($convert_map_to_from_hex) {
6284 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6287 $OUT[-1] .= "\t$value\n";
6291 # Add a comment with the size of the range, if
6293 if ($output_range_counts{$addr}) {
6294 $OUT[-1] = merge_single_annotation_line(
6297 . main::clarify_code_point_count($end - $start + 1)
6302 else { # Here to output a single code point per line.
6304 # Use any passed in subroutine to output.
6305 if (ref $range_size_1 eq 'CODE') {
6306 for my $i ($start .. $end) {
6307 push @OUT, &{$range_size_1}($i, $value);
6312 # Here, caller is ok with default output.
6313 for (my $i = $start; $i <= $end; $i++) {
6314 if ($convert_map_to_from_hex) {
6316 sprintf "$hex_format\t\t$hex_format\n",
6320 push @OUT, sprintf $hex_format, $i;
6321 $OUT[-1] .= "\t\t$value" if $value ne "";
6329 for (my $i = $start; $i <= $end; $i++) {
6330 my $annotation = "";
6332 # Get character information if don't have it already
6333 main::populate_char_info($i)
6334 if ! defined $viacode[$i];
6335 my $type = $annotate_char_type[$i];
6337 # Figure out if should output the next code points
6338 # as part of a range or not. If this is not in an
6339 # annotation range, then won't output as a range,
6340 # so returns $i. Otherwise use the end of the
6341 # annotation range, but no further than the
6342 # maximum possible end point of the loop.
6347 $annotate_ranges->value_of($i) || $i,
6350 # Use a range if it is a range, and either is one
6351 # of the special annotation ranges, or the range
6352 # is at most 3 long. This last case causes the
6353 # algorithmically named code points to be output
6354 # individually in spans of at most 3, as they are
6355 # the ones whose $type is > 0.
6356 if ($range_end != $i
6357 && ( $type < 0 || $range_end - $i > 2))
6359 # Here is to output a range. We don't allow a
6360 # caller-specified output format--just use the
6362 my $range_name = $viacode[$i];
6364 # For the code points which end in their hex
6365 # value, we eliminate that from the output
6366 # annotation, and capitalize only the first
6367 # letter of each word.
6368 if ($type == $CP_IN_NAME) {
6369 my $hex = sprintf $hex_format, $i;
6370 $range_name =~ s/-$hex$//;
6371 my @words = split " ", $range_name;
6372 for my $word (@words) {
6374 ucfirst(lc($word)) if $word ne 'CJK';
6376 $range_name = join " ", @words;
6378 elsif ($type == $HANGUL_SYLLABLE) {
6379 $range_name = "Hangul Syllable";
6382 if ($i != $start || $range_end < $end) {
6383 if ($range_end < $MAX_WORKING_CODEPOINT)
6385 $annotation = sprintf "%04X..%04X",
6389 $annotation = sprintf "%04X..INFINITY",
6393 else { # Indent if not displaying code points
6394 $annotation = " " x 4;
6397 $annotation .= " $age[$i]" if $age[$i];
6398 $annotation .= " $range_name";
6401 # Include the number of code points in the
6404 main::clarify_code_point_count($range_end - $i + 1);
6405 $annotation .= " [$count]\n";
6407 # Skip to the end of the range
6410 else { # Not in a range.
6413 # When outputting the names of each character,
6414 # use the character itself if printable
6415 $comment .= "'" . main::display_chr($i) . "' "
6418 my $output_value = $value;
6420 # Determine the annotation
6421 if ($format eq $DECOMP_STRING_FORMAT) {
6423 # This is very specialized, with the type
6424 # of decomposition beginning the line
6425 # enclosed in <...>, and the code points
6426 # that the code point decomposes to
6427 # separated by blanks. Create two
6428 # strings, one of the printable
6429 # characters, and one of their official
6431 (my $map = $output_value)
6432 =~ s/ \ * < .*? > \ +//x;
6436 foreach my $to (split " ", $map) {
6437 $to = CORE::hex $to;
6438 $to_name .= " + " if $to_name;
6439 $to_chr .= main::display_chr($to);
6440 main::populate_char_info($to)
6441 if ! defined $viacode[$to];
6442 $to_name .= $viacode[$to];
6446 "=> '$to_chr'; $viacode[$i] => $to_name";
6449 $output_value += $i - $start
6451 # Don't try to adjust a
6453 && $output_value !~ /[-\D]/;
6455 if ($output_map_in_hex) {
6456 main::populate_char_info($output_value)
6457 if ! defined $viacode[$output_value];
6459 . main::display_chr($output_value)
6460 . "'; " if $printable[$output_value];
6462 if ($include_name && $viacode[$i]) {
6463 $comment .= " " if $comment;
6464 $comment .= $viacode[$i];
6466 if ($output_map_in_hex) {
6468 " => $viacode[$output_value]"
6469 if $viacode[$output_value];
6470 $output_value = sprintf($hex_format,
6476 $annotation = sprintf "%04X %s", $i, $age[$i];
6477 if ($use_adjustments) {
6478 $annotation .= " => $output_value";
6482 if ($comment ne "") {
6483 $annotation .= " " if $annotation ne "";
6484 $annotation .= $comment;
6486 $annotation .= "\n" if $annotation ne "";
6489 if ($annotation ne "") {
6490 push @annotation, (" " x $comment_indent)
6495 # If not adjusting, we don't have to go through the
6496 # loop again to know that the annotation comes next
6498 if (! $use_adjustments) {
6499 if (@annotation == 1) {
6500 $OUT[-1] = merge_single_annotation_line(
6501 $OUT[-1], $annotation[0], $comment_indent);
6504 push @OUT, map { Text::Tabs::unexpand $_ }
6511 # Add the beginning of the range that doesn't match the
6512 # property, except if the just added match range extends
6513 # to infinity. We do this after any annotations for the
6515 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6516 push @OUT, $end + 1, "\n";
6520 # If we split the range, set up so the next time through
6521 # we get the remainder, and redo.
6523 $start = $next_start;
6525 $value = $next_value;
6530 } # End of loop through all the table's ranges
6532 push @OUT, @annotation; # Add orphaned annotation, if any
6534 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6537 # Add anything that goes after the main body, but within the here
6539 my $append_to_body = $self->append_to_body;
6540 push @OUT, $append_to_body if $append_to_body;
6542 # And finish the here document.
6545 # Done with the main portion of the body. Can now figure out what
6546 # should appear before it in the file.
6547 my $pre_body = $self->pre_body;
6548 push @HEADER, $pre_body, "\n" if $pre_body;
6550 # All these files should have a .pl suffix added to them.
6551 my @file_with_pl = @{$file_path{$addr}};
6552 $file_with_pl[-1] .= '.pl';
6554 main::write(\@file_with_pl,
6555 $annotate, # utf8 iff annotating
6561 sub set_status { # Set the table's status
6563 my $status = shift; # The status enum value
6564 my $info = shift; # Any message associated with it.
6565 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6567 my $addr = do { no overloading; pack 'J', $self; };
6569 $status{$addr} = $status;
6570 $status_info{$addr} = $info;
6574 sub set_fate { # Set the fate of a table
6578 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6580 my $addr = do { no overloading; pack 'J', $self; };
6582 return if $fate{$addr} == $fate; # If no-op
6584 # Can only change the ordinary fate, except if going to $MAP_PROXIED
6585 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6587 $fate{$addr} = $fate;
6589 # Don't document anything to do with a non-normal fated table
6590 if ($fate != $ORDINARY) {
6591 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6592 foreach my $alias ($self->aliases) {
6593 $alias->set_ucd($put_in_pod);
6595 # MAP_PROXIED doesn't affect the match tables
6596 next if $fate == $MAP_PROXIED;
6597 $alias->set_make_re_pod_entry($put_in_pod);
6601 # Save the reason for suppression for output
6602 if ($fate >= $SUPPRESSED) {
6603 $reason = "" unless defined $reason;
6604 $why_suppressed{$complete_name{$addr}} = $reason;
6611 # Don't allow changes to the table from now on. This stores a stack
6612 # trace of where it was called, so that later attempts to modify it
6613 # can immediately show where it got locked.
6616 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6618 my $addr = do { no overloading; pack 'J', $self; };
6620 $locked{$addr} = "";
6622 my $line = (caller(0))[2];
6625 # Accumulate the stack trace
6627 my ($pkg, $file, $caller_line, $caller) = caller $i++;
6629 last unless defined $caller;
6631 $locked{$addr} .= " called from $caller() at line $line\n";
6632 $line = $caller_line;
6634 $locked{$addr} .= " called from main at line $line\n";
6639 sub carp_if_locked {
6640 # Return whether a table is locked or not, and, by the way, complain
6644 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6646 my $addr = do { no overloading; pack 'J', $self; };
6648 return 0 if ! $locked{$addr};
6649 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6653 sub set_file_path { # Set the final directory path for this table
6655 # Rest of parameters passed on
6658 @{$file_path{pack 'J', $self}} = @_;
6662 # Accessors for the range list stored in this table. First for
6671 matches_identically_to
6684 return $self->_range_list->$sub(@_);
6688 # Then for ones that should fail if locked
6698 return if $self->carp_if_locked;
6700 return $self->_range_list->$sub(@_);
6707 use parent '-norequire', '_Base_Table';
6709 # A Map Table is a table that contains the mappings from code points to
6710 # values. There are two weird cases:
6711 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6712 # are written in the table's file at the end of the table nonetheless. It
6713 # requires specially constructed code to handle these; utf8.c can not read
6714 # these in, so they should not go in $map_directory. As of this writing,
6715 # the only case that these happen is for named sequences used in
6716 # charnames.pm. But this code doesn't enforce any syntax on these, so
6717 # something else could come along that uses it.
6718 # 2) Specials are anything that doesn't fit syntactically into the body of the
6719 # table. The ranges for these have a map type of non-zero. The code below
6720 # knows about and handles each possible type. In most cases, these are
6721 # written as part of the header.
6723 # A map table deliberately can't be manipulated at will unlike match tables.
6724 # This is because of the ambiguities having to do with what to do with
6725 # overlapping code points. And there just isn't a need for those things;
6726 # what one wants to do is just query, add, replace, or delete mappings, plus
6727 # write the final result.
6728 # However, there is a method to get the list of possible ranges that aren't in
6729 # this table to use for defaulting missing code point mappings. And,
6730 # map_add_or_replace_non_nulls() does allow one to add another table to this
6731 # one, but it is clearly very specialized, and defined that the other's
6732 # non-null values replace this one's if there is any overlap.
6734 sub trace { return main::trace(@_); }
6738 main::setup_package();
6741 # Many input files omit some entries; this gives what the mapping for the
6742 # missing entries should be
6743 main::set_access('default_map', \%default_map, 'r');
6745 my %anomalous_entries;
6746 # Things that go in the body of the table which don't fit the normal
6747 # scheme of things, like having a range. Not much can be done with these
6748 # once there except to output them. This was created to handle named
6750 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6751 main::set_access('anomalous_entries', # Append singular, read plural
6752 \%anomalous_entries,
6755 my %replacement_property;
6756 # Certain files are unused by Perl itself, and are kept only for backwards
6757 # compatibility for programs that used them before Unicode::UCD existed.
6758 # These are termed legacy properties. At some point they may be removed,
6759 # but for now mark them as legacy. If non empty, this is the name of the
6760 # property to use instead (i.e., the modern equivalent).
6761 main::set_access('replacement_property', \%replacement_property, 'r');
6764 # Enum as to whether or not to write out this map table, and how:
6766 # $EXTERNAL_MAP means its existence is noted in the documentation, and
6767 # it should not be removed nor its format changed. This
6768 # is done for those files that have traditionally been
6769 # output. Maps of legacy-only properties default to
6771 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
6773 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6774 # outputting the actual mappings as-is, we adjust things
6775 # to create a much more compact table. Only those few
6776 # tables where the mapping is convertible at least to an
6777 # integer and compacting makes a big difference should
6778 # have this. Hence, the default is to not do this
6779 # unless the table's default mapping is to $CODE_POINT,
6780 # and the range size is not 1.
6781 main::set_access('to_output_map', \%to_output_map, 's');
6789 # Optional initialization data for the table.
6790 my $initialize = delete $args{'Initialize'};
6792 my $default_map = delete $args{'Default_Map'};
6793 my $property = delete $args{'_Property'};
6794 my $full_name = delete $args{'Full_Name'};
6795 my $replacement_property = delete $args{'Replacement_Property'} // "";
6796 my $to_output_map = delete $args{'To_Output_Map'};
6798 # Rest of parameters passed on; legacy properties have several common
6800 if ($replacement_property) {
6801 $args{"Fate"} = $LEGACY_ONLY;
6802 $args{"Range_Size_1"} = 1;
6803 $args{"Perl_Extension"} = 1;
6807 my $range_list = Range_Map->new(Owner => $property);
6809 my $self = $class->SUPER::new(
6811 Complete_Name => $full_name,
6812 Full_Name => $full_name,
6813 _Property => $property,
6814 _Range_List => $range_list,
6815 Write_As_Invlist => 0,
6818 my $addr = do { no overloading; pack 'J', $self; };
6820 $anomalous_entries{$addr} = [];
6821 $default_map{$addr} = $default_map;
6822 $replacement_property{$addr} = $replacement_property;
6823 $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6824 && $replacement_property;
6825 $to_output_map{$addr} = $to_output_map;
6827 $self->initialize($initialize) if defined $initialize;
6834 qw("") => "_operator_stringify",
6837 sub _operator_stringify {
6840 my $name = $self->property->full_name;
6841 $name = '""' if $name eq "";
6842 return "Map table for Property '$name'";
6846 # Add a synonym for this table (which means the property itself)
6849 # Rest of parameters passed on.
6851 $self->SUPER::add_alias($name, $self->property, @_);
6856 # Add a range of code points to the list of specially-handled code
6857 # points. $MULTI_CP is assumed if the type of special is not passed
6866 my $type = delete $args{'Type'} || 0;
6867 # Rest of parameters passed on
6869 # Can't change the table if locked.
6870 return if $self->carp_if_locked;
6872 my $addr = do { no overloading; pack 'J', $self; };
6874 $self->_range_list->add_map($lower, $upper,
6881 sub append_to_body {
6882 # Adds to the written HERE document of the table's body any anomalous
6883 # entries in the table..
6886 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6888 my $addr = do { no overloading; pack 'J', $self; };
6890 return "" unless @{$anomalous_entries{$addr}};
6891 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6894 sub map_add_or_replace_non_nulls {
6895 # This adds the mappings in the table $other to $self. Non-null
6896 # mappings from $other override those in $self. It essentially merges
6897 # the two tables, with the second having priority except for null
6902 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6904 return if $self->carp_if_locked;
6906 if (! $other->isa(__PACKAGE__)) {
6907 Carp::my_carp_bug("$other should be a "
6915 my $addr = do { no overloading; pack 'J', $self; };
6916 my $other_addr = do { no overloading; pack 'J', $other; };
6918 local $to_trace = 0 if main::DEBUG;
6920 my $self_range_list = $self->_range_list;
6921 my $other_range_list = $other->_range_list;
6922 foreach my $range ($other_range_list->ranges) {
6923 my $value = $range->value;
6924 next if $value eq "";
6925 $self_range_list->_add_delete('+',
6929 Type => $range->type,
6930 Replace => $UNCONDITIONALLY);
6936 sub set_default_map {
6937 # Define what code points that are missing from the input files should
6942 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6944 my $addr = do { no overloading; pack 'J', $self; };
6946 # Convert the input to the standard equivalent, if any (won't have any
6947 # for $STRING properties)
6948 my $standard = $self->_find_table_from_alias->{$map};
6949 $map = $standard->name if defined $standard;
6951 # Warn if there already is a non-equivalent default map for this
6952 # property. Note that a default map can be a ref, which means that
6953 # what it actually means is delayed until later in the program, and it
6954 # IS permissible to override it here without a message.
6955 my $default_map = $default_map{$addr};
6956 if (defined $default_map
6957 && ! ref($default_map)
6958 && $default_map ne $map
6959 && main::Standardize($map) ne $default_map)
6961 my $property = $self->property;
6962 my $map_table = $property->table($map);
6963 my $default_table = $property->table($default_map);
6964 if (defined $map_table
6965 && defined $default_table
6966 && $map_table != $default_table)
6968 Carp::my_carp("Changing the default mapping for "
6970 . " from $default_map to $map'");
6974 $default_map{$addr} = $map;
6976 # Don't also create any missing table for this map at this point,
6977 # because if we did, it could get done before the main table add is
6978 # done for PropValueAliases.txt; instead the caller will have to make
6979 # sure it exists, if desired.
6984 # Returns boolean: should we write this map table?
6987 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6989 my $addr = do { no overloading; pack 'J', $self; };
6991 # If overridden, use that
6992 return $to_output_map{$addr} if defined $to_output_map{$addr};
6994 my $full_name = $self->full_name;
6995 return $global_to_output_map{$full_name}
6996 if defined $global_to_output_map{$full_name};
6998 # If table says to output, do so; if says to suppress it, do so.
6999 my $fate = $self->fate;
7000 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
7001 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
7002 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
7004 my $type = $self->property->type;
7006 # Don't want to output binary map tables even for debugging.
7007 return 0 if $type == $BINARY;
7009 # But do want to output string ones. All the ones that remain to
7010 # be dealt with (i.e. which haven't explicitly been set to external)
7011 # are for internal Perl use only. The default for those that map to
7012 # $CODE_POINT and haven't been restricted to a single element range
7013 # is to use the adjusted form.
7014 if ($type == $STRING) {
7015 return $INTERNAL_MAP if $self->range_size_1
7016 || $default_map{$addr} ne $CODE_POINT;
7017 return $OUTPUT_ADJUSTED;
7020 # Otherwise is an $ENUM, do output it, for Perl's purposes
7021 return $INTERNAL_MAP;
7025 # Returns a Range_List that is gaps of the current table. That is,
7029 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7031 my $current = Range_List->new(Initialize => $self->_range_list,
7032 Owner => $self->property);
7038 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7040 my $return = $self->SUPER::header();
7042 if ($self->to_output_map >= $INTERNAL_MAP) {
7043 $return .= $INTERNAL_ONLY_HEADER;
7046 my $property_name = $self->property->replacement_property;
7048 # The legacy-only properties were gotten above; but there are some
7049 # other properties whose files are in current use that have fixed
7051 $property_name = $self->property->full_name unless $property_name;
7055 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
7057 # This file is for internal use by core Perl only. It is retained for
7058 # backwards compatibility with applications that may have come to rely on it,
7059 # but its format and even its name or existence are subject to change without
7060 # notice in a future Perl version. Don't use it directly. Instead, its
7061 # contents are now retrievable through a stable API in the Unicode::UCD
7062 # module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
7063 # code points can be retrieved via Unicode::UCD::charprop());
7069 sub set_final_comment {
7070 # Just before output, create the comment that heads the file
7071 # containing this table.
7073 return unless $debugging_build;
7076 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7078 # No sense generating a comment if aren't going to write it out.
7079 return if ! $self->to_output_map;
7081 my $addr = do { no overloading; pack 'J', $self; };
7083 my $property = $self->property;
7085 # Get all the possible names for this property. Don't use any that
7086 # aren't ok for use in a file name, etc. This is perhaps causing that
7087 # flag to do double duty, and may have to be changed in the future to
7088 # have our own flag for just this purpose; but it works now to exclude
7089 # Perl generated synonyms from the lists for properties, where the
7090 # name is always the proper Unicode one.
7091 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7093 my $count = $self->count;
7094 my $default_map = $default_map{$addr};
7096 # The ranges that map to the default aren't output, so subtract that
7097 # to get those actually output. A property with matching tables
7098 # already has the information calculated.
7099 if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7100 $count -= $property->table($default_map)->count;
7102 elsif (defined $default_map) {
7104 # But for $STRING properties, must calculate now. Subtract the
7105 # count from each range that maps to the default.
7106 foreach my $range ($self->_range_list->ranges) {
7107 if ($range->value eq $default_map) {
7108 $count -= $range->end +1 - $range->start;
7114 # Get a string version of $count with underscores in large numbers,
7116 my $string_count = main::clarify_code_point_count($count);
7118 my $code_points = ($count == 1)
7119 ? 'single code point'
7120 : "$string_count code points";
7125 if (@property_aliases <= 1) {
7126 $mapping = 'mapping';
7127 $these_mappings = 'this mapping';
7131 $mapping = 'synonymous mappings';
7132 $these_mappings = 'these mappings';
7136 if ($count >= $MAX_UNICODE_CODEPOINTS) {
7137 $cp = "any code point in Unicode Version $string_version";
7141 if ($default_map eq "") {
7142 $map_to = 'the null string';
7144 elsif ($default_map eq $CODE_POINT) {
7148 $map_to = "'$default_map'";
7151 $cp = "the single code point";
7154 $cp = "one of the $code_points";
7156 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7161 my $status = $self->status;
7162 if ($status ne $NORMAL) {
7163 my $warn = uc $status_past_participles{$status};
7166 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
7167 All property or property=value combinations contained in this file are $warn.
7168 See $unicode_reference_url for what this means.
7172 $comment .= "This file returns the $mapping:\n";
7174 my $ucd_accessible_name = "";
7175 my $has_underscore_name = 0;
7176 my $full_name = $self->property->full_name;
7177 for my $i (0 .. @property_aliases - 1) {
7178 my $name = $property_aliases[$i]->name;
7179 $has_underscore_name = 1 if $name =~ /^_/;
7180 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7181 if ($property_aliases[$i]->ucd) {
7182 if ($name eq $full_name) {
7183 $ucd_accessible_name = $full_name;
7185 elsif (! $ucd_accessible_name) {
7186 $ucd_accessible_name = $name;
7190 $comment .= "\nwhere 'cp' is $cp.";
7191 if ($ucd_accessible_name) {
7192 $comment .= " Note that $these_mappings";
7193 if ($has_underscore_name) {
7194 $comment .= " (except for the one(s) that begin with an underscore)";
7196 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7200 # And append any commentary already set from the actual property.
7201 $comment .= "\n\n" . $self->comment if $self->comment;
7202 if ($self->description) {
7203 $comment .= "\n\n" . join " ", $self->description;
7206 $comment .= "\n\n" . join " ", $self->note;
7210 if (! $self->perl_extension) {
7213 For information about what this property really means, see:
7214 $unicode_reference_url
7218 if ($count) { # Format differs for empty table
7219 $comment.= "\nThe format of the ";
7220 if ($self->range_size_1) {
7222 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7223 is in hex; MAPPING is what CODE_POINT maps to.
7228 # There are tables which end up only having one element per
7229 # range, but it is not worth keeping track of for making just
7230 # this comment a little better.
7232 non-comment portions of the main body of lines of this file is:
7233 START\\tSTOP\\tMAPPING where START is the starting code point of the
7234 range, in hex; STOP is the ending point, or if omitted, the range has just one
7235 code point; MAPPING is what each code point between START and STOP maps to.
7237 if ($self->output_range_counts) {
7239 Numbers in comments in [brackets] indicate how many code points are in the
7240 range (omitted when the range is a single code point or if the mapping is to
7246 $self->set_comment(main::join_lines($comment));
7250 my %swash_keys; # Makes sure don't duplicate swash names.
7252 # The remaining variables are temporaries used while writing each table,
7253 # to output special ranges.
7254 my @multi_code_point_maps; # Map is to more than one code point.
7256 sub handle_special_range {
7257 # Called in the middle of write when it finds a range it doesn't know
7262 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7264 my $addr = do { no overloading; pack 'J', $self; };
7266 my $type = $range->type;
7268 my $low = $range->start;
7269 my $high = $range->end;
7270 my $map = $range->value;
7272 # No need to output the range if it maps to the default.
7273 return if $map eq $default_map{$addr};
7275 my $property = $self->property;
7277 # Switch based on the map type...
7278 if ($type == $HANGUL_SYLLABLE) {
7280 # These are entirely algorithmically determinable based on
7281 # some constants furnished by Unicode; for now, just set a
7282 # flag to indicate that have them. After everything is figured
7283 # out, we will output the code that does the algorithm. (Don't
7284 # output them if not needed because we are suppressing this
7286 $has_hangul_syllables = 1 if $property->to_output_map;
7288 elsif ($type == $CP_IN_NAME) {
7290 # Code points whose name ends in their code point are also
7291 # algorithmically determinable, but need information about the map
7292 # to do so. Both the map and its inverse are stored in data
7293 # structures output in the file. They are stored in the mean time
7294 # in global lists The lists will be written out later into Name.pm,
7295 # which is created only if needed. In order to prevent duplicates
7296 # in the list, only add to them for one property, should multiple
7298 if ($needing_code_points_ending_in_code_point == 0) {
7299 $needing_code_points_ending_in_code_point = $property;
7301 if ($property == $needing_code_points_ending_in_code_point) {
7302 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7303 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7305 my $squeezed = $map =~ s/[-\s]+//gr;
7306 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7308 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7311 push @code_points_ending_in_code_point, { low => $low,
7317 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7319 # Multi-code point maps and null string maps have an entry
7320 # for each code point in the range. They use the same
7322 for my $code_point ($low .. $high) {
7324 # The pack() below can't cope with surrogates. XXX This may
7326 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7327 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
7331 # Generate the hash entries for these in the form that
7332 # utf8.c understands.
7336 foreach my $to (split " ", $map) {
7337 if ($to !~ /^$code_point_re$/) {
7338 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
7341 $tostr .= sprintf "\\x{%s}", $to;
7342 $to = CORE::hex $to;
7344 $to_name .= " + " if $to_name;
7345 $to_chr .= main::display_chr($to);
7346 main::populate_char_info($to)
7347 if ! defined $viacode[$to];
7348 $to_name .= $viacode[$to];
7352 # The unpack yields a list of the bytes that comprise the
7353 # UTF-8 of $code_point, which are each placed in \xZZ format
7354 # and output in the %s to map to $tostr, so the result looks
7356 # "\xC4\xB0" => "\x{0069}\x{0307}",
7357 my $utf8 = sprintf(qq["%s" => "$tostr",],
7358 join("", map { sprintf "\\x%02X", $_ }
7359 unpack("U0C*", chr $code_point)));
7361 # Add a comment so that a human reader can more easily
7362 # see what's going on.
7363 push @multi_code_point_maps,
7364 sprintf("%-45s # U+%04X", $utf8, $code_point);
7366 $multi_code_point_maps[-1] .= " => $map";
7369 main::populate_char_info($code_point)
7370 if ! defined $viacode[$code_point];
7371 $multi_code_point_maps[-1] .= " '"
7372 . main::display_chr($code_point)
7373 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7378 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
7385 # Returns the string that should be output in the file before the main
7386 # body of this table. It isn't called until the main body is
7387 # calculated, saving a pass. The string includes some hash entries
7388 # identifying the format of the body, and what the single value should
7389 # be for all ranges missing from it. It also includes any code points
7390 # which have map_types that don't go in the main table.
7393 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7395 my $addr = do { no overloading; pack 'J', $self; };
7397 my $name = $self->property->swash_name;
7399 # Currently there is nothing in the pre_body unless a swash is being
7401 return unless defined $name;
7403 if (defined $swash_keys{$name}) {
7404 Carp::my_carp(main::join_lines(<<END
7405 Already created a swash name '$name' for $swash_keys{$name}. This means that
7406 the same name desired for $self shouldn't be used. Bad News. This must be
7407 fixed before production use, but proceeding anyway
7411 $swash_keys{$name} = "$self";
7415 # Here we assume we were called after have gone through the whole
7416 # file. If we actually generated anything for each map type, add its
7417 # respective header and trailer
7418 my $specials_name = "";
7419 if (@multi_code_point_maps) {
7420 $specials_name = "utf8::ToSpec$name";
7423 # Some code points require special handling because their mappings are each to
7424 # multiple code points. These do not appear in the main body, but are defined
7425 # in the hash below.
7427 # Each key is the string of N bytes that together make up the UTF-8 encoding
7428 # for the code point. (i.e. the same as looking at the code point's UTF-8
7429 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
7430 \%$specials_name = (
7432 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7435 my $format = $self->format;
7439 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7440 if ($output_adjusted) {
7441 if ($specials_name) {
7443 # The mappings in the non-hash portion of this file must be modified to get the
7444 # correct values by adding the code point ordinal number to each one that is
7450 # The mappings must be modified to get the correct values by adding the code
7451 # point ordinal number to each one that is numeric.
7458 # The name this swash is to be known by, with the format of the mappings in
7459 # the main body of the table, and what all code points missing from this file
7461 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7463 if ($specials_name) {
7465 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7468 my $default_map = $default_map{$addr};
7470 # For $CODE_POINT default maps and using adjustments, instead the default
7472 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
7473 . (($output_adjusted && $default_map eq $CODE_POINT)
7478 if ($default_map eq $CODE_POINT) {
7479 $return .= ' # code point maps to itself';
7481 elsif ($default_map eq "") {
7482 $return .= ' # code point maps to the null string';
7486 $return .= $pre_body;
7492 # Write the table to the file.
7495 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7497 my $addr = do { no overloading; pack 'J', $self; };
7499 # Clear the temporaries
7500 undef @multi_code_point_maps;
7502 # Calculate the format of the table if not already done.
7503 my $format = $self->format;
7504 my $type = $self->property->type;
7505 my $default_map = $self->default_map;
7506 if (! defined $format) {
7507 if ($type == $BINARY) {
7509 # Don't bother checking the values, because we elsewhere
7510 # verify that a binary table has only 2 values.
7511 $format = $BINARY_FORMAT;
7514 my @ranges = $self->_range_list->ranges;
7516 # default an empty table based on its type and default map
7519 # But it turns out that the only one we can say is a
7520 # non-string (besides binary, handled above) is when the
7521 # table is a string and the default map is to a code point
7522 if ($type == $STRING && $default_map eq $CODE_POINT) {
7523 $format = $HEX_FORMAT;
7526 $format = $STRING_FORMAT;
7531 # Start with the most restrictive format, and as we find
7532 # something that doesn't fit with that, change to the next
7533 # most restrictive, and so on.
7534 $format = $DECIMAL_FORMAT;
7535 foreach my $range (@ranges) {
7536 next if $range->type != 0; # Non-normal ranges don't
7537 # affect the main body
7538 my $map = $range->value;
7539 if ($map ne $default_map) {
7540 last if $format eq $STRING_FORMAT; # already at
7543 $format = $INTEGER_FORMAT
7544 if $format eq $DECIMAL_FORMAT
7545 && $map !~ / ^ [0-9] $ /x;
7546 $format = $FLOAT_FORMAT
7547 if $format eq $INTEGER_FORMAT
7548 && $map !~ / ^ -? [0-9]+ $ /x;
7549 $format = $RATIONAL_FORMAT
7550 if $format eq $FLOAT_FORMAT
7551 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7552 $format = $HEX_FORMAT
7553 if ($format eq $RATIONAL_FORMAT
7555 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7556 # Assume a leading zero means hex,
7557 # even if all digits are 0-9
7558 || ($format eq $INTEGER_FORMAT
7559 && $map =~ /^0[0-9A-F]/);
7560 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7561 && $map =~ /[^0-9A-F]/;
7566 } # end of calculating format
7568 if ($default_map eq $CODE_POINT
7569 && $format ne $HEX_FORMAT
7570 && ! defined $self->format) # manual settings are always
7573 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7576 # If the output is to be adjusted, the format of the table that gets
7577 # output is actually 'a' or 'ax' instead of whatever it is stored
7579 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7580 if ($output_adjusted) {
7581 if ($default_map eq $CODE_POINT) {
7582 $format = $HEX_ADJUST_FORMAT;
7585 $format = $ADJUST_FORMAT;
7589 $self->_set_format($format);
7591 return $self->SUPER::write(
7593 $default_map); # don't write defaulteds
7596 # Accessors for the underlying list that should fail if locked.
7606 return if $self->carp_if_locked;
7607 return $self->_range_list->$sub(@_);
7610 } # End closure for Map_Table
7612 package Match_Table;
7613 use parent '-norequire', '_Base_Table';
7615 # A Match table is one which is a list of all the code points that have
7616 # the same property and property value, for use in \p{property=value}
7617 # constructs in regular expressions. It adds very little data to the base
7618 # structure, but many methods, as these lists can be combined in many ways to
7620 # There are only a few concepts added:
7621 # 1) Equivalents and Relatedness.
7622 # Two tables can match the identical code points, but have different names.
7623 # This always happens when there is a perl single form extension
7624 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
7625 # tables are set to be related, with the Perl extension being a child, and
7626 # the Unicode property being the parent.
7628 # It may be that two tables match the identical code points and we don't
7629 # know if they are related or not. This happens most frequently when the
7630 # Block and Script properties have the exact range. But note that a
7631 # revision to Unicode could add new code points to the script, which would
7632 # now have to be in a different block (as the block was filled, or there
7633 # would have been 'Unknown' script code points in it and they wouldn't have
7634 # been identical). So we can't rely on any two properties from Unicode
7635 # always matching the same code points from release to release, and thus
7636 # these tables are considered coincidentally equivalent--not related. When
7637 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
7638 # 'leader', and the others are 'equivalents'. This concept is useful
7639 # to minimize the number of tables written out. Only one file is used for
7640 # any identical set of code points, with entries in Heavy.pl mapping all
7641 # the involved tables to it.
7643 # Related tables will always be identical; we set them up to be so. Thus
7644 # if the Unicode one is deprecated, the Perl one will be too. Not so for
7645 # unrelated tables. Relatedness makes generating the documentation easier.
7648 # Like equivalents, two tables may be the inverses of each other, the
7649 # intersection between them is null, and the union is every Unicode code
7650 # point. The two tables that occupy a binary property are necessarily like
7651 # this. By specifying one table as the complement of another, we can avoid
7652 # storing it on disk (using the other table and performing a fast
7653 # transform), and some memory and calculations.
7655 # 3) Conflicting. It may be that there will eventually be name clashes, with
7656 # the same name meaning different things. For a while, there actually were
7657 # conflicts, but they have so far been resolved by changing Perl's or
7658 # Unicode's definitions to match the other, but when this code was written,
7659 # it wasn't clear that that was what was going to happen. (Unicode changed
7660 # because of protests during their beta period.) Name clashes are warned
7661 # about during compilation, and the documentation. The generated tables
7662 # are sane, free of name clashes, because the code suppresses the Perl
7663 # version. But manual intervention to decide what the actual behavior
7664 # should be may be required should this happen. The introductory comments
7665 # have more to say about this.
7667 sub standardize { return main::standardize($_[0]); }
7668 sub trace { return main::trace(@_); }
7673 main::setup_package();
7676 # The leader table of this one; initially $self.
7677 main::set_access('leader', \%leader, 'r');
7680 # An array of any tables that have this one as their leader
7681 main::set_access('equivalents', \%equivalents, 'readable_array');
7684 # The parent table to this one, initially $self. This allows us to
7685 # distinguish between equivalent tables that are related (for which this
7686 # is set to), and those which may not be, but share the same output file
7687 # because they match the exact same set of code points in the current
7689 main::set_access('parent', \%parent, 'r');
7692 # An array of any tables that have this one as their parent
7693 main::set_access('children', \%children, 'readable_array');
7696 # Array of any tables that would have the same name as this one with
7697 # a different meaning. This is used for the generated documentation.
7698 main::set_access('conflicting', \%conflicting, 'readable_array');
7701 # Set in the constructor for tables that are expected to match all code
7703 main::set_access('matches_all', \%matches_all, 'r');
7706 # Points to the complement that this table is expressed in terms of; 0 if
7708 main::set_access('complement', \%complement, 'r');
7715 # The property for which this table is a listing of property values.
7716 my $property = delete $args{'_Property'};
7718 my $name = delete $args{'Name'};
7719 my $full_name = delete $args{'Full_Name'};
7720 $full_name = $name if ! defined $full_name;
7723 my $initialize = delete $args{'Initialize'};
7724 my $matches_all = delete $args{'Matches_All'} || 0;
7725 my $format = delete $args{'Format'};
7726 # Rest of parameters passed on.
7728 my $range_list = Range_List->new(Initialize => $initialize,
7729 Owner => $property);
7731 my $complete = $full_name;
7732 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
7733 # but this helps debug if it
7735 # The complete name for a match table includes it's property in a
7736 # compound form 'property=table', except if the property is the
7737 # pseudo-property, perl, in which case it is just the single form,
7738 # 'table' (If you change the '=' must also change the ':' in lots of
7739 # places in this program that assume an equal sign)
7740 $complete = $property->full_name . "=$complete" if $property != $perl;
7742 my $self = $class->SUPER::new(%args,
7744 Complete_Name => $complete,
7745 Full_Name => $full_name,
7746 _Property => $property,
7747 _Range_List => $range_list,
7748 Format => $EMPTY_FORMAT,
7749 Write_As_Invlist => 1,
7751 my $addr = do { no overloading; pack 'J', $self; };
7753 $conflicting{$addr} = [ ];
7754 $equivalents{$addr} = [ ];
7755 $children{$addr} = [ ];
7756 $matches_all{$addr} = $matches_all;
7757 $leader{$addr} = $self;
7758 $parent{$addr} = $self;
7759 $complement{$addr} = 0;
7761 if (defined $format && $format ne $EMPTY_FORMAT) {
7762 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
7768 # See this program's beginning comment block about overloading these.
7771 qw("") => "_operator_stringify",
7775 return if $self->carp_if_locked;
7783 return $self->_range_list + $other;
7789 return $self->_range_list & $other;
7794 my $reversed = shift;
7797 Carp::my_carp_bug("Bad news. Can't cope with '"
7801 . "'. undef returned.");
7805 return if $self->carp_if_locked;
7807 my $addr = do { no overloading; pack 'J', $self; };
7811 # Change the range list of this table to be the
7813 $self->_set_range_list($self->_range_list
7816 else { # $other is just a simple value
7817 $self->add_range($other, $other);
7824 my $reversed = shift;
7827 Carp::my_carp_bug("Bad news. Can't cope with '"
7831 . "'. undef returned.");
7835 return if $self->carp_if_locked;
7836 $self->_set_range_list($self->_range_list & $other);
7839 '-' => sub { my $self = shift;
7841 my $reversed = shift;
7843 Carp::my_carp_bug("Bad news. Can't cope with '"
7847 . "'. undef returned.");
7851 return $self->_range_list - $other;
7853 '~' => sub { my $self = shift;
7854 return ~ $self->_range_list;
7858 sub _operator_stringify {
7861 my $name = $self->complete_name;
7862 return "Table '$name'";
7866 # Returns the range list associated with this table, which will be the
7867 # complement's if it has one.
7871 if (($complement = $self->complement) != 0) {
7872 return ~ $complement->_range_list;
7875 return $self->SUPER::_range_list;
7880 # Add a synonym for this table. See the comments in the base class
7884 # Rest of parameters passed on.
7886 $self->SUPER::add_alias($name, $self, @_);
7890 sub add_conflicting {
7891 # Add the name of some other object to the list of ones that name
7892 # clash with this match table.
7895 my $conflicting_name = shift; # The name of the conflicting object
7896 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
7897 my $conflicting_object = shift; # Optional, the conflicting object
7898 # itself. This is used to
7899 # disambiguate the text if the input
7900 # name is identical to any of the
7901 # aliases $self is known by.
7902 # Sometimes the conflicting object is
7903 # merely hypothetical, so this has to
7904 # be an optional parameter.
7905 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7907 my $addr = do { no overloading; pack 'J', $self; };
7909 # Check if the conflicting name is exactly the same as any existing
7910 # alias in this table (as long as there is a real object there to
7911 # disambiguate with).
7912 if (defined $conflicting_object) {
7913 foreach my $alias ($self->aliases) {
7914 if ($alias->name eq $conflicting_name) {
7916 # Here, there is an exact match. This results in
7917 # ambiguous comments, so disambiguate by changing the
7918 # conflicting name to its object's complete equivalent.
7919 $conflicting_name = $conflicting_object->complete_name;
7925 # Convert to the \p{...} final name
7926 $conflicting_name = "\\$p" . "{$conflicting_name}";
7929 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7931 push @{$conflicting{$addr}}, $conflicting_name;
7936 sub is_set_equivalent_to {
7937 # Return boolean of whether or not the other object is a table of this
7938 # type and has been marked equivalent to this one.
7942 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7944 return 0 if ! defined $other; # Can happen for incomplete early
7946 unless ($other->isa(__PACKAGE__)) {
7947 my $ref_other = ref $other;
7948 my $ref_self = ref $self;
7949 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
7953 # Two tables are equivalent if they have the same leader.
7955 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7959 sub set_equivalent_to {
7960 # Set $self equivalent to the parameter table.
7961 # The required Related => 'x' parameter is a boolean indicating
7962 # whether these tables are related or not. If related, $other becomes
7963 # the 'parent' of $self; if unrelated it becomes the 'leader'
7965 # Related tables share all characteristics except names; equivalents
7966 # not quite so many.
7967 # If they are related, one must be a perl extension. This is because
7968 # we can't guarantee that Unicode won't change one or the other in a
7969 # later release even if they are identical now.
7975 my $related = delete $args{'Related'};
7977 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7979 return if ! defined $other; # Keep on going; happens in some early
7982 if (! defined $related) {
7983 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7987 # If already are equivalent, no need to re-do it; if subroutine
7988 # returns null, it found an error, also do nothing
7989 my $are_equivalent = $self->is_set_equivalent_to($other);
7990 return if ! defined $are_equivalent || $are_equivalent;
7992 my $addr = do { no overloading; pack 'J', $self; };
7993 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7996 if ($current_leader->perl_extension) {
7997 if ($other->perl_extension) {
7998 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
8001 } elsif ($self->property != $other->property # Depending on
8007 && ! $other->perl_extension)
8009 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
8014 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
8015 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
8019 my $leader = do { no overloading; pack 'J', $current_leader; };
8020 my $other_addr = do { no overloading; pack 'J', $other; };
8022 # Any tables that are equivalent to or children of this table must now
8023 # instead be equivalent to or (children) to the new leader (parent),
8024 # still equivalent. The equivalency includes their matches_all info,
8025 # and for related tables, their fate and status.
8026 # All related tables are of necessity equivalent, but the converse
8027 # isn't necessarily true
8028 my $status = $other->status;
8029 my $status_info = $other->status_info;
8030 my $fate = $other->fate;
8031 my $matches_all = $matches_all{other_addr};
8032 my $caseless_equivalent = $other->caseless_equivalent;
8033 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
8034 next if $table == $other;
8035 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
8037 my $table_addr = do { no overloading; pack 'J', $table; };
8038 $leader{$table_addr} = $other;
8039 $matches_all{$table_addr} = $matches_all;
8040 $self->_set_range_list($other->_range_list);
8041 push @{$equivalents{$other_addr}}, $table;
8043 $parent{$table_addr} = $other;
8044 push @{$children{$other_addr}}, $table;
8045 $table->set_status($status, $status_info);
8047 # This reason currently doesn't get exposed outside; otherwise
8048 # would have to look up the parent's reason and use it instead.
8049 $table->set_fate($fate, "Parent's fate");
8051 $self->set_caseless_equivalent($caseless_equivalent);
8055 # Now that we've declared these to be equivalent, any changes to one
8056 # of the tables would invalidate that equivalency.
8062 sub set_complement {
8063 # Set $self to be the complement of the parameter table. $self is
8064 # locked, as what it contains should all come from the other table.
8070 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8072 if ($other->complement != 0) {
8073 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8076 my $addr = do { no overloading; pack 'J', $self; };
8077 $complement{$addr} = $other;
8082 sub add_range { # Add a range to the list for this table.
8084 # Rest of parameters passed on
8086 return if $self->carp_if_locked;
8087 return $self->_range_list->add_range(@_);
8092 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8094 # All match tables are to be used only by the Perl core.
8095 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8098 sub pre_body { # Does nothing for match tables.
8102 sub append_to_body { # Does nothing for match tables.
8110 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8112 $self->SUPER::set_fate($fate, $reason);
8114 # All children share this fate
8115 foreach my $child ($self->children) {
8116 $child->set_fate($fate, $reason);
8123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8125 return $self->SUPER::write(0); # No adjustments
8128 sub set_final_comment {
8129 # This creates a comment for the file that is to hold the match table
8130 # $self. It is somewhat convoluted to make the English read nicely,
8131 # but, heh, it's just a comment.
8132 # This should be called only with the leader match table of all the
8133 # ones that share the same file. It lists all such tables, ordered so
8134 # that related ones are together.
8136 return unless $debugging_build;
8138 my $leader = shift; # Should only be called on the leader table of
8139 # an equivalent group
8140 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8142 my $addr = do { no overloading; pack 'J', $leader; };
8144 if ($leader{$addr} != $leader) {
8145 Carp::my_carp_bug(<<END
8146 set_final_comment() must be called on a leader table, which $leader is not.
8147 It is equivalent to $leader{$addr}. No comment created
8153 # Get the number of code points matched by each of the tables in this
8154 # file, and add underscores for clarity.
8155 my $count = $leader->count;
8157 my $non_unicode_string;
8158 if ($count > $MAX_UNICODE_CODEPOINTS) {
8159 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8160 - $MAX_UNICODE_CODEPOINT);
8161 $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8164 $unicode_count = $count;
8165 $non_unicode_string = "";
8167 my $string_count = main::clarify_code_point_count($unicode_count);
8169 my $loose_count = 0; # how many aliases loosely matched
8170 my $compound_name = ""; # ? Are any names compound?, and if so, an
8172 my $properties_with_compound_names = 0; # count of these
8175 my %flags; # The status flags used in the file
8176 my $total_entries = 0; # number of entries written in the comment
8177 my $matches_comment = ""; # The portion of the comment about the
8179 my @global_comments; # List of all the tables' comments that are
8180 # there before this routine was called.
8181 my $has_ucd_alias = 0; # If there is an alias that is accessible via
8182 # Unicode::UCD. If not, then don't say it is
8185 # Get list of all the parent tables that are equivalent to this one
8186 # (including itself).
8187 my @parents = grep { $parent{main::objaddr $_} == $_ }
8188 main::uniques($leader, @{$equivalents{$addr}});
8189 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
8191 for my $parent (@parents) {
8193 my $property = $parent->property;
8195 # Special case 'N' tables in properties with two match tables when
8196 # the other is a 'Y' one. These are likely to be binary tables,
8197 # but not necessarily. In either case, \P{} will match the
8198 # complement of \p{}, and so if something is a synonym of \p, the
8199 # complement of that something will be the synonym of \P. This
8200 # would be true of any property with just two match tables, not
8201 # just those whose values are Y and N; but that would require a
8202 # little extra work, and there are none such so far in Unicode.
8203 my $perl_p = 'p'; # which is it? \p{} or \P{}
8204 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
8206 if (scalar $property->tables == 2
8207 && $parent == $property->table('N')
8208 && defined (my $yes = $property->table('Y')))
8210 my $yes_addr = do { no overloading; pack 'J', $yes; };
8212 = grep { $_->property == $perl }
8215 $parent{$yes_addr}->children);
8217 # But these synonyms are \P{} ,not \p{}
8221 my @description; # Will hold the table description
8222 my @note; # Will hold the table notes.
8223 my @conflicting; # Will hold the table conflicts.
8225 # Look at the parent, any yes synonyms, and all the children
8226 my $parent_addr = do { no overloading; pack 'J', $parent; };
8227 for my $table ($parent,
8229 @{$children{$parent_addr}})
8231 my $table_addr = do { no overloading; pack 'J', $table; };
8232 my $table_property = $table->property;
8234 # Tables are separated by a blank line to create a grouping.
8235 $matches_comment .= "\n" if $matches_comment;
8237 # The table is named based on the property and value
8238 # combination it is for, like script=greek. But there may be
8239 # a number of synonyms for each side, like 'sc' for 'script',
8240 # and 'grek' for 'greek'. Any combination of these is a valid
8241 # name for this table. In this case, there are three more,
8242 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
8243 # listing all possible combinations in the comment, we make
8244 # sure that each synonym occurs at least once, and add
8245 # commentary that the other combinations are possible.
8246 # Because regular expressions don't recognize things like
8247 # \p{jsn=}, only look at non-null right-hand-sides
8248 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8249 my @table_aliases = grep { $_->name ne "" } $table->aliases;
8251 # The alias lists above are already ordered in the order we
8252 # want to output them. To ensure that each synonym is listed,
8253 # we must use the max of the two numbers. But if there are no
8254 # legal synonyms (nothing in @table_aliases), then we don't
8256 my $listed_combos = (@table_aliases)
8257 ? main::max(scalar @table_aliases,
8258 scalar @property_aliases)
8260 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8262 my $property_had_compound_name = 0;
8264 for my $i (0 .. $listed_combos - 1) {
8267 # The current alias for the property is the next one on
8268 # the list, or if beyond the end, start over. Similarly
8269 # for the table (\p{prop=table})
8270 my $property_alias = $property_aliases
8271 [$i % @property_aliases]->name;
8272 my $table_alias_object = $table_aliases
8273 [$i % @table_aliases];
8274 my $table_alias = $table_alias_object->name;
8275 my $loose_match = $table_alias_object->loose_match;
8276 $has_ucd_alias |= $table_alias_object->ucd;
8278 if ($table_alias !~ /\D/) { # Clarify large numbers.
8279 $table_alias = main::clarify_number($table_alias)
8282 # Add a comment for this alias combination
8283 my $current_match_comment;
8284 if ($table_property == $perl) {
8285 $current_match_comment = "\\$perl_p"
8289 $current_match_comment
8290 = "\\p{$property_alias=$table_alias}";
8291 $property_had_compound_name = 1;
8294 # Flag any abnormal status for this table.
8295 my $flag = $property->status
8297 || $table_alias_object->status;
8298 if ($flag && $flag ne $PLACEHOLDER) {
8299 $flags{$flag} = $status_past_participles{$flag};
8304 # Pretty up the comment. Note the \b; it says don't make
8305 # this line a continuation.
8306 $matches_comment .= sprintf("\b%-1s%-s%s\n",
8309 $current_match_comment);
8310 } # End of generating the entries for this table.
8312 # Save these for output after this group of related tables.
8313 push @description, $table->description;
8314 push @note, $table->note;
8315 push @conflicting, $table->conflicting;
8317 # And this for output after all the tables.
8318 push @global_comments, $table->comment;
8320 # Compute an alternate compound name using the final property
8321 # synonym and the first table synonym with a colon instead of
8322 # the equal sign used elsewhere.
8323 if ($property_had_compound_name) {
8324 $properties_with_compound_names ++;
8325 if (! $compound_name || @property_aliases > 1) {
8326 $compound_name = $property_aliases[-1]->name
8328 . $table_aliases[0]->name;
8331 } # End of looping through all children of this table
8333 # Here have assembled in $matches_comment all the related tables
8334 # to the current parent (preceded by the same info for all the
8335 # previous parents). Put out information that applies to all of
8336 # the current family.
8339 # But output the conflicting information now, as it applies to
8341 my $conflicting = join ", ", @conflicting;
8343 $matches_comment .= <<END;
8345 Note that contrary to what you might expect, the above is NOT the same as
8347 $matches_comment .= "any of: " if @conflicting > 1;
8348 $matches_comment .= "$conflicting\n";
8352 $matches_comment .= "\n Meaning: "
8353 . join('; ', @description)
8357 $matches_comment .= "\n Note: "
8358 . join("\n ", @note)
8361 } # End of looping through all tables
8363 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8369 if ($unicode_count == 1) {
8371 $code_points = 'single code point';
8375 $code_points = "$string_count code points";
8380 if ($total_entries == 1) {
8383 $any_of_these = 'this'
8386 $synonyms = " any of the following regular expression constructs";
8387 $entries = 'entries';
8388 $any_of_these = 'any of these'
8392 if ($has_ucd_alias) {
8393 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8395 if ($has_unrelated) {
8397 This file is for tables that are not necessarily related: To conserve
8398 resources, every table that matches the identical set of code points in this
8399 version of Unicode uses this file. Each one is listed in a separate group
8400 below. It could be that the tables will match the same set of code points in
8401 other Unicode releases, or it could be purely coincidence that they happen to
8402 be the same in Unicode $unicode_version, and hence may not in other versions.
8408 foreach my $flag (sort keys %flags) {
8410 '$flag' below means that this form is $flags{$flag}.
8412 if ($flag eq $INTERNAL_ALIAS) {
8413 $comment .= "DO NOT USE!!!";
8416 $comment .= "Consult $pod_file.pod";
8423 if ($total_entries == 0) {
8424 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
8426 This file returns the $code_points in Unicode Version
8427 $unicode_version for
8428 $leader, but it is inaccessible through Perl regular expressions, as
8429 "\\p{prop=}" is not recognized.
8434 This file returns the $code_points in Unicode Version
8435 $unicode_version that
8439 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8440 including if adding or subtracting white space, underscore, and hyphen
8441 characters matters or doesn't matter, and other permissible syntactic
8442 variants. Upper/lower case distinctions never matter.
8446 if ($compound_name) {
8449 A colon can be substituted for the equals sign, and
8451 if ($properties_with_compound_names > 1) {
8453 within each group above,
8456 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8458 # Note the \b below, it says don't make that line a continuation.
8460 anything to the left of the equals (or colon) can be combined with anything to
8461 the right. Thus, for example,
8467 # And append any comment(s) from the actual tables. They are all
8468 # gathered here, so may not read all that well.
8469 if (@global_comments) {
8470 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8473 if ($count) { # The format differs if no code points, and needs no
8474 # explanation in that case
8475 if ($leader->write_as_invlist) {
8478 The first data line of this file begins with the letter V to indicate it is in
8479 inversion list format. The number following the V gives the number of lines
8480 remaining. Each of those remaining lines is a single number representing the
8481 starting code point of a range which goes up to but not including the number
8482 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8483 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8484 the property. The final line's range extends to the platform's infinity.
8489 The format of the lines of this file is:
8490 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8491 STOP is the ending point, or if omitted, the range has just one code point.
8494 if ($leader->output_range_counts) {
8496 Numbers in comments in [brackets] indicate how many code points are in the
8502 $leader->set_comment(main::join_lines($comment));
8506 # Accessors for the underlying list
8508 get_valid_code_point
8509 get_invalid_code_point
8517 return $self->_range_list->$sub(@_);
8520 } # End closure for Match_Table
8524 # The Property class represents a Unicode property, or the $perl
8525 # pseudo-property. It contains a map table initialized empty at construction
8526 # time, and for properties accessible through regular expressions, various
8527 # match tables, created through the add_match_table() method, and referenced
8528 # by the table('NAME') or tables() methods, the latter returning a list of all
8529 # of the match tables. Otherwise table operations implicitly are for the map
8532 # Most of the data in the property is actually about its map table, so it
8533 # mostly just uses that table's accessors for most methods. The two could
8534 # have been combined into one object, but for clarity because of their
8535 # differing semantics, they have been kept separate. It could be argued that
8536 # the 'file' and 'directory' fields should be kept with the map table.
8538 # Each property has a type. This can be set in the constructor, or in the
8539 # set_type accessor, but mostly it is figured out by the data. Every property
8540 # starts with unknown type, overridden by a parameter to the constructor, or
8541 # as match tables are added, or ranges added to the map table, the data is
8542 # inspected, and the type changed. After the table is mostly or entirely
8543 # filled, compute_type() should be called to finalize they analysis.
8545 # There are very few operations defined. One can safely remove a range from
8546 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8547 # table to this one, replacing any in the intersection of the two.
8549 sub standardize { return main::standardize($_[0]); }
8550 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8554 # This hash will contain as keys, all the aliases of all properties, and
8555 # as values, pointers to their respective property objects. This allows
8556 # quick look-up of a property from any of its names.
8557 my %alias_to_property_of;
8559 sub dump_alias_to_property_of {
8562 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8567 # This is a package subroutine, not called as a method.
8568 # If the single parameter is a literal '*' it returns a list of all
8569 # defined properties.
8570 # Otherwise, the single parameter is a name, and it returns a pointer
8571 # to the corresponding property object, or undef if none.
8573 # Properties can have several different names. The 'standard' form of
8574 # each of them is stored in %alias_to_property_of as they are defined.
8575 # But it's possible that this subroutine will be called with some
8576 # variant, so if the initial lookup fails, it is repeated with the
8577 # standardized form of the input name. If found, besides returning the
8578 # result, the input name is added to the list so future calls won't
8579 # have to do the conversion again.
8583 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8585 if (! defined $name) {
8586 Carp::my_carp_bug("Undefined input property. No action taken.");
8590 return main::uniques(values %alias_to_property_of) if $name eq '*';
8592 # Return cached result if have it.
8593 my $result = $alias_to_property_of{$name};
8594 return $result if defined $result;
8596 # Convert the input to standard form.
8597 my $standard_name = standardize($name);
8599 $result = $alias_to_property_of{$standard_name};
8600 return unless defined $result; # Don't cache undefs
8602 # Cache the result before returning it.
8603 $alias_to_property_of{$name} = $result;
8608 main::setup_package();
8611 # A pointer to the map table object for this property
8612 main::set_access('map', \%map);
8615 # The property's full name. This is a duplicate of the copy kept in the
8616 # map table, but is needed because stringify needs it during
8617 # construction of the map table, and then would have a chicken before egg
8619 main::set_access('full_name', \%full_name, 'r');
8622 # This hash will contain as keys, all the aliases of any match tables
8623 # attached to this property, and as values, the pointers to their
8624 # respective tables. This allows quick look-up of a table from any of its
8626 main::set_access('table_ref', \%table_ref);
8629 # The type of the property, $ENUM, $BINARY, etc
8630 main::set_access('type', \%type, 'r');
8633 # The filename where the map table will go (if actually written).
8634 # Normally defaulted, but can be overridden.
8635 main::set_access('file', \%file, 'r', 's');
8638 # The directory where the map table will go (if actually written).
8639 # Normally defaulted, but can be overridden.
8640 main::set_access('directory', \%directory, 's');
8642 my %pseudo_map_type;
8643 # This is used to affect the calculation of the map types for all the
8644 # ranges in the table. It should be set to one of the values that signify
8645 # to alter the calculation.
8646 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8648 my %has_only_code_point_maps;
8649 # A boolean used to help in computing the type of data in the map table.
8650 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8653 # A list of the first few distinct mappings this property has. This is
8654 # used to disambiguate between binary and enum property types, so don't
8655 # have to keep more than three.
8656 main::set_access('unique_maps', \%unique_maps);
8658 my %pre_declared_maps;
8659 # A boolean that gives whether the input data should declare all the
8660 # tables used, or not. If the former, unknown ones raise a warning.
8661 main::set_access('pre_declared_maps',
8662 \%pre_declared_maps, 'r', 's');
8665 # The only required parameter is the positionally first, name. All
8666 # other parameters are key => value pairs. See the documentation just
8667 # above for the meanings of the ones not passed directly on to the map
8668 # table constructor.
8671 my $name = shift || "";
8673 my $self = property_ref($name);
8674 if (defined $self) {
8675 my $options_string = join ", ", @_;
8676 $options_string = ". Ignoring options $options_string" if $options_string;
8677 Carp::my_carp("$self is already in use. Using existing one$options_string;");
8683 $self = bless \do { my $anonymous_scalar }, $class;
8684 my $addr = do { no overloading; pack 'J', $self; };
8686 $directory{$addr} = delete $args{'Directory'};
8687 $file{$addr} = delete $args{'File'};
8688 $full_name{$addr} = delete $args{'Full_Name'} || $name;
8689 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8690 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8691 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8692 # Starting in this release, property
8693 # values should be defined for all
8694 # properties, except those overriding this
8695 // $v_version ge v5.1.0;
8697 # Rest of parameters passed on.
8699 $has_only_code_point_maps{$addr} = 1;
8700 $table_ref{$addr} = { };
8701 $unique_maps{$addr} = { };
8703 $map{$addr} = Map_Table->new($name,
8704 Full_Name => $full_name{$addr},
8705 _Alias_Hash => \%alias_to_property_of,
8711 # See this program's beginning comment block about overloading the copy
8712 # constructor. Few operations are defined on properties, but a couple are
8713 # useful. It is safe to take the inverse of a property, and to remove a
8714 # single code point from it.
8717 qw("") => "_operator_stringify",
8718 "." => \&main::_operator_dot,
8719 ".=" => \&main::_operator_dot_equal,
8720 '==' => \&main::_operator_equal,
8721 '!=' => \&main::_operator_not_equal,
8722 '=' => sub { return shift },
8723 '-=' => "_minus_and_equal",
8726 sub _operator_stringify {
8727 return "Property '" . shift->full_name . "'";
8730 sub _minus_and_equal {
8731 # Remove a single code point from the map table of a property.
8735 my $reversed = shift;
8736 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8739 Carp::my_carp_bug("Bad news. Can't cope with a "
8741 . " argument to '-='. Subtraction ignored.");
8744 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
8745 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
8747 . " from a non-object. undef returned.");
8752 $map{pack 'J', $self}->delete_range($other, $other);
8757 sub add_match_table {
8758 # Add a new match table for this property, with name given by the
8759 # parameter. It returns a pointer to the table.
8765 my $addr = do { no overloading; pack 'J', $self; };
8767 my $table = $table_ref{$addr}{$name};
8768 my $standard_name = main::standardize($name);
8770 || (defined ($table = $table_ref{$addr}{$standard_name})))
8772 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
8773 $table_ref{$addr}{$name} = $table;
8778 # See if this is a perl extension, if not passed in.
8779 my $perl_extension = delete $args{'Perl_Extension'};
8781 = $self->perl_extension if ! defined $perl_extension;
8784 my $suppression_reason = "";
8785 if ($self->name =~ /^_/) {
8786 $fate = $SUPPRESSED;
8787 $suppression_reason = "Parent property is internal only";
8789 elsif ($self->fate >= $SUPPRESSED) {
8790 $fate = $self->fate;
8791 $suppression_reason = $why_suppressed{$self->complete_name};
8794 elsif ($name =~ /^_/) {
8795 $fate = $INTERNAL_ONLY;
8797 $table = Match_Table->new(
8799 Perl_Extension => $perl_extension,
8800 _Alias_Hash => $table_ref{$addr},
8803 Suppression_Reason => $suppression_reason,
8804 Status => $self->status,
8805 _Status_Info => $self->status_info,
8807 return unless defined $table;
8810 # Save the names for quick look up
8811 $table_ref{$addr}{$standard_name} = $table;
8812 $table_ref{$addr}{$name} = $table;
8814 # Perhaps we can figure out the type of this property based on the
8815 # fact of adding this match table. First, string properties don't
8816 # have match tables; second, a binary property can't have 3 match
8818 if ($type{$addr} == $UNKNOWN) {
8819 $type{$addr} = $NON_STRING;
8821 elsif ($type{$addr} == $STRING) {
8822 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
8823 $type{$addr} = $NON_STRING;
8825 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8826 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8827 if ($type{$addr} == $BINARY) {
8828 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
8830 $type{$addr} = $ENUM;
8837 sub delete_match_table {
8838 # Delete the table referred to by $2 from the property $1.
8841 my $table_to_remove = shift;
8842 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8844 my $addr = do { no overloading; pack 'J', $self; };
8846 # Remove all names that refer to it.
8847 foreach my $key (keys %{$table_ref{$addr}}) {
8848 delete $table_ref{$addr}{$key}
8849 if $table_ref{$addr}{$key} == $table_to_remove;
8852 $table_to_remove->DESTROY;
8857 # Return a pointer to the match table (with name given by the
8858 # parameter) associated with this property; undef if none.
8862 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8864 my $addr = do { no overloading; pack 'J', $self; };
8866 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
8868 # If quick look-up failed, try again using the standard form of the
8869 # input name. If that succeeds, cache the result before returning so
8870 # won't have to standardize this input name again.
8871 my $standard_name = main::standardize($name);
8872 return unless defined $table_ref{$addr}{$standard_name};
8874 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
8875 return $table_ref{$addr}{$name};
8879 # Return a list of pointers to all the match tables attached to this
8883 return main::uniques(values %{$table_ref{pack 'J', shift}});
8887 # Returns the directory the map table for this property should be
8888 # output in. If a specific directory has been specified, that has
8889 # priority; 'undef' is returned if the type isn't defined;
8890 # or $map_directory for everything else.
8892 my $addr = do { no overloading; pack 'J', shift; };
8894 return $directory{$addr} if defined $directory{$addr};
8895 return undef if $type{$addr} == $UNKNOWN;
8896 return $map_directory;
8900 # Return the name that is used to both:
8901 # 1) Name the file that the map table is written to.
8902 # 2) The name of swash related stuff inside that file.
8903 # The reason for this is that the Perl core historically has used
8904 # certain names that aren't the same as the Unicode property names.
8905 # To continue using these, $file is hard-coded in this file for those,
8906 # but otherwise the standard name is used. This is different from the
8907 # external_name, so that the rest of the files, like in lib can use
8908 # the standard name always, without regard to historical precedent.
8911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8913 my $addr = do { no overloading; pack 'J', $self; };
8915 # Swash names are used only on either
8916 # 1) legacy-only properties, because the formats for these are
8917 # unchangeable, and they have had these lines in them; or
8918 # 2) regular or internal-only map tables
8919 # 3) otherwise there should be no access to the
8920 # property map table from other parts of Perl.
8921 return if $map{$addr}->fate != $ORDINARY
8922 && $map{$addr}->fate != $LEGACY_ONLY
8923 && ! ($map{$addr}->name =~ /^_/
8924 && $map{$addr}->fate == $INTERNAL_ONLY);
8926 return $file{$addr} if defined $file{$addr};
8927 return $map{$addr}->external_name;
8930 sub to_create_match_tables {
8931 # Returns a boolean as to whether or not match tables should be
8932 # created for this property.
8935 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8937 # The whole point of this pseudo property is match tables.
8938 return 1 if $self == $perl;
8940 my $addr = do { no overloading; pack 'J', $self; };
8942 # Don't generate tables of code points that match the property values
8943 # of a string property. Such a list would most likely have many
8944 # property values, each with just one or very few code points mapping
8946 return 0 if $type{$addr} == $STRING;
8952 sub property_add_or_replace_non_nulls {
8953 # This adds the mappings in the property $other to $self. Non-null
8954 # mappings from $other override those in $self. It essentially merges
8955 # the two properties, with the second having priority except for null
8960 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8962 if (! $other->isa(__PACKAGE__)) {
8963 Carp::my_carp_bug("$other should be a "
8972 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8976 # Certain tables are not generally written out to files, but
8977 # Unicode::UCD has the intelligence to know that the file for $self
8978 # can be used to reconstruct those tables. This routine just changes
8979 # things so that UCD pod entries for those suppressed tables are
8980 # generated, so the fact that a proxy is used is invisible to the
8985 foreach my $property_name (@_) {
8986 my $ref = property_ref($property_name);
8987 next if $ref->to_output_map;
8988 $ref->set_fate($MAP_PROXIED);
8993 # Set the type of the property. Mostly this is figured out by the
8994 # data in the table. But this is used to set it explicitly. The
8995 # reason it is not a standard accessor is that when setting a binary
8996 # property, we need to make sure that all the true/false aliases are
8997 # present, as they were omitted in early Unicode releases.
9001 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9005 && $type != $FORCED_BINARY
9006 && $type != $STRING)
9008 Carp::my_carp("Unrecognized type '$type'. Type not set");
9012 { no overloading; $type{pack 'J', $self} = $type; }
9013 return if $type != $BINARY && $type != $FORCED_BINARY;
9015 my $yes = $self->table('Y');
9016 $yes = $self->table('Yes') if ! defined $yes;
9017 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9020 # Add aliases in order wanted, duplicates will be ignored. We use a
9021 # binary property present in all releases for its ordered lists of
9022 # true/false aliases. Note, that could run into problems in
9023 # outputting things in that we don't distinguish between the name and
9024 # full name of these. Hopefully, if the table was already created
9025 # before this code is executed, it was done with these set properly.
9026 my $bm = property_ref("Bidi_Mirrored");
9027 foreach my $alias ($bm->table("Y")->aliases) {
9028 $yes->add_alias($alias->name);
9030 my $no = $self->table('N');
9031 $no = $self->table('No') if ! defined $no;
9032 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9033 foreach my $alias ($bm->table("N")->aliases) {
9034 $no->add_alias($alias->name);
9041 # Add a map to the property's map table. This also keeps
9042 # track of the maps so that the property type can be determined from
9046 my $start = shift; # First code point in range
9047 my $end = shift; # Final code point in range
9048 my $map = shift; # What the range maps to.
9049 # Rest of parameters passed on.
9051 my $addr = do { no overloading; pack 'J', $self; };
9053 # If haven't the type of the property, gather information to figure it
9055 if ($type{$addr} == $UNKNOWN) {
9057 # If the map contains an interior blank or dash, or most other
9058 # nonword characters, it will be a string property. This
9059 # heuristic may actually miss some string properties. If so, they
9060 # may need to have explicit set_types called for them. This
9061 # happens in the Unihan properties.
9062 if ($map =~ / (?<= . ) [ -] (?= . ) /x
9063 || $map =~ / [^\w.\/\ -] /x)
9065 $self->set_type($STRING);
9067 # $unique_maps is used for disambiguating between ENUM and
9068 # BINARY later; since we know the property is not going to be
9069 # one of those, no point in keeping the data around
9070 undef $unique_maps{$addr};
9074 # Not necessarily a string. The final decision has to be
9075 # deferred until all the data are in. We keep track of if all
9076 # the values are code points for that eventual decision.
9077 $has_only_code_point_maps{$addr} &=
9078 $map =~ / ^ $code_point_re $/x;
9080 # For the purposes of disambiguating between binary and other
9081 # enumerations at the end, we keep track of the first three
9082 # distinct property values. Once we get to three, we know
9083 # it's not going to be binary, so no need to track more.
9084 if (scalar keys %{$unique_maps{$addr}} < 3) {
9085 $unique_maps{$addr}{main::standardize($map)} = 1;
9090 # Add the mapping by calling our map table's method
9091 return $map{$addr}->add_map($start, $end, $map, @_);
9095 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
9096 # should be called after the property is mostly filled with its maps.
9097 # We have been keeping track of what the property values have been,
9098 # and now have the necessary information to figure out the type.
9101 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9103 my $addr = do { no overloading; pack 'J', $self; };
9105 my $type = $type{$addr};
9107 # If already have figured these out, no need to do so again, but we do
9108 # a double check on ENUMS to make sure that a string property hasn't
9109 # improperly been classified as an ENUM, so continue on with those.
9110 return if $type == $STRING
9112 || $type == $FORCED_BINARY;
9114 # If every map is to a code point, is a string property.
9115 if ($type == $UNKNOWN
9116 && ($has_only_code_point_maps{$addr}
9117 || (defined $map{$addr}->default_map
9118 && $map{$addr}->default_map eq "")))
9120 $self->set_type($STRING);
9124 # Otherwise, it is to some sort of enumeration. (The case where
9125 # it is a Unicode miscellaneous property, and treated like a
9126 # string in this program is handled in add_map()). Distinguish
9127 # between binary and some other enumeration type. Of course, if
9128 # there are more than two values, it's not binary. But more
9129 # subtle is the test that the default mapping is defined means it
9130 # isn't binary. This in fact may change in the future if Unicode
9131 # changes the way its data is structured. But so far, no binary
9132 # properties ever have @missing lines for them, so the default map
9133 # isn't defined for them. The few properties that are two-valued
9134 # and aren't considered binary have the default map defined
9135 # starting in Unicode 5.0, when the @missing lines appeared; and
9136 # this program has special code to put in a default map for them
9137 # for earlier than 5.0 releases.
9139 || scalar keys %{$unique_maps{$addr}} > 2
9140 || defined $self->default_map)
9142 my $tables = $self->tables;
9143 my $count = $self->count;
9144 if ($verbosity && $tables > 500 && $tables/$count > .1) {
9145 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9147 $self->set_type($ENUM);
9150 $self->set_type($BINARY);
9153 undef $unique_maps{$addr}; # Garbage collect
9160 my $reason = shift; # Ignored unless suppressing
9161 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9163 my $addr = do { no overloading; pack 'J', $self; };
9164 if ($fate >= $SUPPRESSED) {
9165 $why_suppressed{$self->complete_name} = $reason;
9168 # Each table shares the property's fate, except that MAP_PROXIED
9169 # doesn't affect match tables
9170 $map{$addr}->set_fate($fate, $reason);
9171 if ($fate != $MAP_PROXIED) {
9172 foreach my $table ($map{$addr}, $self->tables) {
9173 $table->set_fate($fate, $reason);
9180 # Most of the accessors for a property actually apply to its map table.
9181 # Setup up accessor functions for those, referring to %map
9206 replacement_property
9231 # 'property' above is for symmetry, so that one can take
9232 # the property of a property and get itself, and so don't
9233 # have to distinguish between properties and tables in
9241 return $map{pack 'J', $self}->$sub(@_);
9251 # Converts an ordinal printable character value to a displayable string,
9252 # using a dotted circle to hold combining characters.
9256 return $chr if $ccc->table(0)->contains($ord);
9257 return "\x{25CC}$chr";
9261 # Returns lines of the input joined together, so that they can be folded
9263 # This causes continuation lines to be joined together into one long line
9264 # for folding. A continuation line is any line that doesn't begin with a
9265 # space or "\b" (the latter is stripped from the output). This is so
9266 # lines can be be in a HERE document so as to fit nicely in the terminal
9267 # width, but be joined together in one long line, and then folded with
9268 # indents, '#' prefixes, etc, properly handled.
9269 # A blank separates the joined lines except if there is a break; an extra
9270 # blank is inserted after a period ending a line.
9272 # Initialize the return with the first line.
9273 my ($return, @lines) = split "\n", shift;
9275 # If the first line is null, it was an empty line, add the \n back in
9276 $return = "\n" if $return eq "";
9278 # Now join the remainder of the physical lines.
9279 for my $line (@lines) {
9281 # An empty line means wanted a blank line, so add two \n's to get that
9282 # effect, and go to the next line.
9283 if (length $line == 0) {
9288 # Look at the last character of what we have so far.
9289 my $previous_char = substr($return, -1, 1);
9291 # And at the next char to be output.
9292 my $next_char = substr($line, 0, 1);
9294 if ($previous_char ne "\n") {
9296 # Here didn't end wth a nl. If the next char a blank or \b, it
9297 # means that here there is a break anyway. So add a nl to the
9299 if ($next_char eq " " || $next_char eq "\b") {
9300 $previous_char = "\n";
9301 $return .= $previous_char;
9304 # Add an extra space after periods.
9305 $return .= " " if $previous_char eq '.';
9308 # Here $previous_char is still the latest character to be output. If
9309 # it isn't a nl, it means that the next line is to be a continuation
9310 # line, with a blank inserted between them.
9311 $return .= " " if $previous_char ne "\n";
9314 substr($line, 0, 1) = "" if $next_char eq "\b";
9316 # And append this next line.
9323 sub simple_fold($;$$$) {
9324 # Returns a string of the input (string or an array of strings) folded
9325 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9327 # This is tailored for the kind of text written by this program,
9328 # especially the pod file, which can have very long names with
9329 # underscores in the middle, or words like AbcDefgHij.... We allow
9330 # breaking in the middle of such constructs if the line won't fit
9331 # otherwise. The break in such cases will come either just after an
9332 # underscore, or just before one of the Capital letters.
9334 local $to_trace = 0 if main::DEBUG;
9337 my $prefix = shift; # Optional string to prepend to each output
9339 $prefix = "" unless defined $prefix;
9341 my $hanging_indent = shift; # Optional number of spaces to indent
9342 # continuation lines
9343 $hanging_indent = 0 unless $hanging_indent;
9345 my $right_margin = shift; # Optional number of spaces to narrow the
9347 $right_margin = 0 unless defined $right_margin;
9349 # Call carp with the 'nofold' option to avoid it from trying to call us
9351 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
9353 # The space available doesn't include what's automatically prepended
9354 # to each line, or what's reserved on the right.
9355 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9356 # XXX Instead of using the 'nofold' perhaps better to look up the stack
9358 if (DEBUG && $hanging_indent >= $max) {
9359 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
9360 $hanging_indent = 0;
9363 # First, split into the current physical lines.
9365 if (ref $line) { # Better be an array, because not bothering to
9367 foreach my $line (@{$line}) {
9368 push @line, split /\n/, $line;
9372 @line = split /\n/, $line;
9375 #local $to_trace = 1 if main::DEBUG;
9376 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9378 # Look at each current physical line.
9379 for (my $i = 0; $i < @line; $i++) {
9380 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9381 #local $to_trace = 1 if main::DEBUG;
9382 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9384 # Remove prefix, because will be added back anyway, don't want
9386 $line[$i] =~ s/^$prefix//;
9388 # Remove trailing space
9389 $line[$i] =~ s/\s+\Z//;
9391 # If the line is too long, fold it.
9392 if (length $line[$i] > $max) {
9395 # Here needs to fold. Save the leading space in the line for
9397 $line[$i] =~ /^ ( \s* )/x;
9398 my $leading_space = $1;
9399 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9401 # If character at final permissible position is white space,
9402 # fold there, which will delete that white space
9403 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9404 $remainder = substr($line[$i], $max);
9405 $line[$i] = substr($line[$i], 0, $max - 1);
9409 # Otherwise fold at an acceptable break char closest to
9410 # the max length. Look at just the maximal initial
9411 # segment of the line
9412 my $segment = substr($line[$i], 0, $max - 1);
9414 /^ ( .{$hanging_indent} # Don't look before the
9416 \ * # Don't look in leading
9417 # blanks past the indent
9418 [^ ] .* # Find the right-most
9419 (?: # acceptable break:
9420 [ \s = ] # space or equal
9421 | - (?! [.0-9] ) # or non-unary minus.
9422 ) # $1 includes the character
9425 # Split into the initial part that fits, and remaining
9427 $remainder = substr($line[$i], length $1);
9429 trace $line[$i] if DEBUG && $to_trace;
9430 trace $remainder if DEBUG && $to_trace;
9433 # If didn't find a good breaking spot, see if there is a
9434 # not-so-good breaking spot. These are just after
9435 # underscores or where the case changes from lower to
9436 # upper. Use \a as a soft hyphen, but give up
9437 # and don't break the line if there is actually a \a
9438 # already in the input. We use an ascii character for the
9439 # soft-hyphen to avoid any attempt by miniperl to try to
9440 # access the files that this program is creating.
9441 elsif ($segment !~ /\a/
9442 && ($segment =~ s/_/_\a/g
9443 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9445 # Here were able to find at least one place to insert
9446 # our substitute soft hyphen. Find the right-most one
9447 # and replace it by a real hyphen.
9448 trace $segment if DEBUG && $to_trace;
9450 rindex($segment, "\a"),
9453 # Then remove the soft hyphen substitutes.
9454 $segment =~ s/\a//g;
9455 trace $segment if DEBUG && $to_trace;
9457 # And split into the initial part that fits, and
9458 # remainder of the line
9459 my $pos = rindex($segment, '-');
9460 $remainder = substr($line[$i], $pos);
9461 trace $remainder if DEBUG && $to_trace;
9462 $line[$i] = substr($segment, 0, $pos + 1);
9466 # Here we know if we can fold or not. If we can, $remainder
9467 # is what remains to be processed in the next iteration.
9468 if (defined $remainder) {
9469 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9471 # Insert the folded remainder of the line as a new element
9472 # of the array. (It may still be too long, but we will
9473 # deal with that next time through the loop.) Omit any
9474 # leading space in the remainder.
9475 $remainder =~ s/^\s+//;
9476 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9478 # But then indent by whichever is larger of:
9479 # 1) the leading space on the input line;
9480 # 2) the hanging indent.
9481 # This preserves indentation in the original line.
9482 my $lead = ($leading_space)
9483 ? length $leading_space
9485 $lead = max($lead, $hanging_indent);
9486 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9490 # Ready to output the line. Get rid of any trailing space
9491 # And prefix by the required $prefix passed in.
9492 $line[$i] =~ s/\s+$//;
9493 $line[$i] = "$prefix$line[$i]\n";
9494 } # End of looping through all the lines.
9496 return join "", @line;
9499 sub property_ref { # Returns a reference to a property object.
9500 return Property::property_ref(@_);
9503 sub force_unlink ($) {
9504 my $filename = shift;
9505 return unless file_exists($filename);
9506 return if CORE::unlink($filename);
9508 # We might need write permission
9509 chmod 0777, $filename;
9510 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
9515 # Given a filename and references to arrays of lines, write the lines of
9516 # each array to the file
9517 # Filename can be given as an arrayref of directory names
9519 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9522 my $use_utf8 = shift;
9524 # Get into a single string if an array, and get rid of, in Unix terms, any
9526 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9527 $file = File::Spec->canonpath($file);
9529 # If has directories, make sure that they all exist
9530 (undef, my $directories, undef) = File::Spec->splitpath($file);
9531 File::Path::mkpath($directories) if $directories && ! -d $directories;
9533 push @files_actually_output, $file;
9535 force_unlink ($file);
9538 if (not open $OUT, ">", $file) {
9539 Carp::my_carp("can't open $file for output. Skipping this file: $!");
9543 binmode $OUT, ":utf8" if $use_utf8;
9545 while (defined (my $lines_ref = shift)) {
9546 unless (@$lines_ref) {
9547 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9550 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9552 close $OUT or die Carp::my_carp("close '$file' failed: $!");
9554 print "$file written.\n" if $verbosity >= $VERBOSE;
9560 sub Standardize($) {
9561 # This converts the input name string into a standardized equivalent to
9565 unless (defined $name) {
9566 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
9570 # Remove any leading or trailing white space
9574 # Convert interior white space and hyphens into underscores.
9575 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9577 # Capitalize the letter following an underscore, and convert a sequence of
9578 # multiple underscores to a single one
9579 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9581 # And capitalize the first letter, but not for the special cjk ones.
9582 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9586 sub standardize ($) {
9587 # Returns a lower-cased standardized name, without underscores. This form
9588 # is chosen so that it can distinguish between any real versus superficial
9589 # Unicode name differences. It relies on the fact that Unicode doesn't
9590 # have interior underscores, white space, nor dashes in any
9591 # stricter-matched name. It should not be used on Unicode code point
9592 # names (the Name property), as they mostly, but not always follow these
9595 my $name = Standardize(shift);
9596 return if !defined $name;
9598 $name =~ s/ (?<= .) _ (?= . ) //xg;
9602 sub utf8_heavy_name ($$) {
9603 # Returns the name that utf8_heavy.pl will use to find a table. XXX
9604 # perhaps this function should be placed somewhere, like Heavy.pl so that
9605 # utf8_heavy can use it directly without duplicating code that can get
9610 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9612 my $property = $table->property;
9613 $property = ($property == $perl)
9614 ? "" # 'perl' is never explicitly stated
9615 : standardize($property->name) . '=';
9616 if ($alias->loose_match) {
9617 return $property . standardize($alias->name);
9620 return lc ($property . $alias->name);
9628 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
9629 %main::already_output = ();
9631 $main::simple_dumper_nesting = 0;
9634 # Like Simple Data::Dumper. Good enough for our needs. We can't use
9635 # the real thing as we have to run under miniperl.
9637 # It is designed so that on input it is at the beginning of a line,
9638 # and the final thing output in any call is a trailing ",\n".
9642 $indent = "" if ! $debugging_build || ! defined $indent;
9644 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9646 # nesting level is localized, so that as the call stack pops, it goes
9647 # back to the prior value.
9648 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9649 local %main::already_output = %main::already_output;
9650 $main::simple_dumper_nesting++;
9651 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9653 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9655 # Determine the indent for recursive calls.
9656 my $next_indent = $indent . $indent_increment;
9661 # Dump of scalar: just output it in quotes if not a number. To do
9662 # so we must escape certain characters, and therefore need to
9663 # operate on a copy to avoid changing the original
9665 $copy = $UNDEF unless defined $copy;
9667 # Quote non-integers (integers also have optional leading '-')
9668 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9670 # Escape apostrophe and backslash
9671 $copy =~ s/ ( ['\\] ) /\\$1/xg;
9674 $output = "$indent$copy,\n";
9678 # Keep track of cycles in the input, and refuse to infinitely loop
9679 my $addr = do { no overloading; pack 'J', $item; };
9680 if (defined $main::already_output{$addr}) {
9681 return "${indent}ALREADY OUTPUT: $item\n";
9683 $main::already_output{$addr} = $item;
9685 if (ref $item eq 'ARRAY') {
9688 if ($main::simple_dumper_nesting > 1) {
9690 $using_brackets = 1;
9693 $using_brackets = 0;
9696 # If the array is empty, put the closing bracket on the same
9697 # line. Otherwise, recursively add each array element
9703 for (my $i = 0; $i < @$item; $i++) {
9705 # Indent array elements one level
9706 $output .= &simple_dumper($item->[$i], $next_indent);
9707 next if ! $debugging_build;
9708 $output =~ s/\n$//; # Remove any trailing nl so
9709 $output .= " # [$i]\n"; # as to add a comment giving
9712 $output .= $indent; # Indent closing ']' to orig level
9714 $output .= ']' if $using_brackets;
9717 elsif (ref $item eq 'HASH') {
9722 # No surrounding braces at top level
9724 if ($main::simple_dumper_nesting > 1) {
9727 $body_indent = $next_indent;
9728 $next_indent .= $indent_increment;
9733 $body_indent = $indent;
9737 # Output hashes sorted alphabetically instead of apparently
9738 # random. Use caseless alphabetic sort
9739 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9741 if ($is_first_line) {
9745 $output .= "$body_indent";
9748 # The key must be a scalar, but this recursive call quotes
9750 $output .= &simple_dumper($key);
9752 # And change the trailing comma and nl to the hash fat
9753 # comma for clarity, and so the value can be on the same
9755 $output =~ s/,\n$/ => /;
9757 # Recursively call to get the value's dump.
9758 my $next = &simple_dumper($item->{$key}, $next_indent);
9760 # If the value is all on one line, remove its indent, so
9761 # will follow the => immediately. If it takes more than
9762 # one line, start it on a new line.
9763 if ($next !~ /\n.*\n/) {
9772 $output .= "$indent},\n" if $using_braces;
9774 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9775 $output = $indent . ref($item) . "\n";
9776 # XXX see if blessed
9778 elsif ($item->can('dump')) {
9780 # By convention in this program, objects furnish a 'dump'
9781 # method. Since not doing any output at this level, just pass
9782 # on the input indent
9783 $output = $item->dump($indent);
9786 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
9793 sub dump_inside_out {
9794 # Dump inside-out hashes in an object's state by converting them to a
9795 # regular hash and then calling simple_dumper on that.
9798 my $fields_ref = shift;
9800 my $addr = do { no overloading; pack 'J', $object; };
9803 foreach my $key (keys %$fields_ref) {
9804 $hash{$key} = $fields_ref->{$key}{$addr};
9807 return simple_dumper(\%hash, @_);
9811 # Overloaded '.' method that is common to all packages. It uses the
9812 # package's stringify method.
9816 my $reversed = shift;
9817 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9819 $other = "" unless defined $other;
9821 foreach my $which (\$self, \$other) {
9822 next unless ref $$which;
9823 if ($$which->can('_operator_stringify')) {
9824 $$which = $$which->_operator_stringify;
9827 my $ref = ref $$which;
9828 my $addr = do { no overloading; pack 'J', $$which; };
9829 $$which = "$ref ($addr)";
9837 sub _operator_dot_equal {
9838 # Overloaded '.=' method that is common to all packages.
9842 my $reversed = shift;
9843 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9845 $other = "" unless defined $other;
9848 return $other .= "$self";
9851 return "$self" . "$other";
9855 sub _operator_equal {
9856 # Generic overloaded '==' routine. To be equal, they must be the exact
9862 return 0 unless defined $other;
9863 return 0 unless ref $other;
9865 return $self == $other;
9868 sub _operator_not_equal {
9872 return ! _operator_equal($self, $other);
9875 sub substitute_PropertyAliases($) {
9876 # Deal with early releases that don't have the crucial PropertyAliases.txt
9879 my $file_object = shift;
9880 $file_object->insert_lines(get_old_property_aliases());
9882 process_PropertyAliases($file_object);
9886 sub process_PropertyAliases($) {
9887 # This reads in the PropertyAliases.txt file, which contains almost all
9888 # the character properties in Unicode and their equivalent aliases:
9889 # scf ; Simple_Case_Folding ; sfc
9891 # Field 0 is the preferred short name for the property.
9892 # Field 1 is the full name.
9893 # Any succeeding ones are other accepted names.
9896 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9898 # Add any cjk properties that may have been defined.
9899 $file->insert_lines(@cjk_properties);
9901 while ($file->next_line) {
9903 my @data = split /\s*;\s*/;
9905 my $full = $data[1];
9907 # This line is defective in early Perls. The property in Unihan.txt
9909 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
9910 push @data, qw(cjkRSUnicode kRSUnicode);
9913 my $this = Property->new($data[0], Full_Name => $full);
9915 $this->set_fate($SUPPRESSED, $why_suppressed{$full})
9916 if $why_suppressed{$full};
9918 # Start looking for more aliases after these two.
9919 for my $i (2 .. @data - 1) {
9920 $this->add_alias($data[$i]);
9925 my $scf = property_ref("Simple_Case_Folding");
9926 $scf->add_alias("scf");
9927 $scf->add_alias("sfc");
9932 sub finish_property_setup {
9933 # Finishes setting up after PropertyAliases.
9936 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9938 # This entry was missing from this file in earlier Unicode versions
9939 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9940 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9943 # These are used so much, that we set globals for them.
9944 $gc = property_ref('General_Category');
9945 $block = property_ref('Block');
9946 $script = property_ref('Script');
9947 $age = property_ref('Age');
9949 # Perl adds this alias.
9950 $gc->add_alias('Category');
9952 # Unicode::Normalize expects this file with this name and directory.
9953 $ccc = property_ref('Canonical_Combining_Class');
9955 $ccc->set_file('CombiningClass');
9956 $ccc->set_directory(File::Spec->curdir());
9959 # These two properties aren't actually used in the core, but unfortunately
9960 # the names just above that are in the core interfere with these, so
9961 # choose different names. These aren't a problem unless the map tables
9962 # for these files get written out.
9963 my $lowercase = property_ref('Lowercase');
9964 $lowercase->set_file('IsLower') if defined $lowercase;
9965 my $uppercase = property_ref('Uppercase');
9966 $uppercase->set_file('IsUpper') if defined $uppercase;
9968 # Set up the hard-coded default mappings, but only on properties defined
9970 foreach my $property (keys %default_mapping) {
9971 my $property_object = property_ref($property);
9972 next if ! defined $property_object;
9973 my $default_map = $default_mapping{$property};
9974 $property_object->set_default_map($default_map);
9976 # A map of <code point> implies the property is string.
9977 if ($property_object->type == $UNKNOWN
9978 && $default_map eq $CODE_POINT)
9980 $property_object->set_type($STRING);
9984 # The following use the Multi_Default class to create objects for
9987 # Bidi class has a complicated default, but the derived file takes care of
9988 # the complications, leaving just 'L'.
9989 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9990 property_ref('Bidi_Class')->set_default_map('L');
9995 # The derived file was introduced in 3.1.1. The values below are
9996 # taken from table 3-8, TUS 3.0
9998 'my $default = Range_List->new;
9999 $default->add_range(0x0590, 0x05FF);
10000 $default->add_range(0xFB1D, 0xFB4F);'
10003 # The defaults apply only to unassigned characters
10004 $default_R .= '$gc->table("Unassigned") & $default;';
10006 if ($v_version lt v3.0.0) {
10007 $default = Multi_Default->new(R => $default_R, 'L');
10011 # AL apparently not introduced until 3.0: TUS 2.x references are
10012 # not on-line to check it out
10014 'my $default = Range_List->new;
10015 $default->add_range(0x0600, 0x07BF);
10016 $default->add_range(0xFB50, 0xFDFF);
10017 $default->add_range(0xFE70, 0xFEFF);'
10020 # Non-character code points introduced in this release; aren't AL
10021 if ($v_version ge 3.1.0) {
10022 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10024 $default_AL .= '$gc->table("Unassigned") & $default';
10025 $default = Multi_Default->new(AL => $default_AL,
10029 property_ref('Bidi_Class')->set_default_map($default);
10032 # Joining type has a complicated default, but the derived file takes care
10033 # of the complications, leaving just 'U' (or Non_Joining), except the file
10035 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10036 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10037 property_ref('Joining_Type')->set_default_map('Non_Joining');
10041 # Otherwise, there are not one, but two possibilities for the
10042 # missing defaults: T and U.
10043 # The missing defaults that evaluate to T are given by:
10044 # T = Mn + Cf - ZWNJ - ZWJ
10045 # where Mn and Cf are the general category values. In other words,
10046 # any non-spacing mark or any format control character, except
10047 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10048 # WIDTH JOINER (joining type C).
10049 my $default = Multi_Default->new(
10050 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10052 property_ref('Joining_Type')->set_default_map($default);
10056 # Line break has a complicated default in early releases. It is 'Unknown'
10057 # for non-assigned code points; 'AL' for assigned.
10058 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10059 my $lb = property_ref('Line_Break');
10060 if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10061 $lb->set_default_map('Unknown');
10064 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10067 $lb->set_default_map($default);
10071 # For backwards compatibility with applications that may read the mapping
10072 # file directly (it was documented in 5.12 and 5.14 as being thusly
10073 # usable), keep it from being adjusted. (range_size_1 is
10074 # used to force the traditional format.)
10075 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10076 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10077 $nfkc_cf->set_range_size_1(1);
10079 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10080 $bmg->set_to_output_map($EXTERNAL_MAP);
10081 $bmg->set_range_size_1(1);
10084 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10089 sub get_old_property_aliases() {
10090 # Returns what would be in PropertyAliases.txt if it existed in very old
10091 # versions of Unicode. It was derived from the one in 3.2, and pared
10092 # down based on the data that was actually in the older releases.
10093 # An attempt was made to use the existence of files to mean inclusion or
10094 # not of various aliases, but if this was not sufficient, using version
10095 # numbers was resorted to.
10099 # These are to be used in all versions (though some are constructed by
10100 # this program if missing)
10101 push @return, split /\n/, <<'END';
10103 Bidi_M ; Bidi_Mirrored
10105 ccc ; Canonical_Combining_Class
10106 dm ; Decomposition_Mapping
10107 dt ; Decomposition_Type
10108 gc ; General_Category
10110 lc ; Lowercase_Mapping
10112 na1 ; Unicode_1_Name
10115 scf ; Simple_Case_Folding
10116 slc ; Simple_Lowercase_Mapping
10117 stc ; Simple_Titlecase_Mapping
10118 suc ; Simple_Uppercase_Mapping
10119 tc ; Titlecase_Mapping
10120 uc ; Uppercase_Mapping
10123 if (-e 'Blocks.txt') {
10124 push @return, "blk ; Block\n";
10126 if (-e 'ArabicShaping.txt') {
10127 push @return, split /\n/, <<'END';
10132 if (-e 'PropList.txt') {
10134 # This first set is in the original old-style proplist.
10135 push @return, split /\n/, <<'END';
10136 Bidi_C ; Bidi_Control
10144 Join_C ; Join_Control
10146 QMark ; Quotation_Mark
10147 Term ; Terminal_Punctuation
10148 WSpace ; White_Space
10150 # The next sets were added later
10151 if ($v_version ge v3.0.0) {
10152 push @return, split /\n/, <<'END';
10157 if ($v_version ge v3.0.1) {
10158 push @return, split /\n/, <<'END';
10159 NChar ; Noncharacter_Code_Point
10162 # The next sets were added in the new-style
10163 if ($v_version ge v3.1.0) {
10164 push @return, split /\n/, <<'END';
10165 OAlpha ; Other_Alphabetic
10166 OLower ; Other_Lowercase
10168 OUpper ; Other_Uppercase
10171 if ($v_version ge v3.1.1) {
10172 push @return, "AHex ; ASCII_Hex_Digit\n";
10175 if (-e 'EastAsianWidth.txt') {
10176 push @return, "ea ; East_Asian_Width\n";
10178 if (-e 'CompositionExclusions.txt') {
10179 push @return, "CE ; Composition_Exclusion\n";
10181 if (-e 'LineBreak.txt') {
10182 push @return, "lb ; Line_Break\n";
10184 if (-e 'BidiMirroring.txt') {
10185 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
10187 if (-e 'Scripts.txt') {
10188 push @return, "sc ; Script\n";
10190 if (-e 'DNormalizationProps.txt') {
10191 push @return, split /\n/, <<'END';
10192 Comp_Ex ; Full_Composition_Exclusion
10193 FC_NFKC ; FC_NFKC_Closure
10194 NFC_QC ; NFC_Quick_Check
10195 NFD_QC ; NFD_Quick_Check
10196 NFKC_QC ; NFKC_Quick_Check
10197 NFKD_QC ; NFKD_Quick_Check
10198 XO_NFC ; Expands_On_NFC
10199 XO_NFD ; Expands_On_NFD
10200 XO_NFKC ; Expands_On_NFKC
10201 XO_NFKD ; Expands_On_NFKD
10204 if (-e 'DCoreProperties.txt') {
10205 push @return, split /\n/, <<'END';
10208 XIDC ; XID_Continue
10211 # These can also appear in some versions of PropList.txt
10212 push @return, "Lower ; Lowercase\n"
10213 unless grep { $_ =~ /^Lower\b/} @return;
10214 push @return, "Upper ; Uppercase\n"
10215 unless grep { $_ =~ /^Upper\b/} @return;
10218 # This flag requires the DAge.txt file to be copied into the directory.
10219 if (DEBUG && $compare_versions) {
10220 push @return, 'age ; Age';
10226 sub substitute_PropValueAliases($) {
10227 # Deal with early releases that don't have the crucial
10228 # PropValueAliases.txt file.
10230 my $file_object = shift;
10231 $file_object->insert_lines(get_old_property_value_aliases());
10233 process_PropValueAliases($file_object);
10236 sub process_PropValueAliases {
10237 # This file contains values that properties look like:
10238 # bc ; AL ; Arabic_Letter
10239 # blk; n/a ; Greek_And_Coptic ; Greek
10241 # Field 0 is the property.
10242 # Field 1 is the short name of a property value or 'n/a' if no
10243 # short name exists;
10244 # Field 2 is the full property value name;
10245 # Any other fields are more synonyms for the property value.
10246 # Purely numeric property values are omitted from the file; as are some
10247 # others, fewer and fewer in later releases
10249 # Entries for the ccc property have an extra field before the
10251 # ccc; 0; NR ; Not_Reordered
10252 # It is the numeric value that the names are synonyms for.
10254 # There are comment entries for values missing from this file:
10255 # # @missing: 0000..10FFFF; ISO_Comment; <none>
10256 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10259 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10261 if ($v_version lt 4.0.0) {
10262 $file->insert_lines(split /\n/, <<'END'
10263 Hangul_Syllable_Type; L ; Leading_Jamo
10264 Hangul_Syllable_Type; LV ; LV_Syllable
10265 Hangul_Syllable_Type; LVT ; LVT_Syllable
10266 Hangul_Syllable_Type; NA ; Not_Applicable
10267 Hangul_Syllable_Type; T ; Trailing_Jamo
10268 Hangul_Syllable_Type; V ; Vowel_Jamo
10272 if ($v_version lt 4.1.0) {
10273 $file->insert_lines(split /\n/, <<'END'
10274 _Perl_GCB; CN ; Control
10276 _Perl_GCB; EX ; Extend
10280 _Perl_GCB; LVT ; LVT
10283 _Perl_GCB; XX ; Other
10289 # Add any explicit cjk values
10290 $file->insert_lines(@cjk_property_values);
10292 # This line is used only for testing the code that checks for name
10293 # conflicts. There is a script Inherited, and when this line is executed
10294 # it causes there to be a name conflict with the 'Inherited' that this
10295 # program generates for this block property value
10296 #$file->insert_lines('blk; n/a; Herited');
10298 # Process each line of the file ...
10299 while ($file->next_line) {
10301 # Fix typo in input file
10302 s/CCC133/CCC132/g if $v_version eq v6.1.0;
10304 my ($property, @data) = split /\s*;\s*/;
10306 # The ccc property has an extra field at the beginning, which is the
10307 # numeric value. Move it to be after the other two, mnemonic, fields,
10308 # so that those will be used as the property value's names, and the
10309 # number will be an extra alias. (Rightmost splice removes field 1-2,
10310 # returning them in a slice; left splice inserts that before anything,
10311 # thus shifting the former field 0 to after them.)
10312 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10314 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10315 my $new_style = $data[1] =~ s/-/_/gr;
10316 splice @data, 1, 0, $new_style;
10319 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
10320 # there is no short name, use the full one in element 1
10321 if ($data[0] eq "n/a") {
10322 $data[0] = $data[1];
10324 elsif ($data[0] ne $data[1]
10325 && standardize($data[0]) eq standardize($data[1])
10326 && $data[1] !~ /[[:upper:]]/)
10328 # Also, there is a bug in the file in which "n/a" is omitted, and
10329 # the two fields are identical except for case, and the full name
10330 # is all lower case. Copy the "short" name unto the full one to
10331 # give it some upper case.
10333 $data[1] = $data[0];
10336 # Earlier releases had the pseudo property 'qc' that should expand to
10337 # the ones that replace it below.
10338 if ($property eq 'qc') {
10339 if (lc $data[0] eq 'y') {
10340 $file->insert_lines('NFC_QC; Y ; Yes',
10342 'NFKC_QC; Y ; Yes',
10343 'NFKD_QC; Y ; Yes',
10346 elsif (lc $data[0] eq 'n') {
10347 $file->insert_lines('NFC_QC; N ; No',
10353 elsif (lc $data[0] eq 'm') {
10354 $file->insert_lines('NFC_QC; M ; Maybe',
10355 'NFKC_QC; M ; Maybe',
10359 $file->carp_bad_line("qc followed by unexpected '$data[0]");
10364 # The first field is the short name, 2nd is the full one.
10365 my $property_object = property_ref($property);
10366 my $table = $property_object->add_match_table($data[0],
10367 Full_Name => $data[1]);
10369 # Start looking for more aliases after these two.
10370 for my $i (2 .. @data - 1) {
10371 $table->add_alias($data[$i]);
10373 } # End of looping through the file
10375 # As noted in the comments early in the program, it generates tables for
10376 # the default values for all releases, even those for which the concept
10377 # didn't exist at the time. Here we add those if missing.
10378 if (defined $age && ! defined $age->table('Unassigned')) {
10379 $age->add_match_table('Unassigned');
10381 $block->add_match_table('No_Block') if -e 'Blocks.txt'
10382 && ! defined $block->table('No_Block');
10385 # Now set the default mappings of the properties from the file. This is
10386 # done after the loop because a number of properties have only @missings
10387 # entries in the file, and may not show up until the end.
10388 my @defaults = $file->get_missings;
10389 foreach my $default_ref (@defaults) {
10390 my $default = $default_ref->[0];
10391 my $property = property_ref($default_ref->[1]);
10392 $property->set_default_map($default);
10397 sub get_old_property_value_aliases () {
10398 # Returns what would be in PropValueAliases.txt if it existed in very old
10399 # versions of Unicode. It was derived from the one in 3.2, and pared
10400 # down. An attempt was made to use the existence of files to mean
10401 # inclusion or not of various aliases, but if this was not sufficient,
10402 # using version numbers was resorted to.
10404 my @return = split /\n/, <<'END';
10405 bc ; AN ; Arabic_Number
10406 bc ; B ; Paragraph_Separator
10407 bc ; CS ; Common_Separator
10408 bc ; EN ; European_Number
10409 bc ; ES ; European_Separator
10410 bc ; ET ; European_Terminator
10411 bc ; L ; Left_To_Right
10412 bc ; ON ; Other_Neutral
10413 bc ; R ; Right_To_Left
10414 bc ; WS ; White_Space
10416 Bidi_M; N; No; F; False
10417 Bidi_M; Y; Yes; T; True
10419 # The standard combining classes are very much different in v1, so only use
10420 # ones that look right (not checked thoroughly)
10421 ccc; 0; NR ; Not_Reordered
10422 ccc; 1; OV ; Overlay
10424 ccc; 8; KV ; Kana_Voicing
10425 ccc; 9; VR ; Virama
10426 ccc; 202; ATBL ; Attached_Below_Left
10427 ccc; 216; ATAR ; Attached_Above_Right
10428 ccc; 218; BL ; Below_Left
10429 ccc; 220; B ; Below
10430 ccc; 222; BR ; Below_Right
10432 ccc; 228; AL ; Above_Left
10433 ccc; 230; A ; Above
10434 ccc; 232; AR ; Above_Right
10435 ccc; 234; DA ; Double_Above
10437 dt ; can ; canonical
10441 dt ; fra ; fraction
10442 dt ; init ; initial
10443 dt ; iso ; isolated
10451 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
10453 gc ; Cn ; Unassigned
10454 gc ; Co ; Private_Use
10455 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
10456 gc ; LC ; Cased_Letter # Ll | Lt | Lu
10457 gc ; Ll ; Lowercase_Letter
10458 gc ; Lm ; Modifier_Letter
10459 gc ; Lo ; Other_Letter
10460 gc ; Lu ; Uppercase_Letter
10461 gc ; M ; Mark # Mc | Me | Mn
10462 gc ; Mc ; Spacing_Mark
10463 gc ; Mn ; Nonspacing_Mark
10464 gc ; N ; Number # Nd | Nl | No
10465 gc ; Nd ; Decimal_Number
10466 gc ; No ; Other_Number
10467 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
10468 gc ; Pd ; Dash_Punctuation
10469 gc ; Pe ; Close_Punctuation
10470 gc ; Po ; Other_Punctuation
10471 gc ; Ps ; Open_Punctuation
10472 gc ; S ; Symbol # Sc | Sk | Sm | So
10473 gc ; Sc ; Currency_Symbol
10474 gc ; Sm ; Math_Symbol
10475 gc ; So ; Other_Symbol
10476 gc ; Z ; Separator # Zl | Zp | Zs
10477 gc ; Zl ; Line_Separator
10478 gc ; Zp ; Paragraph_Separator
10479 gc ; Zs ; Space_Separator
10487 if (-e 'ArabicShaping.txt') {
10488 push @return, split /\n/, <<'END';
10495 jg ; n/a ; NO_JOINING_GROUP
10503 jt ; C ; Join_Causing
10504 jt ; D ; Dual_Joining
10505 jt ; L ; Left_Joining
10506 jt ; R ; Right_Joining
10507 jt ; U ; Non_Joining
10508 jt ; T ; Transparent
10510 if ($v_version ge v3.0.0) {
10511 push @return, split /\n/, <<'END';
10515 jg ; n/a ; DALATH_RISH
10518 jg ; n/a ; FINAL_SEMKATH
10521 jg ; n/a ; HAMZA_ON_HEH_GOAL
10524 jg ; n/a ; HEH_GOAL
10528 jg ; n/a ; KNOTTED_HEH
10535 jg ; n/a ; REVERSED_PE
10539 jg ; n/a ; SWASH_KAF
10541 jg ; n/a ; TEH_MARBUTA
10544 jg ; n/a ; YEH_BARREE
10545 jg ; n/a ; YEH_WITH_TAIL
10554 if (-e 'EastAsianWidth.txt') {
10555 push @return, split /\n/, <<'END';
10565 if (-e 'LineBreak.txt') {
10566 push @return, split /\n/, <<'END';
10567 lb ; AI ; Ambiguous
10568 lb ; AL ; Alphabetic
10569 lb ; B2 ; Break_Both
10570 lb ; BA ; Break_After
10571 lb ; BB ; Break_Before
10572 lb ; BK ; Mandatory_Break
10573 lb ; CB ; Contingent_Break
10574 lb ; CL ; Close_Punctuation
10575 lb ; CM ; Combining_Mark
10576 lb ; CR ; Carriage_Return
10577 lb ; EX ; Exclamation
10580 lb ; ID ; Ideographic
10581 lb ; IN ; Inseperable
10582 lb ; IS ; Infix_Numeric
10583 lb ; LF ; Line_Feed
10584 lb ; NS ; Nonstarter
10586 lb ; OP ; Open_Punctuation
10587 lb ; PO ; Postfix_Numeric
10588 lb ; PR ; Prefix_Numeric
10589 lb ; QU ; Quotation
10590 lb ; SA ; Complex_Context
10591 lb ; SG ; Surrogate
10593 lb ; SY ; Break_Symbols
10599 if (-e 'DNormalizationProps.txt') {
10600 push @return, split /\n/, <<'END';
10607 if (-e 'Scripts.txt') {
10608 push @return, split /\n/, <<'END';
10610 sc ; Armn ; Armenian
10611 sc ; Beng ; Bengali
10612 sc ; Bopo ; Bopomofo
10613 sc ; Cans ; Canadian_Aboriginal
10614 sc ; Cher ; Cherokee
10615 sc ; Cyrl ; Cyrillic
10616 sc ; Deva ; Devanagari
10617 sc ; Dsrt ; Deseret
10618 sc ; Ethi ; Ethiopic
10619 sc ; Geor ; Georgian
10622 sc ; Gujr ; Gujarati
10623 sc ; Guru ; Gurmukhi
10627 sc ; Hira ; Hiragana
10628 sc ; Ital ; Old_Italic
10629 sc ; Kana ; Katakana
10631 sc ; Knda ; Kannada
10634 sc ; Mlym ; Malayalam
10635 sc ; Mong ; Mongolian
10636 sc ; Mymr ; Myanmar
10639 sc ; Qaai ; Inherited
10641 sc ; Sinh ; Sinhala
10647 sc ; Tibt ; Tibetan
10653 if ($v_version ge v2.0.0) {
10654 push @return, split /\n/, <<'END';
10658 dt ; vert ; vertical
10662 gc ; Cs ; Surrogate
10663 gc ; Lt ; Titlecase_Letter
10664 gc ; Me ; Enclosing_Mark
10665 gc ; Nl ; Letter_Number
10666 gc ; Pc ; Connector_Punctuation
10667 gc ; Sk ; Modifier_Symbol
10670 if ($v_version ge v2.1.2) {
10671 push @return, "bc ; S ; Segment_Separator\n";
10673 if ($v_version ge v2.1.5) {
10674 push @return, split /\n/, <<'END';
10675 gc ; Pf ; Final_Punctuation
10676 gc ; Pi ; Initial_Punctuation
10679 if ($v_version ge v2.1.8) {
10680 push @return, "ccc; 240; IS ; Iota_Subscript\n";
10683 if ($v_version ge v3.0.0) {
10684 push @return, split /\n/, <<'END';
10685 bc ; AL ; Arabic_Letter
10686 bc ; BN ; Boundary_Neutral
10687 bc ; LRE ; Left_To_Right_Embedding
10688 bc ; LRO ; Left_To_Right_Override
10689 bc ; NSM ; Nonspacing_Mark
10690 bc ; PDF ; Pop_Directional_Format
10691 bc ; RLE ; Right_To_Left_Embedding
10692 bc ; RLO ; Right_To_Left_Override
10694 ccc; 233; DB ; Double_Below
10698 if ($v_version ge v3.1.0) {
10699 push @return, "ccc; 226; R ; Right\n";
10705 sub process_NormalizationsTest {
10707 # Each line looks like:
10708 # source code point; NFC; NFD; NFKC; NFKD
10710 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
10713 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10715 # Process each line of the file ...
10716 while ($file->next_line) {
10720 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10722 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10723 $$var = pack "U0U*", map { hex } split " ", $$var;
10724 $$var =~ s/(\\)/$1$1/g;
10727 push @normalization_tests,
10728 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
10729 } # End of looping through the file
10732 sub output_perl_charnames_line ($$) {
10734 # Output the entries in Perl_charnames specially, using 5 digits instead
10735 # of four. This makes the entries a constant length, and simplifies
10736 # charnames.pm which this table is for. Unicode can have 6 digit
10737 # ordinals, but they are all private use or noncharacters which do not
10738 # have names, so won't be in this table.
10740 return sprintf "%05X\t%s\n", $_[0], $_[1];
10745 # These are constants to the $property_info hash in this subroutine, to
10746 # avoid using a quoted-string which might have a typo.
10748 my $DEFAULT_MAP = 'default_map';
10749 my $DEFAULT_TABLE = 'default_table';
10750 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10751 my $MISSINGS = 'missings';
10753 sub process_generic_property_file {
10754 # This processes a file containing property mappings and puts them
10755 # into internal map tables. It should be used to handle any property
10756 # files that have mappings from a code point or range thereof to
10757 # something else. This means almost all the UCD .txt files.
10758 # each_line_handlers() should be set to adjust the lines of these
10759 # files, if necessary, to what this routine understands:
10762 # 003C..003E ; Math
10764 # the fields are: "codepoint-range ; property; map"
10766 # meaning the codepoints in the range all have the value 'map' under
10768 # Beginning and trailing white space in each field are not significant.
10769 # Note there is not a trailing semi-colon in the above. A trailing
10770 # semi-colon means the map is a null-string. An omitted map, as
10771 # opposed to a null-string, is assumed to be 'Y', based on Unicode
10772 # table syntax. (This could have been hidden from this routine by
10773 # doing it in the $file object, but that would require parsing of the
10774 # line there, so would have to parse it twice, or change the interface
10775 # to pass this an array. So not done.)
10777 # The map field may begin with a sequence of commands that apply to
10778 # this range. Each such command begins and ends with $CMD_DELIM.
10779 # These are used to indicate, for example, that the mapping for a
10780 # range has a non-default type.
10782 # This loops through the file, calling its next_line() method, and
10783 # then taking the map and adding it to the property's table.
10784 # Complications arise because any number of properties can be in the
10785 # file, in any order, interspersed in any way. The first time a
10786 # property is seen, it gets information about that property and
10787 # caches it for quick retrieval later. It also normalizes the maps
10788 # so that only one of many synonyms is stored. The Unicode input
10789 # files do use some multiple synonyms.
10792 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10794 my %property_info; # To keep track of what properties
10795 # have already had entries in the
10796 # current file, and info about each,
10797 # so don't have to recompute.
10798 my $property_name; # property currently being worked on
10799 my $property_type; # and its type
10800 my $previous_property_name = ""; # name from last time through loop
10801 my $property_object; # pointer to the current property's
10803 my $property_addr; # the address of that object
10804 my $default_map; # the string that code points missing
10805 # from the file map to
10806 my $default_table; # For non-string properties, a
10807 # reference to the match table that
10808 # will contain the list of code
10809 # points that map to $default_map.
10811 # Get the next real non-comment line
10813 while ($file->next_line) {
10815 # Default replacement type; means that if parts of the range have
10816 # already been stored in our tables, the new map overrides them if
10817 # they differ more than cosmetically
10818 my $replace = $IF_NOT_EQUIVALENT;
10819 my $map_type; # Default type for the map of this range
10821 #local $to_trace = 1 if main::DEBUG;
10822 trace $_ if main::DEBUG && $to_trace;
10824 # Split the line into components
10825 my ($range, $property_name, $map, @remainder)
10826 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10828 # If more or less on the line than we are expecting, warn and skip
10831 $file->carp_bad_line('Extra fields');
10834 elsif ( ! defined $property_name) {
10835 $file->carp_bad_line('Missing property');
10839 # Examine the range.
10840 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10842 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10846 my $high = (defined $2) ? hex $2 : $low;
10848 # If changing to a new property, get the things constant per
10850 if ($previous_property_name ne $property_name) {
10852 $property_object = property_ref($property_name);
10853 if (! defined $property_object) {
10854 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
10857 { no overloading; $property_addr = pack 'J', $property_object; }
10859 # Defer changing names until have a line that is acceptable
10860 # (the 'next' statement above means is unacceptable)
10861 $previous_property_name = $property_name;
10863 # If not the first time for this property, retrieve info about
10864 # it from the cache
10865 if (defined ($property_info{$property_addr}{$TYPE})) {
10866 $property_type = $property_info{$property_addr}{$TYPE};
10867 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10869 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10871 = $property_info{$property_addr}{$DEFAULT_TABLE};
10875 # Here, is the first time for this property. Set up the
10877 $property_type = $property_info{$property_addr}{$TYPE}
10878 = $property_object->type;
10880 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10881 = $property_object->pseudo_map_type;
10883 # The Unicode files are set up so that if the map is not
10884 # defined, it is a binary property
10885 if (! defined $map && $property_type != $BINARY) {
10886 if ($property_type != $UNKNOWN
10887 && $property_type != $NON_STRING)
10889 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
10892 $property_object->set_type($BINARY);
10894 = $property_info{$property_addr}{$TYPE}
10899 # Get any @missings default for this property. This
10900 # should precede the first entry for the property in the
10901 # input file, and is located in a comment that has been
10902 # stored by the Input_file class until we access it here.
10903 # It's possible that there is more than one such line
10904 # waiting for us; collect them all, and parse
10905 my @missings_list = $file->get_missings
10906 if $file->has_missings_defaults;
10907 foreach my $default_ref (@missings_list) {
10908 my $default = $default_ref->[0];
10909 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10911 # For string properties, the default is just what the
10912 # file says, but non-string properties should already
10913 # have set up a table for the default property value;
10914 # use the table for these, so can resolve synonyms
10915 # later to a single standard one.
10916 if ($property_type == $STRING
10917 || $property_type == $UNKNOWN)
10919 $property_info{$addr}{$MISSINGS} = $default;
10922 $property_info{$addr}{$MISSINGS}
10923 = $property_object->table($default);
10927 # Finished storing all the @missings defaults in the input
10928 # file so far. Get the one for the current property.
10929 my $missings = $property_info{$property_addr}{$MISSINGS};
10931 # But we likely have separately stored what the default
10932 # should be. (This is to accommodate versions of the
10933 # standard where the @missings lines are absent or
10934 # incomplete.) Hopefully the two will match. But check
10936 $default_map = $property_object->default_map;
10938 # If the map is a ref, it means that the default won't be
10939 # processed until later, so undef it, so next few lines
10940 # will redefine it to something that nothing will match
10941 undef $default_map if ref $default_map;
10943 # Create a $default_map if don't have one; maybe a dummy
10944 # that won't match anything.
10945 if (! defined $default_map) {
10947 # Use any @missings line in the file.
10948 if (defined $missings) {
10949 if (ref $missings) {
10950 $default_map = $missings->full_name;
10951 $default_table = $missings;
10954 $default_map = $missings;
10957 # And store it with the property for outside use.
10958 $property_object->set_default_map($default_map);
10962 # Neither an @missings nor a default map. Create
10963 # a dummy one, so won't have to test definedness
10964 # in the main loop.
10965 $default_map = '_Perl This will never be in a file
10970 # Here, we have $default_map defined, possibly in terms of
10971 # $missings, but maybe not, and possibly is a dummy one.
10972 if (defined $missings) {
10974 # Make sure there is no conflict between the two.
10975 # $missings has priority.
10976 if (ref $missings) {
10978 = $property_object->table($default_map);
10979 if (! defined $default_table
10980 || $default_table != $missings)
10982 if (! defined $default_table) {
10983 $default_table = $UNDEF;
10985 $file->carp_bad_line(<<END
10986 The \@missings line for $property_name in $file says that missings default to
10987 $missings, but we expect it to be $default_table. $missings used.
10990 $default_table = $missings;
10991 $default_map = $missings->full_name;
10993 $property_info{$property_addr}{$DEFAULT_TABLE}
10996 elsif ($default_map ne $missings) {
10997 $file->carp_bad_line(<<END
10998 The \@missings line for $property_name in $file says that missings default to
10999 $missings, but we expect it to be $default_map. $missings used.
11002 $default_map = $missings;
11006 $property_info{$property_addr}{$DEFAULT_MAP}
11009 # If haven't done so already, find the table corresponding
11010 # to this map for non-string properties.
11011 if (! defined $default_table
11012 && $property_type != $STRING
11013 && $property_type != $UNKNOWN)
11015 $default_table = $property_info{$property_addr}
11017 = $property_object->table($default_map);
11019 } # End of is first time for this property
11020 } # End of switching properties.
11022 # Ready to process the line.
11023 # The Unicode files are set up so that if the map is not defined,
11024 # it is a binary property with value 'Y'
11025 if (! defined $map) {
11030 # If the map begins with a special command to us (enclosed in
11031 # delimiters), extract the command(s).
11032 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11034 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
11037 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
11041 $file->carp_bad_line("Unknown command line: '$1'");
11047 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11050 # Here, we have a map to a particular code point, and the
11051 # default map is to a code point itself. If the range
11052 # includes the particular code point, change that portion of
11053 # the range to the default. This makes sure that in the final
11054 # table only the non-defaults are listed.
11055 my $decimal_map = hex $map;
11056 if ($low <= $decimal_map && $decimal_map <= $high) {
11058 # If the range includes stuff before or after the map
11059 # we're changing, split it and process the split-off parts
11061 if ($low < $decimal_map) {
11062 $file->insert_adjusted_lines(
11063 sprintf("%04X..%04X; %s; %s",
11069 if ($high > $decimal_map) {
11070 $file->insert_adjusted_lines(
11071 sprintf("%04X..%04X; %s; %s",
11077 $low = $high = $decimal_map;
11078 $map = $CODE_POINT;
11082 # If we can tell that this is a synonym for the default map, use
11083 # the default one instead.
11084 if ($property_type != $STRING
11085 && $property_type != $UNKNOWN)
11087 my $table = $property_object->table($map);
11088 if (defined $table && $table == $default_table) {
11089 $map = $default_map;
11093 # And figure out the map type if not known.
11094 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11095 if ($map eq "") { # Nulls are always $NULL map type
11097 } # Otherwise, non-strings, and those that don't allow
11098 # $MULTI_CP, and those that aren't multiple code points are
11101 (($property_type != $STRING && $property_type != $UNKNOWN)
11102 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11103 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
11108 $map_type = $MULTI_CP;
11112 $property_object->add_map($low, $high,
11115 Replace => $replace);
11116 } # End of loop through file's lines
11122 { # Closure for UnicodeData.txt handling
11124 # This file was the first one in the UCD; its design leads to some
11125 # awkwardness in processing. Here is a sample line:
11126 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11127 # The fields in order are:
11128 my $i = 0; # The code point is in field 0, and is shifted off.
11129 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
11130 my $CATEGORY = $i++; # category (e.g. "Lu")
11131 my $CCC = $i++; # Canonical combining class (e.g. "230")
11132 my $BIDI = $i++; # directional class (e.g. "L")
11133 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
11134 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
11135 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11136 # Dual-use in this program; see below
11137 my $NUMERIC = $i++; # numeric value
11138 my $MIRRORED = $i++; # ? mirrored
11139 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11140 my $COMMENT = $i++; # iso comment
11141 my $UPPER = $i++; # simple uppercase mapping
11142 my $LOWER = $i++; # simple lowercase mapping
11143 my $TITLE = $i++; # simple titlecase mapping
11144 my $input_field_count = $i;
11146 # This routine in addition outputs these extra fields:
11148 my $DECOMP_TYPE = $i++; # Decomposition type
11150 # These fields are modifications of ones above, and are usually
11151 # suppressed; they must come last, as for speed, the loop upper bound is
11152 # normally set to ignore them
11153 my $NAME = $i++; # This is the strict name field, not the one that
11155 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
11156 # by Unicode::Normalize
11157 my $last_field = $i - 1;
11159 # All these are read into an array for each line, with the indices defined
11160 # above. The empty fields in the example line above indicate that the
11161 # value is defaulted. The handler called for each line of the input
11162 # changes these to their defaults.
11164 # Here are the official names of the properties, in a parallel array:
11166 $field_names[$BIDI] = 'Bidi_Class';
11167 $field_names[$CATEGORY] = 'General_Category';
11168 $field_names[$CCC] = 'Canonical_Combining_Class';
11169 $field_names[$CHARNAME] = 'Perl_Charnames';
11170 $field_names[$COMMENT] = 'ISO_Comment';
11171 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11172 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11173 $field_names[$LOWER] = 'Lowercase_Mapping';
11174 $field_names[$MIRRORED] = 'Bidi_Mirrored';
11175 $field_names[$NAME] = 'Name';
11176 $field_names[$NUMERIC] = 'Numeric_Value';
11177 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11178 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11179 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11180 $field_names[$TITLE] = 'Titlecase_Mapping';
11181 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11182 $field_names[$UPPER] = 'Uppercase_Mapping';
11184 # Some of these need a little more explanation:
11185 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11186 # property, but is used in calculating the Numeric_Type. Perl however,
11187 # creates a file from this field, so a Perl property is created from it.
11188 # Similarly, the Other_Digit field is used only for calculating the
11189 # Numeric_Type, and so it can be safely re-used as the place to store
11190 # the value for Numeric_Type; hence it is referred to as
11191 # $NUMERIC_TYPE_OTHER_DIGIT.
11192 # The input field named $PERL_DECOMPOSITION is a combination of both the
11193 # decomposition mapping and its type. Perl creates a file containing
11194 # exactly this field, so it is used for that. The two properties are
11195 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11196 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
11197 # output it), as Perl doesn't use it directly.
11198 # The input field named here $CHARNAME is used to construct the
11199 # Perl_Charnames property, which is a combination of the Name property
11200 # (which the input field contains), and the Unicode_1_Name property, and
11201 # others from other files. Since, the strict Name property is not used
11202 # by Perl, this field is used for the table that Perl does use. The
11203 # strict Name property table is usually suppressed (unless the lists are
11204 # changed to output it), so it is accumulated in a separate field,
11205 # $NAME, which to save time is discarded unless the table is actually to
11208 # This file is processed like most in this program. Control is passed to
11209 # process_generic_property_file() which calls filter_UnicodeData_line()
11210 # for each input line. This filter converts the input into line(s) that
11211 # process_generic_property_file() understands. There is also a setup
11212 # routine called before any of the file is processed, and a handler for
11213 # EOF processing, all in this closure.
11215 # A huge speed-up occurred at the cost of some added complexity when these
11216 # routines were altered to buffer the outputs into ranges. Almost all the
11217 # lines of the input file apply to just one code point, and for most
11218 # properties, the map for the next code point up is the same as the
11219 # current one. So instead of creating a line for each property for each
11220 # input line, filter_UnicodeData_line() remembers what the previous map
11221 # of a property was, and doesn't generate a line to pass on until it has
11222 # to, as when the map changes; and that passed-on line encompasses the
11223 # whole contiguous range of code points that have the same map for that
11224 # property. This means a slight amount of extra setup, and having to
11225 # flush these buffers on EOF, testing if the maps have changed, plus
11226 # remembering state information in the closure. But it means a lot less
11227 # real time in not having to change the data base for each property on
11230 # Another complication is that there are already a few ranges designated
11231 # in the input. There are two lines for each, with the same maps except
11232 # the code point and name on each line. This was actually the hardest
11233 # thing to design around. The code points in those ranges may actually
11234 # have real maps not given by these two lines. These maps will either
11235 # be algorithmically determinable, or be in the extracted files furnished
11236 # with the UCD. In the event of conflicts between these extracted files,
11237 # and this one, Unicode says that this one prevails. But it shouldn't
11238 # prevail for conflicts that occur in these ranges. The data from the
11239 # extracted files prevails in those cases. So, this program is structured
11240 # so that those files are processed first, storing maps. Then the other
11241 # files are processed, generally overwriting what the extracted files
11242 # stored. But just the range lines in this input file are processed
11243 # without overwriting. This is accomplished by adding a special string to
11244 # the lines output to tell process_generic_property_file() to turn off the
11245 # overwriting for just this one line.
11246 # A similar mechanism is used to tell it that the map is of a non-default
11249 sub setup_UnicodeData { # Called before any lines of the input are read
11251 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11253 # Create a new property specially located that is a combination of
11254 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11255 # _Perl_Name_Alias properties. (The final one duplicates elements of the
11256 # first, and starting in v6.1, is the same as the 'Name_Alias
11257 # property.) A comment for the new property will later be constructed
11258 # based on the actual properties present and used
11259 $perl_charname = Property->new('Perl_Charnames',
11261 Directory => File::Spec->curdir(),
11263 Fate => $INTERNAL_ONLY,
11264 Perl_Extension => 1,
11265 Range_Size_1 => \&output_perl_charnames_line,
11268 $perl_charname->set_proxy_for('Name');
11270 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11271 Directory => File::Spec->curdir(),
11272 File => 'Decomposition',
11273 Format => $DECOMP_STRING_FORMAT,
11274 Fate => $INTERNAL_ONLY,
11275 Perl_Extension => 1,
11276 Default_Map => $CODE_POINT,
11278 # normalize.pm can't cope with these
11279 Output_Range_Counts => 0,
11281 # This is a specially formatted table
11282 # explicitly for normalize.pm, which
11283 # is expecting a particular format,
11284 # which means that mappings containing
11285 # multiple code points are in the main
11286 # body of the table
11287 Map_Type => $COMPUTE_NO_MULTI_CP,
11289 To_Output_Map => $INTERNAL_MAP,
11291 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11292 $Perl_decomp->add_comment(join_lines(<<END
11293 This mapping is a combination of the Unicode 'Decomposition_Type' and
11294 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
11295 identical to the official Unicode 'Decomposition_Mapping' property except for
11297 1) It omits the algorithmically determinable Hangul syllable decompositions,
11298 which normalize.pm handles algorithmically.
11299 2) It contains the decomposition type as well. Non-canonical decompositions
11300 begin with a word in angle brackets, like <super>, which denotes the
11301 compatible decomposition type. If the map does not begin with the <angle
11302 brackets>, the decomposition is canonical.
11306 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11308 Perl_Extension => 1,
11309 Directory => $map_directory,
11311 To_Output_Map => $OUTPUT_ADJUSTED,
11313 $Decimal_Digit->add_comment(join_lines(<<END
11314 This file gives the mapping of all code points which represent a single
11315 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11316 points, and the mapping of each non-initial element of each range is actually
11317 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11318 These code points are those that have Numeric_Type=Decimal; not special
11319 things, like subscripts nor Roman numerals.
11323 # These properties are not used for generating anything else, and are
11324 # usually not output. By making them last in the list, we can just
11325 # change the high end of the loop downwards to avoid the work of
11326 # generating a table(s) that is/are just going to get thrown away.
11327 if (! property_ref('Decomposition_Mapping')->to_output_map
11328 && ! property_ref('Name')->to_output_map)
11330 $last_field = min($NAME, $DECOMP_MAP) - 1;
11331 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11332 $last_field = $DECOMP_MAP;
11333 } elsif (property_ref('Name')->to_output_map) {
11334 $last_field = $NAME;
11339 my $first_time = 1; # ? Is this the first line of the file
11340 my $in_range = 0; # ? Are we in one of the file's ranges
11341 my $previous_cp; # hex code point of previous line
11342 my $decimal_previous_cp = -1; # And its decimal equivalent
11343 my @start; # For each field, the current starting
11344 # code point in hex for the range
11345 # being accumulated.
11346 my @fields; # The input fields;
11347 my @previous_fields; # And those from the previous call
11349 sub filter_UnicodeData_line {
11350 # Handle a single input line from UnicodeData.txt; see comments above
11351 # Conceptually this takes a single line from the file containing N
11352 # properties, and converts it into N lines with one property per line,
11353 # which is what the final handler expects. But there are
11354 # complications due to the quirkiness of the input file, and to save
11355 # time, it accumulates ranges where the property values don't change
11356 # and only emits lines when necessary. This is about an order of
11357 # magnitude fewer lines emitted.
11360 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11362 # $_ contains the input line.
11363 # -1 in split means retain trailing null fields
11364 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11366 #local $to_trace = 1 if main::DEBUG;
11367 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11368 if (@fields > $input_field_count) {
11369 $file->carp_bad_line('Extra fields');
11374 my $decimal_cp = hex $cp;
11376 # We have to output all the buffered ranges when the next code point
11377 # is not exactly one after the previous one, which means there is a
11378 # gap in the ranges.
11379 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11381 # The decomposition mapping field requires special handling. It looks
11384 # <compat> 0032 0020
11387 # The decomposition type is enclosed in <brackets>; if missing, it
11388 # means the type is canonical. There are two decomposition mapping
11389 # tables: the one for use by Perl's normalize.pm has a special format
11390 # which is this field intact; the other, for general use is of
11391 # standard format. In either case we have to find the decomposition
11392 # type. Empty fields have None as their type, and map to the code
11394 if ($fields[$PERL_DECOMPOSITION] eq "") {
11395 $fields[$DECOMP_TYPE] = 'None';
11396 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11399 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11400 =~ / < ( .+? ) > \s* ( .+ ) /x;
11401 if (! defined $fields[$DECOMP_TYPE]) {
11402 $fields[$DECOMP_TYPE] = 'Canonical';
11403 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11406 $fields[$DECOMP_MAP] = $map;
11410 # The 3 numeric fields also require special handling. The 2 digit
11411 # fields must be either empty or match the number field. This means
11412 # that if it is empty, they must be as well, and the numeric type is
11413 # None, and the numeric value is 'Nan'.
11414 # The decimal digit field must be empty or match the other digit
11415 # field. If the decimal digit field is non-empty, the code point is
11416 # a decimal digit, and the other two fields will have the same value.
11417 # If it is empty, but the other digit field is non-empty, the code
11418 # point is an 'other digit', and the number field will have the same
11419 # value as the other digit field. If the other digit field is empty,
11420 # but the number field is non-empty, the code point is a generic
11422 if ($fields[$NUMERIC] eq "") {
11423 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11424 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11426 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
11428 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11429 $fields[$NUMERIC] = 'NaN';
11432 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11433 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11434 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11435 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11436 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11438 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11439 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11440 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11443 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11445 # Rationals require extra effort.
11446 if ($fields[$NUMERIC] =~ qr{/}) {
11447 reduce_fraction(\$fields[$NUMERIC]);
11448 register_fraction($fields[$NUMERIC])
11453 # For the properties that have empty fields in the file, and which
11454 # mean something different from empty, change them to that default.
11455 # Certain fields just haven't been empty so far in any Unicode
11456 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11457 # $CATEGORY. This leaves just the two fields, and so we hard-code in
11458 # the defaults; which are very unlikely to ever change.
11459 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11460 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11462 # UAX44 says that if title is empty, it is the same as whatever upper
11464 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11466 # There are a few pairs of lines like:
11467 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11468 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11469 # that define ranges. These should be processed after the fields are
11470 # adjusted above, as they may override some of them; but mostly what
11471 # is left is to possibly adjust the $CHARNAME field. The names of all the
11472 # paired lines start with a '<', but this is also true of '<control>,
11473 # which isn't one of these special ones.
11474 if ($fields[$CHARNAME] eq '<control>') {
11476 # Some code points in this file have the pseudo-name
11477 # '<control>', but the official name for such ones is the null
11479 $fields[$NAME] = $fields[$CHARNAME] = "";
11481 # We had better not be in between range lines.
11483 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11487 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11489 # Here is a non-range line. We had better not be in between range
11492 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11495 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11497 # These are code points whose names end in their code points,
11498 # which means the names are algorithmically derivable from the
11499 # code points. To shorten the output Name file, the algorithm
11500 # for deriving these is placed in the file instead of each
11501 # code point, so they have map type $CP_IN_NAME
11502 $fields[$CHARNAME] = $CMD_DELIM
11507 . $fields[$CHARNAME];
11509 $fields[$NAME] = $fields[$CHARNAME];
11511 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11512 $fields[$CHARNAME] = $fields[$NAME] = $1;
11514 # Here we are at the beginning of a range pair.
11516 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
11520 # Because the properties in the range do not overwrite any already
11521 # in the db, we must flush the buffers of what's already there, so
11522 # they get handled in the normal scheme.
11526 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11527 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
11531 else { # Here, we are at the last line of a range pair.
11534 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
11540 $fields[$NAME] = $fields[$CHARNAME];
11542 # Check that the input is valid: that the closing of the range is
11543 # the same as the beginning.
11544 foreach my $i (0 .. $last_field) {
11545 next if $fields[$i] eq $previous_fields[$i];
11546 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
11549 # The processing differs depending on the type of range,
11550 # determined by its $CHARNAME
11551 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11553 # Check that the data looks right.
11554 if ($decimal_previous_cp != $SBase) {
11555 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
11557 if ($decimal_cp != $SBase + $SCount - 1) {
11558 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
11561 # The Hangul syllable range has a somewhat complicated name
11562 # generation algorithm. Each code point in it has a canonical
11563 # decomposition also computable by an algorithm. The
11564 # perl decomposition map table built from these is used only
11565 # by normalize.pm, which has the algorithm built in it, so the
11566 # decomposition maps are not needed, and are large, so are
11567 # omitted from it. If the full decomposition map table is to
11568 # be output, the decompositions are generated for it, in the
11569 # EOF handling code for this input file.
11571 $previous_fields[$DECOMP_TYPE] = 'Canonical';
11573 # This range is stored in our internal structure with its
11574 # own map type, different from all others.
11575 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11581 . $fields[$CHARNAME];
11583 elsif ($fields[$CHARNAME] =~ /^CJK/) {
11585 # The name for these contains the code point itself, and all
11586 # are defined to have the same base name, regardless of what
11587 # is in the file. They are stored in our internal structure
11588 # with a map type of $CP_IN_NAME
11589 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11595 . 'CJK UNIFIED IDEOGRAPH';
11598 elsif ($fields[$CATEGORY] eq 'Co'
11599 || $fields[$CATEGORY] eq 'Cs')
11601 # The names of all the code points in these ranges are set to
11602 # null, as there are no names for the private use and
11603 # surrogate code points.
11605 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11608 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
11611 # The first line of the range caused everything else to be output,
11612 # and then its values were stored as the beginning values for the
11613 # next set of ranges, which this one ends. Now, for each value,
11614 # add a command to tell the handler that these values should not
11615 # replace any existing ones in our database.
11616 foreach my $i (0 .. $last_field) {
11617 $previous_fields[$i] = $CMD_DELIM
11622 . $previous_fields[$i];
11625 # And change things so it looks like the entire range has been
11626 # gone through with this being the final part of it. Adding the
11627 # command above to each field will cause this range to be flushed
11628 # during the next iteration, as it guaranteed that the stored
11629 # field won't match whatever value the next one has.
11630 $previous_cp = $cp;
11631 $decimal_previous_cp = $decimal_cp;
11633 # We are now set up for the next iteration; so skip the remaining
11634 # code in this subroutine that does the same thing, but doesn't
11635 # know about these ranges.
11641 # On the very first line, we fake it so the code below thinks there is
11642 # nothing to output, and initialize so that when it does get output it
11643 # uses the first line's values for the lowest part of the range.
11644 # (One could avoid this by using peek(), but then one would need to
11645 # know the adjustments done above and do the same ones in the setup
11646 # routine; not worth it)
11649 @previous_fields = @fields;
11650 @start = ($cp) x scalar @fields;
11651 $decimal_previous_cp = $decimal_cp - 1;
11654 # For each field, output the stored up ranges that this code point
11655 # doesn't fit in. Earlier we figured out if all ranges should be
11656 # terminated because of changing the replace or map type styles, or if
11657 # there is a gap between this new code point and the previous one, and
11658 # that is stored in $force_output. But even if those aren't true, we
11659 # need to output the range if this new code point's value for the
11660 # given property doesn't match the stored range's.
11661 #local $to_trace = 1 if main::DEBUG;
11662 foreach my $i (0 .. $last_field) {
11663 my $field = $fields[$i];
11664 if ($force_output || $field ne $previous_fields[$i]) {
11666 # Flush the buffer of stored values.
11667 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11669 # Start a new range with this code point and its value
11671 $previous_fields[$i] = $field;
11675 # Set the values for the next time.
11676 $previous_cp = $cp;
11677 $decimal_previous_cp = $decimal_cp;
11679 # The input line has generated whatever adjusted lines are needed, and
11680 # should not be looked at further.
11685 sub EOF_UnicodeData {
11686 # Called upon EOF to flush the buffers, and create the Hangul
11687 # decomposition mappings if needed.
11690 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11692 # Flush the buffers.
11693 foreach my $i (0 .. $last_field) {
11694 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11697 if (-e 'Jamo.txt') {
11699 # The algorithm is published by Unicode, based on values in
11700 # Jamo.txt, (which should have been processed before this
11701 # subroutine), and the results left in %Jamo
11703 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
11707 # If the full decomposition map table is being output, insert
11708 # into it the Hangul syllable mappings. This is to avoid having
11709 # to publish a subroutine in it to compute them. (which would
11710 # essentially be this code.) This uses the algorithm published by
11711 # Unicode. (No hangul syllables in version 1)
11712 if ($v_version ge v2.0.0
11713 && property_ref('Decomposition_Mapping')->to_output_map) {
11714 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11716 my $SIndex = $S - $SBase;
11717 my $L = $LBase + $SIndex / $NCount;
11718 my $V = $VBase + ($SIndex % $NCount) / $TCount;
11719 my $T = $TBase + $SIndex % $TCount;
11721 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11722 my $decomposition = sprintf("%04X %04X", $L, $V);
11723 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11724 $file->insert_adjusted_lines(
11725 sprintf("%04X; Decomposition_Mapping; %s",
11735 sub filter_v1_ucd {
11736 # Fix UCD lines in version 1. This is probably overkill, but this
11737 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
11738 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
11739 # removed. This program retains them
11740 # 2) didn't include ranges, which it should have, and which are now
11741 # added in @corrected_lines below. It was hand populated by
11742 # taking the data from Version 2, verified by analyzing
11744 # 3) There is a syntax error in the entry for U+09F8 which could
11745 # cause problems for utf8_heavy, and so is changed. It's
11746 # numeric value was simply a minus sign, without any number.
11747 # (Eventually Unicode changed the code point to non-numeric.)
11748 # 4) The decomposition types often don't match later versions
11749 # exactly, and the whole syntax of that field is different; so
11750 # the syntax is changed as well as the types to their later
11751 # terminology. Otherwise normalize.pm would be very unhappy
11752 # 5) Many ccc classes are different. These are left intact.
11753 # 6) U+FF10..U+FF19 are missing their numeric values in all three
11754 # fields. These are unchanged because it doesn't really cause
11755 # problems for Perl.
11756 # 7) A number of code points, such as controls, don't have their
11757 # Unicode Version 1 Names in this file. These are added.
11758 # 8) A number of Symbols were marked as Lm. This changes those in
11759 # the Latin1 range, so that regexes work.
11760 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
11761 # referred to by their lc equivalents. Not fixed.
11763 my @corrected_lines = split /\n/, <<'END';
11764 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11765 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11766 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11767 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11768 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11769 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11773 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11775 #local $to_trace = 1 if main::DEBUG;
11776 trace $_ if main::DEBUG && $to_trace;
11778 # -1 => retain trailing null fields
11779 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11781 # At the first place that is wrong in the input, insert all the
11782 # corrections, replacing the wrong line.
11783 if ($code_point eq '4E00') {
11784 my @copy = @corrected_lines;
11786 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11788 $file->insert_lines(@copy);
11790 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11792 # There are no Lm characters in Latin1; these should be 'Sk', but
11793 # there isn't that in V1.
11794 $fields[$CATEGORY] = 'So';
11797 if ($fields[$NUMERIC] eq '-') {
11798 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
11801 if ($fields[$PERL_DECOMPOSITION] ne "") {
11803 # Several entries have this change to superscript 2 or 3 in the
11804 # middle. Convert these to the modern version, which is to use
11805 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11806 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11807 # 'HHHH HHHH 00B3 HHHH'.
11808 # It turns out that all of these that don't have another
11809 # decomposition defined at the beginning of the line have the
11810 # <square> decomposition in later releases.
11811 if ($code_point ne '00B2' && $code_point ne '00B3') {
11812 if ($fields[$PERL_DECOMPOSITION]
11813 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11815 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11816 $fields[$PERL_DECOMPOSITION] = '<square> '
11817 . $fields[$PERL_DECOMPOSITION];
11822 # If is like '<+circled> 0052 <-circled>', convert to
11824 $fields[$PERL_DECOMPOSITION] =~
11825 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11827 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11828 $fields[$PERL_DECOMPOSITION] =~
11829 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11830 or $fields[$PERL_DECOMPOSITION] =~
11831 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11832 or $fields[$PERL_DECOMPOSITION] =~
11833 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11834 or $fields[$PERL_DECOMPOSITION] =~
11835 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11837 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11838 $fields[$PERL_DECOMPOSITION] =~
11839 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
11841 # Change names to modern form.
11842 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
11843 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
11844 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
11845 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
11847 # One entry has weird braces
11848 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
11850 # One entry at U+2116 has an extra <sup>
11851 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
11854 $_ = join ';', $code_point, @fields;
11855 trace $_ if main::DEBUG && $to_trace;
11859 sub filter_bad_Nd_ucd {
11860 # Early versions specified a value in the decimal digit field even
11861 # though the code point wasn't a decimal digit. Clear the field in
11862 # that situation, so that the main code doesn't think it is a decimal
11865 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11866 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11867 $fields[$PERL_DECIMAL_DIGIT] = "";
11868 $_ = join ';', $code_point, @fields;
11873 my @U1_control_names = split /\n/, <<'END';
11878 END OF TRANSMISSION
11883 HORIZONTAL TABULATION
11885 VERTICAL TABULATION
11893 DEVICE CONTROL THREE
11894 DEVICE CONTROL FOUR
11895 NEGATIVE ACKNOWLEDGE
11897 END OF TRANSMISSION BLOCK
11907 BREAK PERMITTED HERE
11911 START OF SELECTED AREA
11912 END OF SELECTED AREA
11913 CHARACTER TABULATION SET
11914 CHARACTER TABULATION WITH JUSTIFICATION
11915 LINE TABULATION SET
11921 DEVICE CONTROL STRING
11927 START OF GUARDED AREA
11928 END OF GUARDED AREA
11930 SINGLE CHARACTER INTRODUCER
11931 CONTROL SEQUENCE INTRODUCER
11933 OPERATING SYSTEM COMMAND
11935 APPLICATION PROGRAM COMMAND
11938 sub filter_early_U1_names {
11939 # Very early versions did not have the Unicode_1_name field specified.
11940 # They differed in which ones were present; make sure a U1 name
11941 # exists, so that Unicode::UCD::charinfo will work
11943 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11946 # @U1_control names above are entirely positional, so we pull them out
11947 # in the exact order required, with gaps for the ones that don't have
11949 if ($code_point =~ /^00[01]/
11950 || $code_point eq '007F'
11951 || $code_point =~ /^008[2-9A-F]/
11952 || $code_point =~ /^009[0-8A-F]/)
11954 my $u1_name = shift @U1_control_names;
11955 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11956 $_ = join ';', $code_point, @fields;
11961 sub filter_v2_1_5_ucd {
11962 # A dozen entries in this 2.1.5 file had the mirrored and numeric
11963 # columns swapped; These all had mirrored be 'N'. So if the numeric
11964 # column appears to be N, swap it back.
11966 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11967 if ($fields[$NUMERIC] eq 'N') {
11968 $fields[$NUMERIC] = $fields[$MIRRORED];
11969 $fields[$MIRRORED] = 'N';
11970 $_ = join ';', $code_point, @fields;
11975 sub filter_v6_ucd {
11977 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11978 # it wasn't accepted, to allow for some deprecation cycles. This
11979 # function is not called after 5.16
11981 return if $_ !~ /^(?:0007|1F514|070F);/;
11983 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11984 if ($code_point eq '0007') {
11985 $fields[$CHARNAME] = "";
11987 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11988 # http://www.unicode.org/versions/corrigendum8.html
11989 $fields[$BIDI] = "AL";
11991 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11992 $fields[$CHARNAME] = "";
11995 $_ = join ';', $code_point, @fields;
11999 } # End closure for UnicodeData
12001 sub process_GCB_test {
12004 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12006 while ($file->next_line) {
12007 push @backslash_X_tests, $_;
12013 sub process_SB_test {
12016 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12018 while ($file->next_line) {
12019 push @SB_tests, $_;
12025 sub process_WB_test {
12028 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12030 while ($file->next_line) {
12031 push @WB_tests, $_;
12037 sub process_NamedSequences {
12038 # NamedSequences.txt entries are just added to an array. Because these
12039 # don't look like the other tables, they have their own handler.
12041 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12043 # This just adds the sequence to an array for later handling
12046 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12048 while ($file->next_line) {
12049 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12051 $file->carp_bad_line(
12052 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12056 # Note single \t in keeping with special output format of
12057 # Perl_charnames. But it turns out that the code points don't have to
12058 # be 5 digits long, like the rest, based on the internal workings of
12059 # charnames.pm. This could be easily changed for consistency.
12060 push @named_sequences, "$sequence\t$name";
12069 sub filter_early_ea_lb {
12070 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
12071 # third field be the name of the code point, which can be ignored in
12072 # most cases. But it can be meaningful if it marks a range:
12073 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12074 # 3400;W;<CJK Ideograph Extension A, First>
12076 # We need to see the First in the example above to know it's a range.
12077 # They did not use the later range syntaxes. This routine changes it
12078 # to use the modern syntax.
12079 # $1 is the Input_file object.
12081 my @fields = split /\s*;\s*/;
12082 if ($fields[2] =~ /^<.*, First>/) {
12083 $first_range = $fields[0];
12086 elsif ($fields[2] =~ /^<.*, Last>/) {
12087 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12090 undef $first_range;
12091 $_ = "$fields[0]; $fields[1]";
12098 sub filter_old_style_arabic_shaping {
12099 # Early versions used a different term for the later one.
12101 my @fields = split /\s*;\s*/;
12102 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12103 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
12104 $_ = join ';', @fields;
12109 my $lc; # Table for lowercase mapping
12112 my %special_casing_code_points;
12114 sub setup_special_casing {
12115 # SpecialCasing.txt contains the non-simple case change mappings. The
12116 # simple ones are in UnicodeData.txt, which should already have been
12117 # read in to the full property data structures, so as to initialize
12118 # these with the simple ones. Then the SpecialCasing.txt entries
12119 # add or overwrite the ones which have different full mappings.
12121 # This routine sees if the simple mappings are to be output, and if
12122 # so, copies what has already been put into the full mapping tables,
12123 # while they still contain only the simple mappings.
12125 # The reason it is done this way is that the simple mappings are
12126 # probably not going to be output, so it saves work to initialize the
12127 # full tables with the simple mappings, and then overwrite those
12128 # relatively few entries in them that have different full mappings,
12129 # and thus skip the simple mapping tables altogether.
12132 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12134 $lc = property_ref('lc');
12135 $tc = property_ref('tc');
12136 $uc = property_ref('uc');
12138 # For each of the case change mappings...
12139 foreach my $full_casing_table ($lc, $tc, $uc) {
12140 my $full_casing_name = $full_casing_table->name;
12141 my $full_casing_full_name = $full_casing_table->full_name;
12142 unless (defined $full_casing_table
12143 && ! $full_casing_table->is_empty)
12145 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
12148 # Create a table in the old-style format and with the original
12149 # file name for backwards compatibility with applications that
12150 # read it directly. The new tables contain both the simple and
12151 # full maps, and the old are missing simple maps when there is a
12152 # conflicting full one. Probably it would have been ok to add
12153 # those to the legacy version, as was already done in 5.14 to the
12154 # case folding one, but this was not done, out of an abundance of
12155 # caution. The tables are set up here before we deal with the
12156 # full maps so that as we handle those, we can override the simple
12157 # maps for them in the legacy table, and merely add them in the
12159 my $legacy = Property->new("Legacy_" . $full_casing_full_name,
12160 File => $full_casing_full_name
12161 =~ s/case_Mapping//r,
12162 Format => $HEX_FORMAT,
12163 Default_Map => $CODE_POINT,
12164 Initialize => $full_casing_table,
12165 Replacement_Property => $full_casing_full_name,
12168 $full_casing_table->add_comment(join_lines( <<END
12169 This file includes both the simple and full case changing maps. The simple
12170 ones are in the main body of the table below, and the full ones adding to or
12171 overriding them are in the hash.
12175 # The simple version's name in each mapping merely has an 's' in
12176 # front of the full one's
12177 my $simple_name = 's' . $full_casing_name;
12178 my $simple = property_ref($simple_name);
12179 $simple->initialize($full_casing_table) if $simple->to_output_map();
12185 sub filter_2_1_8_special_casing_line {
12187 # This version had duplicate entries in this file. Delete all but the
12189 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12191 if (exists $special_casing_code_points{$fields[0]}) {
12196 $special_casing_code_points{$fields[0]} = 1;
12197 filter_special_casing_line(@_);
12200 sub filter_special_casing_line {
12201 # Change the format of $_ from SpecialCasing.txt into something that
12202 # the generic handler understands. Each input line contains three
12203 # case mappings. This will generate three lines to pass to the
12204 # generic handler for each of those.
12206 # The input syntax (after stripping comments and trailing white space
12207 # is like one of the following (with the final two being entries that
12209 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12210 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12211 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12212 # Note the trailing semi-colon, unlike many of the input files. That
12213 # means that there will be an extra null field generated by the split
12216 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12218 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12221 # field #4 is when this mapping is conditional. If any of these get
12222 # implemented, it would be by hard-coding in the casing functions in
12223 # the Perl core, not through tables. But if there is a new condition
12224 # we don't know about, output a warning. We know about all the
12225 # conditions through 6.0
12226 if ($fields[4] ne "") {
12227 my @conditions = split ' ', $fields[4];
12228 if ($conditions[0] ne 'tr' # We know that these languages have
12229 # conditions, and some are multiple
12230 && $conditions[0] ne 'az'
12231 && $conditions[0] ne 'lt'
12233 # And, we know about a single condition Final_Sigma, but
12235 && ($v_version gt v5.2.0
12236 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12238 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12240 elsif ($conditions[0] ne 'Final_Sigma') {
12242 # Don't print out a message for Final_Sigma, because we
12243 # have hard-coded handling for it. (But the standard
12244 # could change what the rule should be, but it wouldn't
12245 # show up here anyway.
12247 print "# SKIPPING Special Casing: $_\n"
12248 if $verbosity >= $VERBOSE;
12253 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12254 $file->carp_bad_line('Extra fields');
12259 my $decimal_code_point = hex $fields[0];
12261 # Loop to handle each of the three mappings in the input line, in
12262 # order, with $i indicating the current field number.
12264 for my $object ($lc, $tc, $uc) {
12265 $i++; # First time through, $i = 0 ... 3rd time = 3
12267 my $value = $object->value_of($decimal_code_point);
12268 $value = ($value eq $CODE_POINT)
12269 ? $decimal_code_point
12272 # If this isn't a multi-character mapping, it should already have
12274 if ($fields[$i] !~ / /) {
12275 if ($value != hex $fields[$i]) {
12276 Carp::my_carp("Bad news. UnicodeData.txt thinks "
12278 . "(0x$fields[0]) is $value"
12279 . " and SpecialCasing.txt thinks it is "
12281 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
12286 # The mapping goes into both the legacy table, in which it
12287 # replaces the simple one...
12288 $file->insert_adjusted_lines("$fields[0]; Legacy_"
12289 . $object->full_name
12290 . "; $fields[$i]");
12292 # ... and the regular table, in which it is additional,
12293 # beyond the simple mapping.
12294 $file->insert_adjusted_lines("$fields[0]; "
12298 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12304 # Everything has been handled by the insert_adjusted_lines()
12311 sub filter_old_style_case_folding {
12312 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12313 # and later style. Different letters were used in the earlier.
12316 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12318 my @fields = split /\s*;\s*/;
12320 if ($fields[1] eq 'L') {
12321 $fields[1] = 'C'; # L => C always
12323 elsif ($fields[1] eq 'E') {
12324 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
12332 $file->carp_bad_line("Expecting L or E in second field");
12336 $_ = join("; ", @fields) . ';';
12340 { # Closure for case folding
12342 # Create the map for simple only if are going to output it, for otherwise
12343 # it takes no part in anything we do.
12344 my $to_output_simple;
12346 sub setup_case_folding($) {
12347 # Read in the case foldings in CaseFolding.txt. This handles both
12348 # simple and full case folding.
12351 = property_ref('Simple_Case_Folding')->to_output_map;
12353 if (! $to_output_simple) {
12354 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12357 # If we ever wanted to show that these tables were combined, a new
12358 # property method could be created, like set_combined_props()
12359 property_ref('Case_Folding')->add_comment(join_lines( <<END
12360 This file includes both the simple and full case folding maps. The simple
12361 ones are in the main body of the table below, and the full ones adding to or
12362 overriding them are in the hash.
12368 sub filter_case_folding_line {
12369 # Called for each line in CaseFolding.txt
12370 # Input lines look like:
12371 # 0041; C; 0061; # LATIN CAPITAL LETTER A
12372 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12373 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12375 # 'C' means that folding is the same for both simple and full
12376 # 'F' that it is only for full folding
12377 # 'S' that it is only for simple folding
12378 # 'T' is locale-dependent, and ignored
12379 # 'I' is a type of 'F' used in some early releases.
12380 # Note the trailing semi-colon, unlike many of the input files. That
12381 # means that there will be an extra null field generated by the split
12382 # below, which we ignore and hence is not an error.
12385 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12387 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12388 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12389 $file->carp_bad_line('Extra fields');
12394 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
12399 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12400 # I are all full foldings; S is single-char. For S, there is always
12401 # an F entry, so we must allow multiple values for the same code
12402 # point. Fortunately this table doesn't need further manipulation
12403 # which would preclude using multiple-values. The S is now included
12404 # so that _swash_inversion_hash() is able to construct closures
12405 # without having to worry about F mappings.
12406 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12407 $_ = "$range; Case_Folding; "
12408 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12412 $file->carp_bad_line('Expecting C F I S or T in second field');
12415 # C and S are simple foldings, but simple case folding is not needed
12416 # unless we explicitly want its map table output.
12417 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12418 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12424 } # End case fold closure
12426 sub filter_jamo_line {
12427 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
12428 # from this file that is used in generating the Name property for Jamo
12429 # code points. But, it also is used to convert early versions' syntax
12430 # into the modern form. Here are two examples:
12431 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
12432 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
12434 # The input is $_, the output is $_ filtered.
12436 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12438 # Let the caller handle unexpected input. In earlier versions, there was
12439 # a third field which is supposed to be a comment, but did not have a '#'
12441 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12443 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
12446 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
12447 $fields[1] = 'R' if $fields[0] eq '1105';
12449 # Add to structure so can generate Names from it.
12450 my $cp = hex $fields[0];
12451 my $short_name = $fields[1];
12452 $Jamo{$cp} = $short_name;
12453 if ($cp <= $LBase + $LCount) {
12454 $Jamo_L{$short_name} = $cp - $LBase;
12456 elsif ($cp <= $VBase + $VCount) {
12457 $Jamo_V{$short_name} = $cp - $VBase;
12459 elsif ($cp <= $TBase + $TCount) {
12460 $Jamo_T{$short_name} = $cp - $TBase;
12463 Carp::my_carp_bug("Unexpected Jamo code point in $_");
12467 # Reassemble using just the first two fields to look like a typical
12468 # property file line
12469 $_ = "$fields[0]; $fields[1]";
12474 sub register_fraction($) {
12475 # This registers the input rational number so that it can be passed on to
12476 # utf8_heavy.pl, both in rational and floating forms.
12478 my $rational = shift;
12480 my $float = eval $rational;
12481 $nv_floating_to_rational{$float} = $rational;
12485 sub gcd($$) { # Greatest-common-divisor; from
12486 # http://en.wikipedia.org/wiki/Euclidean_algorithm
12499 sub reduce_fraction($) {
12500 my $fraction_ref = shift;
12502 # Reduce a fraction to lowest terms. The Unicode data may be reducible,
12503 # hence this is needed. The argument is a reference to the
12504 # string denoting the fraction, which must be of the form:
12505 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12506 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
12511 my $numerator = $2;
12512 my $denominator = $3;
12516 # Find greatest common divisor
12517 my $gcd = gcd($numerator, $denominator);
12519 # And reduce using the gcd.
12521 $numerator /= $gcd;
12522 $denominator /= $gcd;
12523 $$fraction_ref = "$sign$numerator/$denominator";
12529 sub filter_numeric_value_line {
12530 # DNumValues contains lines of a different syntax than the typical
12532 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
12534 # This routine transforms $_ containing the anomalous syntax to the
12535 # typical, by filtering out the extra columns, and convert early version
12536 # decimal numbers to strings that look like rational numbers.
12539 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12541 # Starting in 5.1, there is a rational field. Just use that, omitting the
12542 # extra columns. Otherwise convert the decimal number in the second field
12543 # to a rational, and omit extraneous columns.
12544 my @fields = split /\s*;\s*/, $_, -1;
12547 if ($v_version ge v5.1.0) {
12548 if (@fields != 4) {
12549 $file->carp_bad_line('Not 4 semi-colon separated fields');
12553 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
12554 $rational = $fields[3];
12556 $_ = join '; ', @fields[ 0, 3 ];
12560 # Here, is an older Unicode file, which has decimal numbers instead of
12561 # rationals in it. Use the fraction to calculate the denominator and
12562 # convert to rational.
12564 if (@fields != 2 && @fields != 3) {
12565 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12570 my $codepoints = $fields[0];
12571 my $decimal = $fields[1];
12572 if ($decimal =~ s/\.0+$//) {
12574 # Anything ending with a decimal followed by nothing but 0's is an
12576 $_ = "$codepoints; $decimal";
12577 $rational = $decimal;
12582 if ($decimal =~ /\.50*$/) {
12586 # Here have the hardcoded repeating decimals in the fraction, and
12587 # the denominator they imply. There were only a few denominators
12588 # in the older Unicode versions of this file which this code
12589 # handles, so it is easy to convert them.
12591 # The 4 is because of a round-off error in the Unicode 3.2 files
12592 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12595 elsif ($decimal =~ /\.[27]50*$/) {
12598 elsif ($decimal =~ /\.[2468]0*$/) {
12601 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12604 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12607 if ($denominator) {
12608 my $sign = ($decimal < 0) ? "-" : "";
12609 my $numerator = int((abs($decimal) * $denominator) + .5);
12610 $rational = "$sign$numerator/$denominator";
12611 $_ = "$codepoints; $rational";
12614 $file->carp_bad_line("Can't cope with number '$decimal'.");
12621 register_fraction($rational) if $rational =~ qr{/};
12626 my %unihan_properties;
12628 sub construct_unihan {
12630 my $file_object = shift;
12632 return unless file_exists($file_object->file);
12634 if ($v_version lt v4.0.0) {
12635 push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
12636 push @cjk_property_values, split "\n", <<'END';
12637 # @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
12641 if ($v_version ge v3.0.0) {
12642 push @cjk_properties, split "\n", <<'END';
12643 cjkIRG_GSource; kIRG_GSource
12644 cjkIRG_JSource; kIRG_JSource
12645 cjkIRG_KSource; kIRG_KSource
12646 cjkIRG_TSource; kIRG_TSource
12647 cjkIRG_VSource; kIRG_VSource
12649 push @cjk_property_values, split "\n", <<'END';
12650 # @missing: 0000..10FFFF; cjkIRG_GSource; <none>
12651 # @missing: 0000..10FFFF; cjkIRG_JSource; <none>
12652 # @missing: 0000..10FFFF; cjkIRG_KSource; <none>
12653 # @missing: 0000..10FFFF; cjkIRG_TSource; <none>
12654 # @missing: 0000..10FFFF; cjkIRG_VSource; <none>
12657 if ($v_version ge v3.1.0) {
12658 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
12659 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
12661 if ($v_version ge v3.1.1) {
12662 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
12663 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
12665 if ($v_version ge v3.2.0) {
12666 push @cjk_properties, split "\n", <<'END';
12667 cjkAccountingNumeric; kAccountingNumeric
12668 cjkCompatibilityVariant; kCompatibilityVariant
12669 cjkOtherNumeric; kOtherNumeric
12670 cjkPrimaryNumeric; kPrimaryNumeric
12672 push @cjk_property_values, split "\n", <<'END';
12673 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
12674 # @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
12675 # @missing: 0000..10FFFF; cjkOtherNumeric; NaN
12676 # @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
12679 if ($v_version gt v4.0.0) {
12680 push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
12681 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
12684 if ($v_version ge v4.1.0) {
12685 push @cjk_properties, 'cjkIICore ; kIICore';
12686 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
12691 # Do any special setup for Unihan properties.
12693 # This property gives the wrong computed type, so override.
12694 my $usource = property_ref('kIRG_USource');
12695 $usource->set_type($STRING) if defined $usource;
12697 # This property is to be considered binary (it says so in
12698 # http://www.unicode.org/reports/tr38/)
12699 my $iicore = property_ref('kIICore');
12700 if (defined $iicore) {
12701 $iicore->set_type($FORCED_BINARY);
12702 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
12704 # Unicode doesn't include the maps for this property, so don't
12705 # warn that they are missing.
12706 $iicore->set_pre_declared_maps(0);
12707 $iicore->add_comment(join_lines( <<END
12708 This property contains string values, but any non-empty ones are considered to
12709 be 'core', so Perl creates tables for both: 1) its string values, plus 2)
12710 tables so that \\p{kIICore} matches any code point which has a non-empty
12711 value for this property.
12719 sub filter_unihan_line {
12720 # Change unihan db lines to look like the others in the db. Here is
12722 # U+341C kCangjie IEKN
12724 # Tabs are used instead of semi-colons to separate fields; therefore
12725 # they may have semi-colons embedded in them. Change these to periods
12726 # so won't screw up the rest of the code.
12729 # Remove lines that don't look like ones we accept.
12730 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12735 # Extract the property, and save a reference to its object.
12737 if (! exists $unihan_properties{$property}) {
12738 $unihan_properties{$property} = property_ref($property);
12741 # Don't do anything unless the property is one we're handling, which
12742 # we determine by seeing if there is an object defined for it or not
12743 if (! defined $unihan_properties{$property}) {
12748 # Convert the tab separators to our standard semi-colons, and convert
12749 # the U+HHHH notation to the rest of the standard's HHHH
12751 s/\b U \+ (?= $code_point_re )//xg;
12753 #local $to_trace = 1 if main::DEBUG;
12754 trace $_ if main::DEBUG && $to_trace;
12760 sub filter_blocks_lines {
12761 # In the Blocks.txt file, the names of the blocks don't quite match the
12762 # names given in PropertyValueAliases.txt, so this changes them so they
12763 # do match: Blanks and hyphens are changed into underscores. Also makes
12764 # early release versions look like later ones
12766 # $_ is transformed to the correct value.
12769 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12771 if ($v_version lt v3.2.0) {
12772 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12777 # Old versions used a different syntax to mark the range.
12778 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12781 my @fields = split /\s*;\s*/, $_, -1;
12782 if (@fields != 2) {
12783 $file->carp_bad_line("Expecting exactly two fields");
12788 # Change hyphens and blanks in the block name field only
12789 $fields[1] =~ s/[ -]/_/g;
12790 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
12792 $_ = join("; ", @fields);
12797 my $current_property;
12799 sub filter_old_style_proplist {
12800 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
12801 # was in a completely different syntax. Ken Whistler of Unicode says
12802 # that it was something he used as an aid for his own purposes, but
12803 # was never an official part of the standard. Many of the properties
12804 # in it were incorporated into the later PropList.txt, but some were
12805 # not. This program uses this early file to generate property tables
12806 # that are otherwise not accessible in the early UCD's. It does this
12807 # for the ones that eventually became official, and don't appear to be
12808 # too different in their contents from the later official version, and
12809 # throws away the rest. It could be argued that the ones it generates
12810 # were probably not really official at that time, so should be
12811 # ignored. You can easily modify things to skip all of them by
12812 # changing this function to just set $_ to "", and return; and to skip
12813 # certain of them by by simply removing their declarations from
12814 # get_old_property_aliases().
12816 # Here is a list of all the ones that are thrown away:
12817 # Alphabetic The definitions for this are very
12818 # defective, so better to not mislead
12819 # people into thinking it works.
12820 # Instead the Perl extension of the
12821 # same name is constructed from first
12823 # Bidi=* duplicates UnicodeData.txt
12824 # Combining never made into official property;
12826 # Composite never made into official property.
12827 # Currency Symbol duplicates UnicodeData.txt: gc=sc
12828 # Decimal Digit duplicates UnicodeData.txt: gc=nd
12829 # Delimiter never made into official property;
12831 # Format Control never made into official property;
12833 # High Surrogate duplicates Blocks.txt
12834 # Ignorable Control never made into official property;
12836 # ISO Control duplicates UnicodeData.txt: gc=cc
12837 # Left of Pair never made into official property;
12838 # Line Separator duplicates UnicodeData.txt: gc=zl
12839 # Low Surrogate duplicates Blocks.txt
12840 # Non-break was actually listed as a property
12841 # in 3.2, but without any code
12842 # points. Unicode denies that this
12843 # was ever an official property
12844 # Non-spacing duplicate UnicodeData.txt: gc=mn
12845 # Numeric duplicates UnicodeData.txt: gc=cc
12846 # Paired Punctuation never made into official property;
12847 # appears to be gc=ps + gc=pe
12848 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
12849 # Private Use duplicates UnicodeData.txt: gc=co
12850 # Private Use High Surrogate duplicates Blocks.txt
12851 # Punctuation duplicates UnicodeData.txt: gc=p
12852 # Space different definition than eventual
12854 # Titlecase duplicates UnicodeData.txt: gc=lt
12855 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
12856 # Zero-width never made into official property;
12858 # Most of the properties have the same names in this file as in later
12859 # versions, but a couple do not.
12861 # This subroutine filters $_, converting it from the old style into
12862 # the new style. Here's a sample of the old-style
12864 # *******************************************
12866 # Property dump for: 0x100000A0 (Join Control)
12868 # 200C..200D (2 chars)
12870 # In the example, the property is "Join Control". It is kept in this
12871 # closure between calls to the subroutine. The numbers beginning with
12872 # 0x were internal to Ken's program that generated this file.
12874 # If this line contains the property name, extract it.
12875 if (/^Property dump for: [^(]*\((.*)\)/) {
12878 # Convert white space to underscores.
12881 # Convert the few properties that don't have the same name as
12882 # their modern counterparts
12883 s/Identifier_Part/ID_Continue/
12884 or s/Not_a_Character/NChar/;
12886 # If the name matches an existing property, use it.
12887 if (defined property_ref($_)) {
12888 trace "new property=", $_ if main::DEBUG && $to_trace;
12889 $current_property = $_;
12891 else { # Otherwise discard it
12892 trace "rejected property=", $_ if main::DEBUG && $to_trace;
12893 undef $current_property;
12895 $_ = ""; # The property is saved for the next lines of the
12896 # file, but this defining line is of no further use,
12897 # so clear it so that the caller won't process it
12900 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
12902 # Here, the input line isn't a header defining a property for the
12903 # following section, and either we aren't in such a section, or
12904 # the line doesn't look like one that defines the code points in
12905 # such a section. Ignore this line.
12910 # Here, we have a line defining the code points for the current
12911 # stashed property. Anything starting with the first blank is
12912 # extraneous. Otherwise, it should look like a normal range to
12913 # the caller. Append the property name so that it looks just like
12914 # a modern PropList entry.
12917 $_ .= "; $current_property";
12919 trace $_ if main::DEBUG && $to_trace;
12922 } # End closure for old style proplist
12924 sub filter_old_style_normalization_lines {
12925 # For early releases of Unicode, the lines were like:
12926 # 74..2A76 ; NFKD_NO
12927 # For later releases this became:
12928 # 74..2A76 ; NFKD_QC; N
12929 # Filter $_ to look like those in later releases.
12930 # Similarly for MAYBEs
12932 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
12934 # Also, the property FC_NFKC was abbreviated to FNC
12939 sub setup_script_extensions {
12940 # The Script_Extensions property starts out with a clone of the Script
12943 my $scx = property_ref("Script_Extensions");
12944 $scx = Property->new("scx", Full_Name => "Script_Extensions")
12946 $scx->_set_format($STRING_WHITE_SPACE_LIST);
12947 $scx->initialize($script);
12948 $scx->set_default_map($script->default_map);
12949 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
12950 $scx->add_comment(join_lines( <<END
12951 The values for code points that appear in one script are just the same as for
12952 the 'Script' property. Likewise the values for those that appear in many
12953 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
12954 values of code points that appear in a few scripts are a space separated list
12959 # Initialize scx's tables and the aliases for them to be the same as sc's
12960 foreach my $table ($script->tables) {
12961 my $scx_table = $scx->add_match_table($table->name,
12962 Full_Name => $table->full_name);
12963 foreach my $alias ($table->aliases) {
12964 $scx_table->add_alias($alias->name);
12969 sub filter_script_extensions_line {
12970 # The Scripts file comes with the full name for the scripts; the
12971 # ScriptExtensions, with the short name. The final mapping file is a
12972 # combination of these, and without adjustment, would have inconsistent
12973 # entries. This filters the latter file to convert to full names.
12974 # Entries look like this:
12975 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12977 my @fields = split /\s*;\s*/;
12979 # This script was erroneously omitted in this Unicode version.
12980 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12983 foreach my $short_name (split " ", $fields[1]) {
12984 push @full_names, $script->table($short_name)->full_name;
12986 $fields[1] = join " ", @full_names;
12987 $_ = join "; ", @fields;
12994 # Populates the Hangul Syllable Type property from first principles
12997 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12999 # These few ranges are hard-coded in.
13000 $file->insert_lines(split /\n/, <<'END'
13008 # The Hangul syllables in version 1 are at different code points than
13009 # those that came along starting in version 2, and have different names;
13010 # they comprise about 60% of the code points of the later version.
13011 # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13012 # initial set is a subset of the later version, with different English
13013 # transliterations. I did not see an easy mapping between them. The
13014 # later set includes essentially all possibilities, even ones that aren't
13015 # in modern use (if they ever were), and over 96% of the new ones are type
13016 # LVT. Mathematically, the early set must also contain a preponderance of
13017 # LVT values. In lieu of doing nothing, we just set them all to LVT, and
13018 # expect that this will be right most of the time, which is better than
13019 # not being right at all.
13020 if ($v_version lt v2.0.0) {
13021 my $property = property_ref($file->property);
13022 $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13023 $FIRST_REMOVED_HANGUL_SYLLABLE,
13024 $FINAL_REMOVED_HANGUL_SYLLABLE));
13025 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13029 # The algorithmically derived syllables are almost all LVT ones, so
13030 # initialize the whole range with that.
13031 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13032 $SBase, $SBase + $SCount -1);
13034 # Those ones that aren't LVT are LV, and they occur at intervals of
13035 # $TCount code points, starting with the first code point, at $SBase.
13036 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13037 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13045 # Populates the Grapheme Cluster Break property from first principles
13048 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13050 # All these definitions are from
13051 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13052 # from http://www.unicode.org/reports/tr29/tr29-4.html
13054 foreach my $range ($gc->ranges) {
13056 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13058 if ($range->value =~ / ^ M [en] $ /x) {
13059 $file->insert_lines(sprintf "%04X..%04X; Extend",
13060 $range->start, $range->end);
13062 elsif ($range->value =~ / ^ C [cf] $ /x) {
13063 $file->insert_lines(sprintf "%04X..%04X; Control",
13064 $range->start, $range->end);
13067 $file->insert_lines("2028; Control"); # Line Separator
13068 $file->insert_lines("2029; Control"); # Paragraph Separator
13070 $file->insert_lines("000D; CR");
13071 $file->insert_lines("000A; LF");
13073 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13074 foreach my $code_point ( qw{
13075 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13076 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13079 my $category = $gc->value_of(hex $code_point);
13080 next if ! defined $category || $category eq 'Cn'; # But not if
13081 # unassigned in this
13083 $file->insert_lines("$code_point; Extend");
13086 my $hst = property_ref('Hangul_Syllable_Type');
13087 if ($hst->count > 0) {
13088 foreach my $range ($hst->ranges) {
13089 $file->insert_lines(sprintf "%04X..%04X; %s",
13090 $range->start, $range->end, $range->value);
13094 generate_hst($file);
13097 main::process_generic_property_file($file);
13101 sub fixup_early_perl_name_alias {
13103 # Different versions of Unicode have varying support for the name synonyms
13104 # below. Just include everything. As of 6.1, all these are correct in
13105 # the Unicode-supplied file.
13108 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13111 # ALERT did not come along until 6.0, at which point it became preferred
13112 # over BELL. By inserting it last in early releases, BELL is preferred
13113 # over it; and vice-vers in 6.0
13114 my $type_for_bell = ($v_version lt v6.0.0)
13117 $file->insert_lines(split /\n/, <<END
13118 0007;BELL; $type_for_bell
13119 000A;LINE FEED (LF);alternate
13120 000C;FORM FEED (FF);alternate
13121 000D;CARRIAGE RETURN (CR);alternate
13122 0085;NEXT LINE (NEL);alternate
13127 # One might think that the the 'Unicode_1_Name' field, could work for most
13128 # of the above names, but sadly that field varies depending on the
13129 # release. Version 1.1.5 had no names for any of the controls; Version
13130 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13131 # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13132 # changed to parenthesized versions like "NEXT LINE" to
13133 # "NEXT LINE (NEL)";
13134 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13135 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13136 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13138 # All these are present in the 6.1 NameAliases.txt
13143 sub filter_later_version_name_alias_line {
13145 # This file has an extra entry per line for the alias type. This is
13146 # handled by creating a compound entry: "$alias: $type"; First, split
13147 # the line into components.
13148 my ($range, $alias, $type, @remainder)
13149 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13151 # This file contains multiple entries for some components, so tell the
13152 # downstream code to allow this in our internal tables; the
13153 # $MULTIPLE_AFTER preserves the input ordering.
13154 $_ = join ";", $range, $CMD_DELIM
13164 sub filter_early_version_name_alias_line {
13166 # Early versions did not have the trailing alias type field; implicitly it
13167 # was 'correction'.
13168 $_ .= "; correction";
13170 filter_later_version_name_alias_line;
13174 sub filter_all_caps_script_names {
13176 # Some early Unicode releases had the script names in all CAPS. This
13177 # converts them to just the first letter of each word being capital.
13179 my ($range, $script, @remainder)
13180 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13181 my @words = split "_", $script;
13182 for my $word (@words) {
13184 ucfirst(lc($word)) if $word ne 'CJK';
13186 $script = join "_", @words;
13187 $_ = join ";", $range, $script, @remainder;
13190 sub finish_Unicode() {
13191 # This routine should be called after all the Unicode files have been read
13193 # 1) Creates properties that are missing from the version of Unicode being
13194 # compiled, and which, for whatever reason, are needed for the Perl
13195 # core to function properly. These are minimally populated as
13197 # 2) Adds the mappings for code points missing from the files which have
13198 # defaults specified for them.
13199 # 3) At this this point all mappings are known, so it computes the type of
13200 # each property whose type hasn't been determined yet.
13201 # 4) Calculates all the regular expression match tables based on the
13203 # 5) Calculates and adds the tables which are defined by Unicode, but
13204 # which aren't derived by them, and certain derived tables that Perl
13207 # Folding information was introduced later into Unicode data. To get
13208 # Perl's case ignore (/i) to work at all in releases that don't have
13209 # folding, use the best available alternative, which is lower casing.
13210 my $fold = property_ref('Case_Folding');
13211 if ($fold->is_empty) {
13212 $fold->initialize(property_ref('Lowercase_Mapping'));
13213 $fold->add_note(join_lines(<<END
13214 WARNING: This table uses lower case as a substitute for missing fold
13220 # Multiple-character mapping was introduced later into Unicode data, so it
13221 # is by default the simple version. If to output the simple versions and
13222 # not present, just use the regular (which in these Unicode versions is
13223 # the simple as well).
13224 foreach my $map (qw { Uppercase_Mapping
13230 my $comment = <<END;
13232 Note that although the Perl core uses this file, it has the standard values
13233 for code points from U+0000 to U+00FF compiled in, so changing this table will
13234 not change the core's behavior with respect to these code points. Use
13235 Unicode::Casing to override this table.
13237 if ($map eq 'Case_Folding') {
13239 (/i regex matching is not overridable except by using a custom regex engine)
13242 property_ref($map)->add_comment(join_lines($comment));
13243 my $simple = property_ref("Simple_$map");
13244 next if ! $simple->is_empty;
13245 if ($simple->to_output_map) {
13246 $simple->initialize(property_ref($map));
13249 property_ref($map)->set_proxy_for($simple->name);
13253 # For each property, fill in any missing mappings, and calculate the re
13254 # match tables. If a property has more than one missing mapping, the
13255 # default is a reference to a data structure, and requires data from other
13256 # properties to resolve. The sort is used to cause these to be processed
13257 # last, after all the other properties have been calculated.
13258 # (Fortunately, the missing properties so far don't depend on each other.)
13259 foreach my $property
13260 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13263 # $perl has been defined, but isn't one of the Unicode properties that
13264 # need to be finished up.
13265 next if $property == $perl;
13267 # Nor do we need to do anything with properties that aren't going to
13269 next if $property->fate == $SUPPRESSED;
13271 # Handle the properties that have more than one possible default
13272 if (ref $property->default_map) {
13273 my $default_map = $property->default_map;
13275 # These properties have stored in the default_map:
13277 # 1) A default map which applies to all code points in a
13279 # 2) an expression which will evaluate to the list of code
13280 # points in that class
13282 # 3) the default map which applies to every other missing code
13285 # Go through each list.
13286 while (my ($default, $eval) = $default_map->get_next_defaults) {
13288 # Get the class list, and intersect it with all the so-far
13289 # unspecified code points yielding all the code points
13290 # in the class that haven't been specified.
13291 my $list = eval $eval;
13293 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13297 # Narrow down the list to just those code points we don't have
13299 $list = $list & $property->inverse_list;
13301 # Add mappings to the property for each code point in the list
13302 foreach my $range ($list->ranges) {
13303 $property->add_map($range->start, $range->end, $default,
13304 Replace => $CROAK);
13308 # All remaining code points have the other mapping. Set that up
13309 # so the normal single-default mapping code will work on them
13310 $property->set_default_map($default_map->other_default);
13312 # And fall through to do that
13315 # We should have enough data now to compute the type of the property.
13316 my $property_name = $property->name;
13317 $property->compute_type;
13318 my $property_type = $property->type;
13320 next if ! $property->to_create_match_tables;
13322 # Here want to create match tables for this property
13324 # The Unicode db always (so far, and they claim into the future) have
13325 # the default for missing entries in binary properties be 'N' (unless
13326 # there is a '@missing' line that specifies otherwise)
13327 if (! defined $property->default_map) {
13328 if ($property_type == $BINARY) {
13329 $property->set_default_map('N');
13331 elsif ($property_type == $ENUM) {
13332 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one");
13333 $property->set_default_map('XXX This makes sure there is a default map');
13337 # Add any remaining code points to the mapping, using the default for
13338 # missing code points.
13340 my $default_map = $property->default_map;
13341 if ($property_type == $FORCED_BINARY) {
13343 # A forced binary property creates a 'Y' table that matches all
13344 # non-default values. The actual string values are also written out
13345 # as a map table. (The default value will almost certainly be the
13346 # empty string, so the pod glosses over the distinction, and just
13347 # talks about empty vs non-empty.)
13348 my $yes = $property->table("Y");
13349 foreach my $range ($property->ranges) {
13350 next if $range->value eq $default_map;
13351 $yes->add_range($range->start, $range->end);
13353 $property->table("N")->set_complement($yes);
13356 if (defined $default_map) {
13358 # Make sure there is a match table for the default
13359 if (! defined ($default_table = $property->table($default_map)))
13361 $default_table = $property->add_match_table($default_map);
13364 # And, if the property is binary, the default table will just
13365 # be the complement of the other table.
13366 if ($property_type == $BINARY) {
13367 my $non_default_table;
13369 # Find the non-default table.
13370 for my $table ($property->tables) {
13371 if ($table == $default_table) {
13372 if ($v_version le v5.0.0) {
13373 $table->add_alias($_) for qw(N No F False);
13376 } elsif ($v_version le v5.0.0) {
13377 $table->add_alias($_) for qw(Y Yes T True);
13379 $non_default_table = $table;
13381 $default_table->set_complement($non_default_table);
13385 # This fills in any missing values with the default. It's
13386 # not necessary to do this with binary properties, as the
13387 # default is defined completely in terms of the Y table.
13388 $property->add_map(0, $MAX_WORKING_CODEPOINT,
13389 $default_map, Replace => $NO);
13393 # Have all we need to populate the match tables.
13394 my $maps_should_be_defined = $property->pre_declared_maps;
13395 foreach my $range ($property->ranges) {
13396 my $map = $range->value;
13397 my $table = $property->table($map);
13398 if (! defined $table) {
13400 # Integral and rational property values are not
13401 # necessarily defined in PropValueAliases, but whether all
13402 # the other ones should be depends on the property.
13403 if ($maps_should_be_defined
13404 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13406 Carp::my_carp("Table '$property_name=$map' should "
13407 . "have been defined. Defining it now.")
13409 $table = $property->add_match_table($map);
13412 next if $table->complement != 0; # Don't need to populate these
13413 $table->add_range($range->start, $range->end);
13417 # For Perl 5.6 compatibility, all properties matchable in regexes can
13418 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
13419 # But warn if this creates a conflict with a (new) Unicode property
13420 # name, although it appears that Unicode has made a decision never to
13421 # begin a property name with 'Is_', so this shouldn't happen.
13422 foreach my $alias ($property->aliases) {
13423 my $Is_name = 'Is_' . $alias->name;
13424 if (defined (my $pre_existing = property_ref($Is_name))) {
13425 Carp::my_carp(<<END
13426 There is already an alias named $Is_name (from " . $pre_existing . "), so
13427 creating one for $property won't work. This is bad news. If it is not too
13428 late, get Unicode to back off. Otherwise go back to the old scheme (findable
13429 from the git blame log for this area of the code that suppressed individual
13430 aliases that conflict with the new Unicode names. Proceeding anyway.
13434 } # End of loop through aliases for this property
13435 } # End of loop through all Unicode properties.
13437 # Fill in the mappings that Unicode doesn't completely furnish. First the
13438 # single letter major general categories. If Unicode were to start
13439 # delivering the values, this would be redundant, but better that than to
13440 # try to figure out if should skip and not get it right. Ths could happen
13441 # if a new major category were to be introduced, and the hard-coded test
13442 # wouldn't know about it.
13443 # This routine depends on the standard names for the general categories
13444 # being what it thinks they are, like 'Cn'. The major categories are the
13445 # union of all the general category tables which have the same first
13446 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13447 foreach my $minor_table ($gc->tables) {
13448 my $minor_name = $minor_table->name;
13449 next if length $minor_name == 1;
13450 if (length $minor_name != 2) {
13451 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
13455 my $major_name = uc(substr($minor_name, 0, 1));
13456 my $major_table = $gc->table($major_name);
13457 $major_table += $minor_table;
13460 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
13461 # defines it as LC)
13462 my $LC = $gc->table('LC');
13463 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
13464 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
13467 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13468 # deliver the correct values in it
13469 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13471 # Lt not in release 1.
13472 if (defined $gc->table('Lt')) {
13473 $LC += $gc->table('Lt');
13474 $gc->table('Lt')->set_caseless_equivalent($LC);
13477 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13479 $gc->table('Ll')->set_caseless_equivalent($LC);
13480 $gc->table('Lu')->set_caseless_equivalent($LC);
13482 # Create digit and case fold tables with the original file names for
13483 # backwards compatibility with applications that read them directly.
13484 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13486 File => 'Digit', # Trad. location
13487 Directory => $map_directory,
13489 Replacement_Property => "Perl_Decimal_Digit",
13490 Initialize => property_ref('Perl_Decimal_Digit'),
13492 $Digit->add_comment(join_lines(<<END
13493 This file gives the mapping of all code points which represent a single
13494 decimal digit [0-9] to their respective digits. For example, the code point
13495 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
13496 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13501 Property->new('Legacy_Case_Folding',
13503 Directory => $map_directory,
13504 Default_Map => $CODE_POINT,
13506 Replacement_Property => "Case_Folding",
13507 Format => $HEX_FORMAT,
13508 Initialize => property_ref('cf'),
13511 # The Script_Extensions property started out as a clone of the Script
13512 # property. But processing its data file caused some elements to be
13513 # replaced with different data. (These elements were for the Common and
13514 # Inherited properties.) This data is a qw() list of all the scripts that
13515 # the code points in the given range are in. An example line is:
13516 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
13518 # The code above has created a new match table named "Arab Syrc Thaa"
13519 # which contains 060C. (The cloned table started out with this code point
13520 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
13521 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
13522 # match table. This is repeated for all these tables and ranges. The map
13523 # data is retained in the map table for reference, but the spurious match
13524 # tables are deleted.
13526 my $scx = property_ref("Script_Extensions");
13527 if (defined $scx) {
13528 foreach my $table ($scx->tables) {
13529 next unless $table->name =~ /\s/; # All the new and only the new
13530 # tables have a space in their
13532 my @scripts = split /\s+/, $table->name;
13533 foreach my $script (@scripts) {
13534 my $script_table = $scx->table($script);
13535 $script_table += $table;
13537 $scx->delete_match_table($table);
13544 sub pre_3_dot_1_Nl () {
13546 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13547 # is when Unicode's became fully usable. These code points were
13548 # determined by inspection and experimentation. gc=nl is important for
13549 # certain Perl-extension properties that should be available in all
13552 my $Nl = Range_List->new();
13553 if (defined (my $official = $gc->table('Nl'))) {
13557 $Nl->add_range(0x2160, 0x2182);
13558 $Nl->add_range(0x3007, 0x3007);
13559 $Nl->add_range(0x3021, 0x3029);
13561 $Nl->add_range(0xFE20, 0xFE23);
13562 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13567 sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
13568 # called before the Cn's are completely filled.
13569 # Works on Unicodes earlier than ones that
13570 # explicitly specify Cn.
13571 return if defined $Assigned;
13573 if (! defined $gc || $gc->is_empty()) {
13574 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
13577 $Assigned = $perl->add_match_table('Assigned',
13578 Description => "All assigned code points",
13580 while (defined (my $range = $gc->each_range())) {
13581 my $standard_value = standardize($range->value);
13582 next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
13583 $Assigned->add_range($range->start, $range->end);
13587 sub calculate_DI() { # Set $DI to a Range_List equivalent to the
13588 # Default_Ignorable_Code_Point property. Works on
13589 # Unicodes earlier than ones that explicitly specify
13591 return if defined $DI;
13593 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
13594 $DI = $di->table('Y');
13597 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
13602 if ($v_version ge v2.0) {
13603 $DI += $gc->table('Cf')
13604 + $gc->table('Cs');
13606 # These are above the Unicode version 1 max
13607 $DI->add_range(0xE0000, 0xE0FFF);
13609 $DI += $gc->table('Cc')
13611 - utf8::unicode_to_native(0x0A) # LINE FEED
13612 - utf8::unicode_to_native(0x0B) # VERTICAL TAB
13614 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
13615 - utf8::unicode_to_native(0x85); # NEL
13619 sub calculate_NChar() { # Create a Perl extension match table which is the
13620 # same as the Noncharacter_Code_Point property, and
13621 # set $NChar to point to it. Works on Unicodes
13622 # earlier than ones that explicitly specify NChar
13623 return if defined $NChar;
13625 $NChar = $perl->add_match_table('_Perl_Nchar',
13626 Perl_Extension => 1,
13627 Fate => $INTERNAL_ONLY);
13628 if (defined (my $off_nchar = property_ref('NChar'))) {
13629 $NChar->initialize($off_nchar->table('Y'));
13632 $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
13633 if ($v_version ge v2.0) { # First release with these nchars
13634 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
13635 $NChar += [ $i .. $i+1 ];
13641 sub handle_compare_versions () {
13642 # This fixes things up for the $compare_versions capability, where we
13643 # compare Unicode version X with version Y (with Y > X), and we are
13644 # running it on the Unicode Data for version Y.
13646 # It works by calculating the code points whose meaning has been specified
13647 # after release X, by using the Age property. The complement of this set
13648 # is the set of code points whose meaning is unchanged between the
13649 # releases. This is the set the program restricts itself to. It includes
13650 # everything whose meaning has been specified by the time version X came
13651 # along, plus those still unassigned by the time of version Y. (We will
13652 # continue to use the word 'assigned' to mean 'meaning has been
13653 # specified', as it's shorter and is accurate in all cases except the
13654 # Noncharacter code points.)
13656 # This function is run after all the properties specified by Unicode have
13657 # been calculated for release Y. This makes sure we get all the nuances
13658 # of Y's rules. (It is done before the Perl extensions are calculated, as
13659 # those are based entirely on the Unicode ones.) But doing it after the
13660 # Unicode table calculations means we have to fix up the Unicode tables.
13661 # We do this by subtracting the code points that have been assigned since
13662 # X (which is actually done by ANDing each table of assigned code points
13663 # with the set of unchanged code points). Most Unicode properties are of
13664 # the form such that all unassigned code points have a default, grab-bag,
13665 # property value which is changed when the code point gets assigned. For
13666 # these, we just remove the changed code points from the table for the
13667 # latter property value, and add them back in to the grab-bag one. A few
13668 # other properties are not entirely of this form and have values for some
13669 # or all unassigned code points that are not the grab-bag one. These have
13670 # to be handled specially, and are hard-coded in to this routine based on
13671 # manual inspection of the Unicode character database. A list of the
13672 # outlier code points is made for each of these properties, and those
13673 # outliers are excluded from adding and removing from tables.
13675 # Note that there are glitches when comparing against Unicode 1.1, as some
13676 # Hangul syllables in it were later ripped out and eventually replaced
13677 # with other things.
13679 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
13681 my $after_first_version = "All matching code points were added after "
13682 . "Unicode $string_compare_versions";
13684 # Calculate the delta as those code points that have been newly assigned
13685 # since the first compare version.
13686 my $delta = Range_List->new();
13687 foreach my $table ($age->tables) {
13688 next if $table == $age->table('Unassigned');
13689 next if $table->name le $string_compare_versions;
13692 if ($delta->is_empty) {
13693 die ("No changes; perhaps you need a 'DAge.txt' file?");
13696 my $unchanged = ~ $delta;
13698 calculate_Assigned() if ! defined $Assigned;
13699 $Assigned &= $unchanged;
13701 # $Assigned now contains the code points that were assigned as of Unicode
13704 # A block is all or nothing. If nothing is assigned in it, it all goes
13705 # back to the No_Block pool; but if even one code point is assigned, the
13706 # block is retained.
13707 my $no_block = $block->table('No_Block');
13708 foreach my $this_block ($block->tables) {
13709 next if $this_block == $no_block
13710 || ! ($this_block & $Assigned)->is_empty;
13711 $this_block->set_fate($SUPPRESSED, $after_first_version);
13712 $no_block += $this_block;
13715 my @special_delta_properties; # List of properties that have to be
13716 # handled specially.
13717 my %restricted_delta; # Keys are the entries in
13718 # @special_delta_properties; values
13719 # are the range list of the code points
13720 # that behave normally when they get
13723 # In the next three properties, the Default Ignorable code points are
13728 push @special_delta_properties, property_ref('_Perl_GCB');
13729 $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13731 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
13733 push @special_delta_properties, $cwnfkcc;
13734 $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13737 calculate_NChar(); # Non-character code points
13738 $NChar &= $unchanged;
13740 # This may have to be updated from time-to-time to get the most accurate
13742 my $default_BC_non_LtoR = Range_List->new(Initialize =>
13743 # These came from the comments in v8.0 DBidiClass.txt
13750 0x1EE00 .. 0x1EEFF,
13755 0x10800 .. 0x10FFF,
13756 0x1E800 .. 0x1EDFF,
13757 0x1EF00 .. 0x1EFFF,
13762 $default_BC_non_LtoR += $DI + $NChar;
13763 push @special_delta_properties, property_ref('BidiClass');
13764 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
13766 if (defined (my $eaw = property_ref('East_Asian_Width'))) {
13768 my $default_EA_width_W = Range_List->new(Initialize =>
13769 # From comments in v8.0 EastAsianWidth.txt
13774 0x20000 .. 0x2A6DF,
13775 0x2A700 .. 0x2B73F,
13776 0x2B740 .. 0x2B81F,
13777 0x2B820 .. 0x2CEAF,
13778 0x2F800 .. 0x2FA1F,
13779 0x20000 .. 0x2FFFD,
13780 0x30000 .. 0x3FFFD,
13783 push @special_delta_properties, $eaw;
13784 $restricted_delta{$special_delta_properties[-1]}
13785 = ~ $default_EA_width_W;
13787 # Line break came along in the same release as East_Asian_Width, and
13788 # the non-grab-bag default set is a superset of the EAW one.
13789 if (defined (my $lb = property_ref('Line_Break'))) {
13790 my $default_LB_non_XX = Range_List->new(Initialize =>
13791 # From comments in v8.0 LineBreak.txt
13792 [ 0x20A0 .. 0x20CF ]);
13793 $default_LB_non_XX += $default_EA_width_W;
13794 push @special_delta_properties, $lb;
13795 $restricted_delta{$special_delta_properties[-1]}
13796 = ~ $default_LB_non_XX;
13800 # Go through every property, skipping those we've already worked on, those
13801 # that are immutable, and the perl ones that will be calculated after this
13802 # routine has done its fixup.
13803 foreach my $property (property_ref('*')) {
13804 next if $property == $perl # Done later in the program
13805 || $property == $block # Done just above
13806 || $property == $DI # Done just above
13807 || $property == $NChar # Done just above
13809 # The next two are invariant across Unicode versions
13810 || $property == property_ref('Pattern_Syntax')
13811 || $property == property_ref('Pattern_White_Space');
13813 # Find the grab-bag value.
13814 my $default_map = $property->default_map;
13816 if (! $property->to_create_match_tables) {
13818 # Here there aren't any match tables. So far, all such properties
13819 # have a default map, and don't require special handling. Just
13820 # change each newly assigned code point back to the default map,
13821 # as if they were unassigned.
13822 foreach my $range ($delta->ranges) {
13823 $property->add_map($range->start,
13826 Replace => $UNCONDITIONALLY);
13829 else { # Here there are match tables. Find the one (if any) for the
13830 # grab-bag value that unassigned code points go to.
13832 if (defined $default_map) {
13833 $default_table = $property->table($default_map);
13836 # If some code points don't go back to the the grab-bag when they
13837 # are considered unassigned, exclude them from the list that does
13839 my $this_delta = $delta;
13840 my $this_unchanged = $unchanged;
13841 if (grep { $_ == $property } @special_delta_properties) {
13842 $this_delta = $delta & $restricted_delta{$property};
13843 $this_unchanged = ~ $this_delta;
13846 # Fix up each match table for this property.
13847 foreach my $table ($property->tables) {
13848 if (defined $default_table && $table == $default_table) {
13850 # The code points assigned after release X (the ones we
13851 # are excluding in this routine) go back on to the default
13852 # (grab-bag) table. However, some of these tables don't
13853 # actually exist, but are specified solely by the other
13854 # tables. (In a binary property, we don't need to
13855 # actually have an 'N' table, as it's just the complement
13856 # of the 'Y' table.) Such tables will be locked, so just
13858 $table += $this_delta unless $table->locked;
13862 # Here the table is not for the default value. We need to
13863 # subtract the code points we are ignoring for this
13864 # comparison (the deltas) from it. But if the table
13865 # started out with nothing, no need to exclude anything,
13866 # and want to skip it here anyway, so it gets listed
13867 # properly in the pod.
13868 next if $table->is_empty;
13870 # Save the deltas for later, before we do the subtraction
13871 my $deltas = $table & $this_delta;
13873 $table &= $this_unchanged;
13875 # Suppress the table if the subtraction left it with
13877 if ($table->is_empty) {
13878 if ($property->type == $BINARY) {
13879 push @tables_that_may_be_empty, $table->complete_name;
13882 $table->set_fate($SUPPRESSED, $after_first_version);
13886 # Now we add the removed code points to the property's
13887 # map, as they should now map to the grab-bag default
13888 # property (which they did in the first comparison
13889 # version). But we don't have to do this if the map is
13890 # only for internal use.
13891 if (defined $default_map && $property->to_output_map) {
13893 # The gc property has pseudo property values whose names
13894 # have length 1. These are the union of all the
13895 # property values whose name is longer than 1 and
13896 # whose first letter is all the same. The replacement
13897 # is done once for the longer-named tables.
13898 next if $property == $gc && length $table->name == 1;
13900 foreach my $range ($deltas->ranges) {
13901 $property->add_map($range->start,
13904 Replace => $UNCONDITIONALLY);
13912 # The above code doesn't work on 'gc=C', as it is a superset of the default
13913 # ('Cn') table. It's easiest to just special case it here.
13914 my $C = $gc->table('C');
13915 $C += $gc->table('Cn');
13920 sub compile_perl() {
13921 # Create perl-defined tables. Almost all are part of the pseudo-property
13922 # named 'perl' internally to this program. Many of these are recommended
13923 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
13924 # on those found there.
13925 # Almost all of these are equivalent to some Unicode property.
13926 # A number of these properties have equivalents restricted to the ASCII
13927 # range, with their names prefaced by 'Posix', to signify that these match
13928 # what the Posix standard says they should match. A couple are
13929 # effectively this, but the name doesn't have 'Posix' in it because there
13930 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
13931 # to the full Unicode range, by our guesses as to what is appropriate.
13933 # 'All' is all code points. As an error check, instead of just setting it
13934 # to be that, construct it to be the union of all the major categories
13935 $All = $perl->add_match_table('All',
13937 => "All code points, including those above Unicode. Same as qr/./s",
13940 foreach my $major_table ($gc->tables) {
13942 # Major categories are the ones with single letter names.
13943 next if length($major_table->name) != 1;
13945 $All += $major_table;
13948 if ($All->max != $MAX_WORKING_CODEPOINT) {
13949 Carp::my_carp_bug("Generated highest code point ("
13950 . sprintf("%X", $All->max)
13951 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
13953 if ($All->range_count != 1 || $All->min != 0) {
13954 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
13957 my $Any = $perl->add_match_table('Any',
13958 Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
13960 $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
13961 $Any->add_alias('Unicode');
13963 calculate_Assigned();
13965 # Our internal-only property should be treated as more than just a
13966 # synonym; grandfather it in to the pod.
13967 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
13968 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
13969 ->set_equivalent_to(property_ref('ccc')->table('Above'),
13972 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
13973 if (defined $block) { # This is equivalent to the block if have it.
13974 my $Unicode_ASCII = $block->table('Basic_Latin');
13975 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
13976 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
13980 # Very early releases didn't have blocks, so initialize ASCII ourselves if
13982 if ($ASCII->is_empty) {
13983 if (! NON_ASCII_PLATFORM) {
13984 $ASCII->add_range(0, 127);
13987 for my $i (0 .. 127) {
13988 $ASCII->add_range(utf8::unicode_to_native($i),
13989 utf8::unicode_to_native($i));
13994 # Get the best available case definitions. Early Unicode versions didn't
13995 # have Uppercase and Lowercase defined, so use the general category
13996 # instead for them, modified by hard-coding in the code points each is
13998 my $Lower = $perl->add_match_table('XPosixLower');
13999 my $Unicode_Lower = property_ref('Lowercase');
14000 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14001 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14005 $Lower += $gc->table('Lowercase_Letter');
14007 # There are quite a few code points in Lower, that aren't in gc=lc,
14008 # and not all are in all releases.
14009 my $temp = Range_List->new(Initialize => [
14010 utf8::unicode_to_native(0xAA),
14011 utf8::unicode_to_native(0xBA),
14029 $Lower += $temp & $Assigned;
14031 my $Posix_Lower = $perl->add_match_table("PosixLower",
14032 Description => "[a-z]",
14033 Initialize => $Lower & $ASCII,
14036 my $Upper = $perl->add_match_table("XPosixUpper");
14037 my $Unicode_Upper = property_ref('Uppercase');
14038 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14039 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14043 # Unlike Lower, there are only two ranges in Upper that aren't in
14044 # gc=Lu, and all code points were assigned in all releases.
14045 $Upper += $gc->table('Uppercase_Letter');
14046 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
14047 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
14049 my $Posix_Upper = $perl->add_match_table("PosixUpper",
14050 Description => "[A-Z]",
14051 Initialize => $Upper & $ASCII,
14054 # Earliest releases didn't have title case. Initialize it to empty if not
14055 # otherwise present
14056 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14057 Description => '(= \p{Gc=Lt})');
14058 my $lt = $gc->table('Lt');
14060 # Earlier versions of mktables had this related to $lt since they have
14061 # identical code points, but their caseless equivalents are not the same,
14062 # one being 'Cased' and the other being 'LC', and so now must be kept as
14063 # separate entities.
14068 push @tables_that_may_be_empty, $Title->complete_name;
14071 my $Unicode_Cased = property_ref('Cased');
14072 if (defined $Unicode_Cased) {
14073 my $yes = $Unicode_Cased->table('Y');
14074 my $no = $Unicode_Cased->table('N');
14075 $Title->set_caseless_equivalent($yes);
14076 if (defined $Unicode_Upper) {
14077 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14078 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14080 $Upper->set_caseless_equivalent($yes);
14081 if (defined $Unicode_Lower) {
14082 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14083 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14085 $Lower->set_caseless_equivalent($yes);
14088 # If this Unicode version doesn't have Cased, set up the Perl
14089 # extension from first principles. From Unicode 5.1: Definition D120:
14090 # A character C is defined to be cased if and only if C has the
14091 # Lowercase or Uppercase property or has a General_Category value of
14092 # Titlecase_Letter.
14093 my $cased = $perl->add_match_table('Cased',
14094 Initialize => $Lower + $Upper + $Title,
14095 Description => 'Uppercase or Lowercase or Titlecase',
14097 # $notcased is purely for the caseless equivalents below
14098 my $notcased = $perl->add_match_table('_Not_Cased',
14099 Initialize => ~ $cased,
14100 Fate => $INTERNAL_ONLY,
14101 Description => 'All not-cased code points');
14102 $Title->set_caseless_equivalent($cased);
14103 if (defined $Unicode_Upper) {
14104 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14105 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14107 $Upper->set_caseless_equivalent($cased);
14108 if (defined $Unicode_Lower) {
14109 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14110 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14112 $Lower->set_caseless_equivalent($cased);
14115 # Similarly, set up our own Case_Ignorable property if this Unicode
14116 # version doesn't have it. From Unicode 5.1: Definition D121: A character
14117 # C is defined to be case-ignorable if C has the value MidLetter or the
14118 # value MidNumLet for the Word_Break property or its General_Category is
14119 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
14120 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
14122 # Perl has long had an internal-only alias for this property; grandfather
14123 # it in to the pod, but discourage its use.
14124 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
14126 Fate => $INTERNAL_ONLY,
14127 Status => $DISCOURAGED);
14128 my $case_ignorable = property_ref('Case_Ignorable');
14129 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
14130 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
14135 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
14137 # The following three properties are not in early releases
14138 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
14139 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
14140 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
14142 # For versions 4.1 - 5.0, there is no MidNumLet property, and
14143 # correspondingly the case-ignorable definition lacks that one. For
14144 # 4.0, it appears that it was meant to be the same definition, but was
14145 # inadvertently omitted from the standard's text, so add it if the
14146 # property actually is there
14147 my $wb = property_ref('Word_Break');
14149 my $midlet = $wb->table('MidLetter');
14150 $perl_case_ignorable += $midlet if defined $midlet;
14151 my $midnumlet = $wb->table('MidNumLet');
14152 $perl_case_ignorable += $midnumlet if defined $midnumlet;
14156 # In earlier versions of the standard, instead of the above two
14157 # properties , just the following characters were used:
14158 $perl_case_ignorable +=
14160 + utf8::unicode_to_native(0xAD) # SOFT HYPHEN (SHY)
14161 + 0x2019; # RIGHT SINGLE QUOTATION MARK
14165 # The remaining perl defined tables are mostly based on Unicode TR 18,
14166 # "Annex C: Compatibility Properties". All of these have two versions,
14167 # one whose name generally begins with Posix that is posix-compliant, and
14168 # one that matches Unicode characters beyond the Posix, ASCII range
14170 my $Alpha = $perl->add_match_table('XPosixAlpha');
14172 # Alphabetic was not present in early releases
14173 my $Alphabetic = property_ref('Alphabetic');
14174 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14175 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14179 # The Alphabetic property doesn't exist for early releases, so
14180 # generate it. The actual definition, in 5.2 terms is:
14182 # gc=L + gc=Nl + Other_Alphabetic
14184 # Other_Alphabetic is also not defined in these early releases, but it
14185 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14186 # those last two as well, then subtract the relatively few of them that
14187 # shouldn't have been added. (The gc=So range is the circled capital
14188 # Latin characters. Early releases mistakenly didn't also include the
14189 # lower-case versions of these characters, and so we don't either, to
14190 # maintain consistency with those releases that first had this
14192 $Alpha->initialize($gc->table('Letter')
14197 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
14198 foreach my $range ( [ 0x0300, 0x0344 ],
14199 [ 0x0346, 0x034E ],
14200 [ 0x0360, 0x0362 ],
14201 [ 0x0483, 0x0486 ],
14202 [ 0x0591, 0x05AF ],
14203 [ 0x06DF, 0x06E0 ],
14204 [ 0x06EA, 0x06EC ],
14205 [ 0x0740, 0x074A ],
14208 [ 0x0951, 0x0954 ],
14222 [ 0x0E47, 0x0E4C ],
14224 [ 0x0EC8, 0x0ECC ],
14225 [ 0x0F18, 0x0F19 ],
14229 [ 0x0F3E, 0x0F3F ],
14230 [ 0x0F82, 0x0F84 ],
14231 [ 0x0F86, 0x0F87 ],
14235 [ 0x17C9, 0x17D3 ],
14236 [ 0x20D0, 0x20DC ],
14238 [ 0x302A, 0x302F ],
14239 [ 0x3099, 0x309A ],
14240 [ 0xFE20, 0xFE23 ],
14241 [ 0x1D165, 0x1D169 ],
14242 [ 0x1D16D, 0x1D172 ],
14243 [ 0x1D17B, 0x1D182 ],
14244 [ 0x1D185, 0x1D18B ],
14245 [ 0x1D1AA, 0x1D1AD ],
14248 $Alpha->delete_range($range->[0], $range->[1]);
14251 $Alpha->delete_range($range, $range);
14254 $Alpha->add_description('Alphabetic');
14255 $Alpha->add_alias('Alphabetic');
14257 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14258 Description => "[A-Za-z]",
14259 Initialize => $Alpha & $ASCII,
14261 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14262 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14264 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14265 Description => 'Alphabetic and (decimal) Numeric',
14266 Initialize => $Alpha + $gc->table('Decimal_Number'),
14268 $perl->add_match_table("PosixAlnum",
14269 Description => "[A-Za-z0-9]",
14270 Initialize => $Alnum & $ASCII,
14273 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14274 Description => '\w, including beyond ASCII;'
14275 . ' = \p{Alnum} + \pM + \p{Pc}',
14276 Initialize => $Alnum + $gc->table('Mark'),
14278 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14283 $Word += ord('_'); # Make sure this is a $Word
14285 my $JC = property_ref('Join_Control'); # Wasn't in release 1
14287 $Word += $JC->table('Y');
14290 $Word += 0x200C + 0x200D;
14293 # This is a Perl extension, so the name doesn't begin with Posix.
14294 my $PerlWord = $perl->add_match_table('PosixWord',
14295 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
14296 Initialize => $Word & $ASCII,
14298 $PerlWord->add_alias('PerlWord');
14300 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14301 Description => '\h, Horizontal white space',
14303 # 200B is Zero Width Space which is for line
14304 # break control, and was listed as
14305 # Space_Separator in early releases
14306 Initialize => $gc->table('Space_Separator')
14310 $Blank->add_alias('HorizSpace'); # Another name for it.
14311 $perl->add_match_table("PosixBlank",
14312 Description => "\\t and ' '",
14313 Initialize => $Blank & $ASCII,
14316 my $VertSpace = $perl->add_match_table('VertSpace',
14317 Description => '\v',
14319 $gc->table('Line_Separator')
14320 + $gc->table('Paragraph_Separator')
14321 + utf8::unicode_to_native(0x0A) # LINE FEED
14322 + utf8::unicode_to_native(0x0B) # VERTICAL TAB
14324 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
14325 + utf8::unicode_to_native(0x85) # NEL
14327 # No Posix equivalent for vertical space
14329 my $Space = $perl->add_match_table('XPosixSpace',
14330 Description => '\s including beyond ASCII and vertical tab',
14331 Initialize => $Blank + $VertSpace,
14333 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
14334 $Space->add_alias('SpacePerl');
14335 $Space->add_alias('Space') if $v_version lt v4.1.0;
14337 my $Posix_space = $perl->add_match_table("PosixSpace",
14338 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
14339 Initialize => $Space & $ASCII,
14341 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14343 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14344 Description => 'Control characters');
14345 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14346 $perl->add_match_table("PosixCntrl",
14347 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
14348 Initialize => $Cntrl & $ASCII,
14351 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14352 my $Cs = $gc->table('Cs');
14353 if (defined $Cs && ! $Cs->is_empty) {
14354 $perl_surrogate += $Cs;
14357 push @tables_that_may_be_empty, '_Perl_Surrogate';
14360 # $controls is a temporary used to construct Graph.
14361 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14362 + $gc->table('Control')
14363 + $perl_surrogate);
14365 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
14366 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14367 Description => 'Characters that are graphical',
14368 Initialize => ~ ($Space + $controls),
14370 $perl->add_match_table("PosixGraph",
14372 '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
14373 Initialize => $Graph & $ASCII,
14376 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14377 Description => 'Characters that are graphical plus space characters (but no controls)',
14378 Initialize => $Blank + $Graph - $gc->table('Control'),
14380 $perl->add_match_table("PosixPrint",
14382 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
14383 Initialize => $print & $ASCII,
14386 my $Punct = $perl->add_match_table('Punct');
14387 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14389 # \p{punct} doesn't include the symbols, which posix does
14390 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14391 Description => '\p{Punct} + ASCII-range \p{Symbol}',
14392 Initialize => $gc->table('Punctuation')
14393 + ($ASCII & $gc->table('Symbol')),
14394 Perl_Extension => 1
14396 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14397 Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
14398 Initialize => $ASCII & $XPosixPunct,
14401 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14402 Description => '[0-9] + all other decimal digits');
14403 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14404 my $PosixDigit = $perl->add_match_table("PosixDigit",
14405 Description => '[0-9]',
14406 Initialize => $Digit & $ASCII,
14409 # Hex_Digit was not present in first release
14410 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14411 my $Hex = property_ref('Hex_Digit');
14412 if (defined $Hex && ! $Hex->is_empty) {
14413 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14416 $Xdigit->initialize([ ord('0') .. ord('9'),
14417 ord('A') .. ord('F'),
14418 ord('a') .. ord('f'),
14419 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14420 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
14423 # AHex was not present in early releases
14424 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14425 my $AHex = property_ref('ASCII_Hex_Digit');
14426 if (defined $AHex && ! $AHex->is_empty) {
14427 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14430 $PosixXDigit->initialize($Xdigit & $ASCII);
14431 $PosixXDigit->add_alias('AHex');
14432 $PosixXDigit->add_alias('Ascii_Hex_Digit');
14434 $PosixXDigit->add_description('[0-9A-Fa-f]');
14436 my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14437 Description => "Code points that particpate in some fold",
14439 my $loc_problem_folds = $perl->add_match_table(
14440 "_Perl_Problematic_Locale_Folds",
14442 "Code points that are in some way problematic under locale",
14445 # This allows regexec.c to skip some work when appropriate. Some of the
14446 # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14447 my $loc_problem_folds_start = $perl->add_match_table(
14448 "_Perl_Problematic_Locale_Foldeds_Start",
14450 "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14453 my $cf = property_ref('Case_Folding');
14455 # Every character 0-255 is problematic because what each folds to depends
14456 # on the current locale
14457 $loc_problem_folds->add_range(0, 255);
14458 $loc_problem_folds_start += $loc_problem_folds;
14460 # Also problematic are anything these fold to outside the range. Likely
14461 # forever the only thing folded to by these outside the 0-255 range is the
14462 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14463 # completely general, which should catch any unexpected changes or errors.
14464 # We look at each code point 0-255, and add its fold (including each part
14465 # of a multi-char fold) to the list. See commit message
14466 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14468 foreach my $range ($loc_problem_folds->ranges) {
14469 foreach my $code_point ($range->start .. $range->end) {
14470 my $fold_range = $cf->containing_range($code_point);
14471 next unless defined $fold_range;
14473 # Skip if folds to itself
14474 next if $fold_range->value eq $CODE_POINT;
14476 my @hex_folds = split " ", $fold_range->value;
14477 my $start_cp = $hex_folds[0];
14478 next if $start_cp eq $CODE_POINT;
14479 $start_cp = hex $start_cp;
14480 foreach my $i (0 .. @hex_folds - 1) {
14481 my $cp = $hex_folds[$i];
14482 next if $cp eq $CODE_POINT;
14484 next unless $cp > 255; # Already have the < 256 ones
14486 $loc_problem_folds->add_range($cp, $cp);
14487 $loc_problem_folds_start->add_range($start_cp, $start_cp);
14492 my $folds_to_multi_char = $perl->add_match_table(
14493 "_Perl_Folds_To_Multi_Char",
14495 "Code points whose fold is a string of more than one character",
14497 if ($v_version lt v3.0.1) {
14498 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
14501 # Look through all the known folds to populate these tables.
14502 foreach my $range ($cf->ranges) {
14503 next if $range->value eq $CODE_POINT;
14504 my $start = $range->start;
14505 my $end = $range->end;
14506 $any_folds->add_range($start, $end);
14508 my @hex_folds = split " ", $range->value;
14509 if (@hex_folds > 1) { # Is multi-char fold
14510 $folds_to_multi_char->add_range($start, $end);
14513 my $found_locale_problematic = 0;
14515 # Look at each of the folded-to characters...
14516 foreach my $i (0 .. @hex_folds - 1) {
14517 my $cp = hex $hex_folds[$i];
14518 $any_folds->add_range($cp, $cp);
14520 # The fold is problematic if any of the folded-to characters is
14521 # already considered problematic.
14522 if ($loc_problem_folds->contains($cp)) {
14523 $loc_problem_folds->add_range($start, $end);
14524 $found_locale_problematic = 1;
14528 # If this is a problematic fold, add to the start chars the
14529 # folding-from characters and first folded-to character.
14530 if ($found_locale_problematic) {
14531 $loc_problem_folds_start->add_range($start, $end);
14532 my $cp = hex $hex_folds[0];
14533 $loc_problem_folds_start->add_range($cp, $cp);
14537 my $dt = property_ref('Decomposition_Type');
14538 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14539 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14540 Perl_Extension => 1,
14541 Note => 'Union of all non-canonical decompositions',
14544 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
14545 # than SD appeared, construct it ourselves, based on the first release SD
14546 # was in. A pod entry is grandfathered in for it
14547 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
14548 Perl_Extension => 1,
14549 Fate => $INTERNAL_ONLY,
14550 Status => $DISCOURAGED);
14551 my $soft_dotted = property_ref('Soft_Dotted');
14552 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
14553 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
14557 # This list came from 3.2 Soft_Dotted; all of these code points are in
14559 $CanonDCIJ->initialize([ ord('i'),
14568 $CanonDCIJ = $CanonDCIJ & $Assigned;
14571 # For backward compatibility, Perl has its own definition for IDStart.
14572 # It is regular XID_Start plus the underscore, but all characters must be
14573 # Word characters as well
14574 my $XID_Start = property_ref('XID_Start');
14575 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
14576 Perl_Extension => 1,
14577 Fate => $INTERNAL_ONLY,
14578 Initialize => ord('_')
14580 if (defined $XID_Start
14581 || defined ($XID_Start = property_ref('ID_Start')))
14583 $perl_xids += $XID_Start->table('Y');
14586 # For Unicode versions that don't have the property, construct our own
14587 # from first principles. The actual definition is:
14589 # + letter numbers (Nl)
14591 # - Pattern_White_Space
14592 # + stability extensions
14593 # - NKFC modifications
14595 # What we do in the code below is to include the identical code points
14596 # that are in the first release that had Unicode's version of this
14597 # property, essentially extrapolating backwards. There were no
14598 # stability extensions until v4.1, so none are included; likewise in
14599 # no Unicode version so far do subtracting PatSyn and PatWS make any
14600 # difference, so those also are ignored.
14601 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14603 # We do subtract the NFKC modifications that are in the first version
14604 # that had this property. We don't bother to test if they are in the
14605 # version in question, because if they aren't, the operation is a
14606 # no-op. The NKFC modifications are discussed in
14607 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14608 foreach my $range ( 0x037A,
14611 [ 0xFC5E, 0xFC63 ],
14612 [ 0xFDFA, 0xFE70 ],
14613 [ 0xFE72, 0xFE76 ],
14618 [ 0xFF9E, 0xFF9F ],
14621 $perl_xids->delete_range($range->[0], $range->[1]);
14624 $perl_xids->delete_range($range, $range);
14629 $perl_xids &= $Word;
14631 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14632 Perl_Extension => 1,
14633 Fate => $INTERNAL_ONLY);
14634 my $XIDC = property_ref('XID_Continue');
14636 || defined ($XIDC = property_ref('ID_Continue')))
14638 $perl_xidc += $XIDC->table('Y');
14641 # Similarly, we construct our own XIDC if necessary for early Unicode
14642 # versions. The definition is:
14643 # everything in XIDS
14649 # - Pattern_White_Space
14650 # + stability extensions
14651 # - NFKC modifications
14653 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14654 # and stability extensions. There is a somewhat different set of NFKC
14655 # mods to remove (and add in this case). The ones below make this
14656 # have identical code points as in the first release that defined it.
14657 $perl_xidc += $perl_xids
14662 + utf8::unicode_to_native(0xB7)
14664 if (defined (my $pc = $gc->table('Pc'))) {
14667 else { # 1.1.5 didn't have Pc, but these should have been in it
14668 $perl_xidc += 0xFF3F;
14669 $perl_xidc->add_range(0x203F, 0x2040);
14670 $perl_xidc->add_range(0xFE33, 0xFE34);
14671 $perl_xidc->add_range(0xFE4D, 0xFE4F);
14674 # Subtract the NFKC mods
14675 foreach my $range ( 0x037A,
14676 [ 0xFC5E, 0xFC63 ],
14677 [ 0xFDFA, 0xFE1F ],
14679 [ 0xFE72, 0xFE76 ],
14686 $perl_xidc->delete_range($range->[0], $range->[1]);
14689 $perl_xidc->delete_range($range, $range);
14694 $perl_xidc &= $Word;
14696 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14697 Perl_Extension => 1,
14698 Fate => $INTERNAL_ONLY,
14699 Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14702 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14703 Perl_Extension => 1,
14704 Fate => $INTERNAL_ONLY,
14705 Initialize => $perl_xidc
14710 + utf8::unicode_to_native(0xA0) # NBSP
14713 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
14715 if (@named_sequences) {
14716 push @composition, 'Named_Sequence';
14717 foreach my $sequence (@named_sequences) {
14718 $perl_charname->add_anomalous_entry($sequence);
14722 my $alias_sentence = "";
14724 my $alias = property_ref('_Perl_Name_Alias');
14725 $perl_charname->set_proxy_for('_Perl_Name_Alias');
14727 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go
14728 # with respect to any existing entry depends on the entry type.
14729 # Corrections go before said entry, as they should be returned in
14730 # preference over the existing entry. (A correction to a correction
14731 # should be later in the _Perl_Name_Alias table, so it will correctly
14732 # precede the erroneous correction in Perl_Charnames.)
14734 # Abbreviations go after everything else, so they are saved temporarily in
14735 # a hash for later.
14737 # Everything else is added added afterwards, which preserves the input
14740 foreach my $range ($alias->ranges) {
14741 next if $range->value eq "";
14742 my $code_point = $range->start;
14743 if ($code_point != $range->end) {
14744 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
14746 my ($value, $type) = split ': ', $range->value;
14748 if ($type eq 'correction') {
14749 $replace_type = $MULTIPLE_BEFORE;
14751 elsif ($type eq 'abbreviation') {
14754 $abbreviations{$value} = $code_point;
14758 $replace_type = $MULTIPLE_AFTER;
14761 # Actually add; before or after current entry(ies) as determined
14764 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14766 $alias_sentence = <<END;
14767 The _Perl_Name_Alias property adds duplicate code point entries that are
14768 alternatives to the original name. If an addition is a corrected
14769 name, it will be physically first in the table. The original (less correct,
14770 but still valid) name will be next; then any alternatives, in no particular
14771 order; and finally any abbreviations, again in no particular order.
14774 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
14775 # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
14776 # so should be first in the file; the other names have precedence starting
14778 my $before_or_after = ($v_version lt v6.1.0)
14782 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14783 my $code_point = $range->start;
14784 my $unicode_1_value = $range->value;
14785 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
14787 if ($code_point != $range->end) {
14788 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
14791 # To handle EBCDIC, we don't hard code in the code points of the
14792 # controls; instead realizing that all of them are below 256.
14793 last if $code_point > 255;
14795 # We only add in the controls.
14796 next if $gc->value_of($code_point) ne 'Cc';
14798 # We reject this Unicode1 name for later Perls, as it is used for
14799 # another code point
14800 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14802 # This won't add an exact duplicate.
14803 $perl_charname->add_duplicate($code_point, $unicode_1_value,
14804 Replace => $before_or_after);
14807 # Now that have everything added, add in abbreviations after
14808 # everything else. Sort so results don't change between runs of this
14810 foreach my $value (sort keys %abbreviations) {
14811 $perl_charname->add_duplicate($abbreviations{$value}, $value,
14812 Replace => $MULTIPLE_AFTER);
14816 if (@composition <= 2) { # Always at least 2
14817 $comment = join " and ", @composition;
14820 $comment = join ", ", @composition[0 .. scalar @composition - 2];
14821 $comment .= ", and $composition[-1]";
14824 $perl_charname->add_comment(join_lines( <<END
14825 This file is for charnames.pm. It is the union of the $comment properties.
14826 Unicode_1_Name entries are used only for nameless code points in the Name
14829 This file doesn't include the algorithmically determinable names. For those,
14830 use 'unicore/Name.pm'
14833 property_ref('Name')->add_comment(join_lines( <<END
14834 This file doesn't include the algorithmically determinable names. For those,
14835 use 'unicore/Name.pm'
14839 # Construct the Present_In property from the Age property.
14840 if (-e 'DAge.txt' && defined $age) {
14841 my $default_map = $age->default_map;
14842 my $in = Property->new('In',
14843 Default_Map => $default_map,
14844 Full_Name => "Present_In",
14845 Perl_Extension => 1,
14847 Initialize => $age,
14849 $in->add_comment(join_lines(<<END
14850 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
14851 same as for $age, and not for what $in really means. This is because anything
14852 defined in a given release should have multiple values: that release and all
14853 higher ones. But only one value per code point can be represented in a table
14858 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
14859 # lowest numbered (earliest) come first, with the non-numeric one
14861 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
14863 : ($b->name !~ /^[\d.]*$/)
14865 : $a->name <=> $b->name
14868 # The Present_In property is the cumulative age properties. The first
14869 # one hence is identical to the first age one.
14870 my $previous_in = $in->add_match_table($first_age->name);
14871 $previous_in->set_equivalent_to($first_age, Related => 1);
14873 my $description_start = "Code point's usage introduced in version ";
14874 $first_age->add_description($description_start . $first_age->name);
14876 # To construct the accumulated values, for each of the age tables
14877 # starting with the 2nd earliest, merge the earliest with it, to get
14878 # all those code points existing in the 2nd earliest. Repeat merging
14879 # the new 2nd earliest with the 3rd earliest to get all those existing
14880 # in the 3rd earliest, and so on.
14881 foreach my $current_age (@rest_ages) {
14882 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
14884 my $current_in = $in->add_match_table(
14885 $current_age->name,
14886 Initialize => $current_age + $previous_in,
14887 Description => $description_start
14888 . $current_age->name
14891 $previous_in = $current_in;
14893 # Add clarifying material for the corresponding age file. This is
14894 # in part because of the confusing and contradictory information
14895 # given in the Standard's documentation itself, as of 5.2.
14896 $current_age->add_description(
14897 "Code point's usage was introduced in version "
14898 . $current_age->name);
14899 $current_age->add_note("See also $in");
14903 # And finally the code points whose usages have yet to be decided are
14904 # the same in both properties. Note that permanently unassigned code
14905 # points actually have their usage assigned (as being permanently
14906 # unassigned), so that these tables are not the same as gc=cn.
14907 my $unassigned = $in->add_match_table($default_map);
14908 my $age_default = $age->table($default_map);
14909 $age_default->add_description(<<END
14910 Code point's usage has not been assigned in any Unicode release thus far.
14913 $unassigned->set_equivalent_to($age_default, Related => 1);
14916 my $patws = $perl->add_match_table('_Perl_PatWS',
14917 Perl_Extension => 1,
14918 Fate => $INTERNAL_ONLY);
14919 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
14920 $patws->initialize($off_patws->table('Y'));
14923 $patws->initialize([ ord("\t"),
14925 utf8::unicode_to_native(0x0B), # VT
14929 utf8::unicode_to_native(0x85), # NEL
14930 0x200E..0x200F, # Left, Right marks
14931 0x2028..0x2029 # Line, Paragraph seps
14935 # See L<perlfunc/quotemeta>
14936 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
14937 Perl_Extension => 1,
14938 Fate => $INTERNAL_ONLY,
14940 # Initialize to what's common in
14941 # all Unicode releases.
14943 $gc->table('Control')
14946 + ((~ $Word) & $ASCII)
14949 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
14950 $quotemeta += $patsyn->table('Y');
14953 $quotemeta += ((~ $Word) & Range->new(0, 255))
14954 - utf8::unicode_to_native(0xA8)
14955 - utf8::unicode_to_native(0xAF)
14956 - utf8::unicode_to_native(0xB2)
14957 - utf8::unicode_to_native(0xB3)
14958 - utf8::unicode_to_native(0xB4)
14959 - utf8::unicode_to_native(0xB7)
14960 - utf8::unicode_to_native(0xB8)
14961 - utf8::unicode_to_native(0xB9)
14962 - utf8::unicode_to_native(0xBC)
14963 - utf8::unicode_to_native(0xBD)
14964 - utf8::unicode_to_native(0xBE);
14965 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
14966 # same in all releases
14983 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
14984 $quotemeta += $di->table('Y')
14987 if ($v_version ge v2.0) {
14988 $quotemeta += $gc->table('Cf')
14989 + $gc->table('Cs');
14991 # These are above the Unicode version 1 max
14992 $quotemeta->add_range(0xE0000, 0xE0FFF);
14994 $quotemeta += $gc->table('Cc')
14996 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15000 0xE0000 .. 0xE0FFF,
15002 $quotemeta += $temp;
15009 # Finished creating all the perl properties. All non-internal non-string
15010 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
15011 # an underscore.) These do not get a separate entry in the pod file
15012 foreach my $table ($perl->tables) {
15013 foreach my $alias ($table->aliases) {
15014 next if $alias->name =~ /^_/;
15015 $table->add_alias('Is_' . $alias->name,
15018 Status => $alias->status,
15019 OK_as_Filename => 0);
15023 # Here done with all the basic stuff. Ready to populate the information
15024 # about each character if annotating them.
15027 # See comments at its declaration
15028 $annotate_ranges = Range_Map->new;
15030 # This separates out the non-characters from the other unassigneds, so
15031 # can give different annotations for each.
15032 $unassigned_sans_noncharacters = Range_List->new(
15033 Initialize => $gc->table('Unassigned'));
15034 $unassigned_sans_noncharacters &= (~ $NChar);
15036 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15037 $i = populate_char_info($i); # Note sets $i so may cause skips
15045 sub add_perl_synonyms() {
15046 # A number of Unicode tables have Perl synonyms that are expressed in
15047 # the single-form, \p{name}. These are:
15048 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15049 # \p{Is_Name} as synonyms
15050 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
15051 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15052 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15053 # conflict, \p{Value} and \p{Is_Value} as well
15055 # This routine generates these synonyms, warning of any unexpected
15058 # Construct the list of tables to get synonyms for. Start with all the
15059 # binary and the General_Category ones.
15060 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15062 push @tables, $gc->tables;
15064 # If the version of Unicode includes the Script property, add its tables
15065 push @tables, $script->tables if defined $script;
15067 # The Block tables are kept separate because they are treated differently.
15068 # And the earliest versions of Unicode didn't include them, so add only if
15071 push @blocks, $block->tables if defined $block;
15073 # Here, have the lists of tables constructed. Process blocks last so that
15074 # if there are name collisions with them, blocks have lowest priority.
15075 # Should there ever be other collisions, manual intervention would be
15076 # required. See the comments at the beginning of the program for a
15077 # possible way to handle those semi-automatically.
15078 foreach my $table (@tables, @blocks) {
15080 # For non-binary properties, the synonym is just the name of the
15081 # table, like Greek, but for binary properties the synonym is the name
15082 # of the property, and means the code points in its 'Y' table.
15083 my $nominal = $table;
15084 my $nominal_property = $nominal->property;
15086 if (! $nominal->isa('Property')) {
15091 # Here is a binary property. Use the 'Y' table. Verify that is
15093 my $yes = $nominal->table('Y');
15094 unless (defined $yes) { # Must be defined, but is permissible to
15096 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
15102 foreach my $alias ($nominal->aliases) {
15104 # Attempt to create a table in the perl directory for the
15105 # candidate table, using whatever aliases in it that don't
15106 # conflict. Also add non-conflicting aliases for all these
15107 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15109 foreach my $prefix ("", 'Is_', 'In_') {
15111 # Only Block properties can have added 'In_' aliases.
15112 next if $prefix eq 'In_' and $nominal_property != $block;
15114 my $proposed_name = $prefix . $alias->name;
15116 # No Is_Is, In_In, nor combinations thereof
15117 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15118 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15120 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15122 # Get a reference to any existing table in the perl
15123 # directory with the desired name.
15124 my $pre_existing = $perl->table($proposed_name);
15126 if (! defined $pre_existing) {
15128 # No name collision, so ok to add the perl synonym.
15130 my $make_re_pod_entry;
15131 my $ok_as_filename;
15132 my $status = $alias->status;
15133 if ($nominal_property == $block) {
15135 # For block properties, the 'In' form is preferred for
15136 # external use; the pod file contains wild cards for
15137 # this and the 'Is' form so no entries for those; and
15138 # we don't want people using the name without the
15139 # 'In', so discourage that.
15140 if ($prefix eq "") {
15141 $make_re_pod_entry = 1;
15142 $status = $status || $DISCOURAGED;
15143 $ok_as_filename = 0;
15145 elsif ($prefix eq 'In_') {
15146 $make_re_pod_entry = 0;
15147 $status = $status || $NORMAL;
15148 $ok_as_filename = 1;
15151 $make_re_pod_entry = 0;
15152 $status = $status || $DISCOURAGED;
15153 $ok_as_filename = 0;
15156 elsif ($prefix ne "") {
15158 # The 'Is' prefix is handled in the pod by a wild
15159 # card, and we won't use it for an external name
15160 $make_re_pod_entry = 0;
15161 $status = $status || $NORMAL;
15162 $ok_as_filename = 0;
15166 # Here, is an empty prefix, non block. This gets its
15167 # own pod entry and can be used for an external name.
15168 $make_re_pod_entry = 1;
15169 $status = $status || $NORMAL;
15170 $ok_as_filename = 1;
15173 # Here, there isn't a perl pre-existing table with the
15174 # name. Look through the list of equivalents of this
15175 # table to see if one is a perl table.
15176 foreach my $equivalent ($actual->leader->equivalents) {
15177 next if $equivalent->property != $perl;
15179 # Here, have found a table for $perl. Add this alias
15180 # to it, and are done with this prefix.
15181 $equivalent->add_alias($proposed_name,
15182 Re_Pod_Entry => $make_re_pod_entry,
15184 # Currently don't output these in the
15185 # ucd pod, as are strongly discouraged
15190 OK_as_Filename => $ok_as_filename);
15191 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15195 # Here, $perl doesn't already have a table that is a
15196 # synonym for this property, add one.
15197 my $added_table = $perl->add_match_table($proposed_name,
15198 Re_Pod_Entry => $make_re_pod_entry,
15200 # See UCD comment just above
15204 OK_as_Filename => $ok_as_filename);
15205 # And it will be related to the actual table, since it is
15207 $added_table->set_equivalent_to($actual, Related => 1);
15208 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15210 } # End of no pre-existing.
15212 # Here, there is a pre-existing table that has the proposed
15213 # name. We could be in trouble, but not if this is just a
15214 # synonym for another table that we have already made a child
15215 # of the pre-existing one.
15216 if ($pre_existing->is_set_equivalent_to($actual)) {
15217 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15218 $pre_existing->add_alias($proposed_name);
15222 # Here, there is a name collision, but it still could be ok if
15223 # the tables match the identical set of code points, in which
15224 # case, we can combine the names. Compare each table's code
15225 # point list to see if they are identical.
15226 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15227 if ($pre_existing->matches_identically_to($actual)) {
15229 # Here, they do match identically. Not a real conflict.
15230 # Make the perl version a child of the Unicode one, except
15231 # in the non-obvious case of where the perl name is
15232 # already a synonym of another Unicode property. (This is
15233 # excluded by the test for it being its own parent.) The
15234 # reason for this exclusion is that then the two Unicode
15235 # properties become related; and we don't really know if
15236 # they are or not. We generate documentation based on
15237 # relatedness, and this would be misleading. Code
15238 # later executed in the process will cause the tables to
15239 # be represented by a single file anyway, without making
15240 # it look in the pod like they are necessarily related.
15241 if ($pre_existing->parent == $pre_existing
15242 && ($pre_existing->property == $perl
15243 || $actual->property == $perl))
15245 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15246 $pre_existing->set_equivalent_to($actual, Related => 1);
15248 elsif (main::DEBUG && $to_trace) {
15249 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15250 trace $pre_existing->parent;
15255 # Here they didn't match identically, there is a real conflict
15256 # between our new name and a pre-existing property.
15257 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15258 $pre_existing->add_conflicting($nominal->full_name,
15262 # Don't output a warning for aliases for the block
15263 # properties (unless they start with 'In_') as it is
15264 # expected that there will be conflicts and the block
15266 if ($verbosity >= $NORMAL_VERBOSITY
15267 && ($actual->property != $block || $prefix eq 'In_'))
15269 print simple_fold(join_lines(<<END
15270 There is already an alias named $proposed_name (from $pre_existing),
15271 so not creating this alias for $actual
15276 # Keep track for documentation purposes.
15277 $has_In_conflicts++ if $prefix eq 'In_';
15278 $has_Is_conflicts++ if $prefix eq 'Is_';
15283 # There are some properties which have No and Yes (and N and Y) as
15284 # property values, but aren't binary, and could possibly be confused with
15285 # binary ones. So create caveats for them. There are tables that are
15286 # named 'No', and tables that are named 'N', but confusion is not likely
15287 # unless they are the same table. For example, N meaning Number or
15288 # Neutral is not likely to cause confusion, so don't add caveats to things
15290 foreach my $property (grep { $_->type != $BINARY
15291 && $_->type != $FORCED_BINARY }
15294 my $yes = $property->table('Yes');
15295 if (defined $yes) {
15296 my $y = $property->table('Y');
15297 if (defined $y && $yes == $y) {
15298 foreach my $alias ($property->aliases) {
15299 $yes->add_conflicting($alias->name);
15303 my $no = $property->table('No');
15305 my $n = $property->table('N');
15306 if (defined $n && $no == $n) {
15307 foreach my $alias ($property->aliases) {
15308 $no->add_conflicting($alias->name, 'P');
15317 sub register_file_for_name($$$) {
15318 # Given info about a table and a datafile that it should be associated
15319 # with, register that association
15322 my $directory_ref = shift; # Array of the directory path for the file
15323 my $file = shift; # The file name in the final directory.
15324 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15326 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15328 if ($table->isa('Property')) {
15329 $table->set_file_path(@$directory_ref, $file);
15330 push @map_properties, $table;
15332 # No swash means don't do the rest of this.
15333 return if $table->fate != $ORDINARY
15334 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15336 # Get the path to the file
15337 my @path = $table->file_path;
15339 # Use just the file name if no subdirectory.
15340 shift @path if $path[0] eq File::Spec->curdir();
15342 my $file = join '/', @path;
15344 # Create a hash entry for utf8_heavy to get the file that stores this
15345 # property's map table
15346 foreach my $alias ($table->aliases) {
15347 my $name = $alias->name;
15348 if ($name =~ /^_/) {
15349 $strict_property_to_file_of{lc $name} = $file;
15352 $loose_property_to_file_of{standardize($name)} = $file;
15356 # And a way for utf8_heavy to find the proper key in the SwashInfo
15357 # hash for this property.
15358 $file_to_swash_name{$file} = "To" . $table->swash_name;
15362 # Do all of the work for all equivalent tables when called with the leader
15363 # table, so skip if isn't the leader.
15364 return if $table->leader != $table;
15366 # If this is a complement of another file, use that other file instead,
15367 # with a ! prepended to it.
15369 if (($complement = $table->complement) != 0) {
15370 my @directories = $complement->file_path;
15372 # This assumes that the 0th element is something like 'lib',
15373 # the 1th element the property name (in its own directory), like
15374 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15375 # appended to it later.
15376 $directories[1] =~ s/^/!/;
15377 $file = pop @directories;
15378 $directory_ref =\@directories;
15381 # Join all the file path components together, using slashes.
15382 my $full_filename = join('/', @$directory_ref, $file);
15384 # All go in the same subdirectory of unicore, or the special
15385 # pseudo-directory '#'
15386 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15387 Carp::my_carp("Unexpected directory in "
15388 . join('/', @{$directory_ref}, $file));
15391 # For this table and all its equivalents ...
15392 foreach my $table ($table, $table->equivalents) {
15394 # Associate it with its file internally. Don't include the
15395 # $matches_directory first component
15396 $table->set_file_path(@$directory_ref, $file);
15398 # No swash means don't do the rest of this.
15399 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15401 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15403 my $property = $table->property;
15404 my $property_name = ($property == $perl)
15405 ? "" # 'perl' is never explicitly stated
15406 : standardize($property->name) . '=';
15408 my $is_default = 0; # Is this table the default one for the property?
15410 # To calculate $is_default, we find if this table is the same as the
15411 # default one for the property. But this is complicated by the
15412 # possibility that there is a master table for this one, and the
15413 # information is stored there instead of here.
15414 my $parent = $table->parent;
15415 my $leader_prop = $parent->property;
15416 my $default_map = $leader_prop->default_map;
15417 if (defined $default_map) {
15418 my $default_table = $leader_prop->table($default_map);
15419 $is_default = 1 if defined $default_table && $parent == $default_table;
15422 # Calculate the loose name for this table. Mostly it's just its name,
15423 # standardized. But in the case of Perl tables that are single-form
15424 # equivalents to Unicode properties, it is the latter's name.
15425 my $loose_table_name =
15426 ($property != $perl || $leader_prop == $perl)
15427 ? standardize($table->name)
15428 : standardize($parent->name);
15430 my $deprecated = ($table->status eq $DEPRECATED)
15431 ? $table->status_info
15433 my $caseless_equivalent = $table->caseless_equivalent;
15435 # And for each of the table's aliases... This inner loop eventually
15436 # goes through all aliases in the UCD that we generate regex match
15438 foreach my $alias ($table->aliases) {
15439 my $standard = utf8_heavy_name($table, $alias);
15441 # Generate an entry in either the loose or strict hashes, which
15442 # will translate the property and alias names combination into the
15443 # file where the table for them is stored.
15444 if ($alias->loose_match) {
15445 if (exists $loose_to_file_of{$standard}) {
15446 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
15449 $loose_to_file_of{$standard} = $sub_filename;
15453 if (exists $stricter_to_file_of{$standard}) {
15454 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
15457 $stricter_to_file_of{$standard} = $sub_filename;
15459 # Tightly coupled with how utf8_heavy.pl works, for a
15460 # floating point number that is a whole number, get rid of
15461 # the trailing decimal point and 0's, so that utf8_heavy
15462 # will work. Also note that this assumes that such a
15463 # number is matched strictly; so if that were to change,
15464 # this would be wrong.
15465 if ((my $integer_name = $alias->name)
15466 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
15468 $stricter_to_file_of{$property_name . $integer_name}
15474 # For Unicode::UCD, create a mapping of the prop=value to the
15475 # canonical =value for that property.
15476 if ($standard =~ /=/) {
15478 # This could happen if a strict name mapped into an existing
15479 # loose name. In that event, the strict names would have to
15480 # be moved to a new hash.
15481 if (exists($loose_to_standard_value{$standard})) {
15482 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
15484 $loose_to_standard_value{$standard} = $loose_table_name;
15487 # Keep a list of the deprecated properties and their filenames
15488 if ($deprecated && $complement == 0) {
15489 $utf8::why_deprecated{$sub_filename} = $deprecated;
15492 # And a substitute table, if any, for case-insensitive matching
15493 if ($caseless_equivalent != 0) {
15494 $caseless_equivalent_to{$standard} = $caseless_equivalent;
15497 # Add to defaults list if the table this alias belongs to is the
15499 $loose_defaults{$standard} = 1 if $is_default;
15507 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
15509 my %full_dir_name_of; # Full length names of directories used.
15511 sub construct_filename($$$) {
15512 # Return a file name for a table, based on the table name, but perhaps
15513 # changed to get rid of non-portable characters in it, and to make
15514 # sure that it is unique on a file system that allows the names before
15515 # any period to be at most 8 characters (DOS). While we're at it
15516 # check and complain if there are any directory conflicts.
15518 my $name = shift; # The name to start with
15519 my $mutable = shift; # Boolean: can it be changed? If no, but
15520 # yet it must be to work properly, a warning
15522 my $directories_ref = shift; # A reference to an array containing the
15523 # path to the file, with each element one path
15524 # component. This is used because the same
15525 # name can be used in different directories.
15526 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15528 my $warn = ! defined wantarray; # If true, then if the name is
15529 # changed, a warning is issued as well.
15531 if (! defined $name) {
15532 Carp::my_carp("Undefined name in directory "
15533 . File::Spec->join(@$directories_ref)
15538 # Make sure that no directory names conflict with each other. Look at
15539 # each directory in the input file's path. If it is already in use,
15540 # assume it is correct, and is merely being re-used, but if we
15541 # truncate it to 8 characters, and find that there are two directories
15542 # that are the same for the first 8 characters, but differ after that,
15543 # then that is a problem.
15544 foreach my $directory (@$directories_ref) {
15545 my $short_dir = substr($directory, 0, 8);
15546 if (defined $full_dir_name_of{$short_dir}) {
15547 next if $full_dir_name_of{$short_dir} eq $directory;
15548 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
15551 $full_dir_name_of{$short_dir} = $directory;
15555 my $path = join '/', @$directories_ref;
15556 $path .= '/' if $path;
15558 # Remove interior underscores.
15559 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15561 # Convert the dot in floating point numbers to an underscore
15562 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
15566 # Extract any suffix, delete any non-word character, and truncate to 3
15568 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
15571 $suffix =~ s/\W+//g;
15572 substr($suffix, 4) = "" if length($suffix) > 4;
15575 # Change any non-word character outside the suffix into an underscore,
15576 # and truncate to 8.
15577 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
15578 substr($filename, 8) = "" if length($filename) > 8;
15580 # Make sure the basename doesn't conflict with something we
15581 # might have already written. If we have, say,
15588 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
15589 $num++; # so basenames with numbers start with '2', which
15590 # just looks more natural.
15592 # Want to append $num, but if it'll make the basename longer
15593 # than 8 characters, pre-truncate $filename so that the result
15595 my $delta = length($filename) + length($num) - 8;
15597 substr($filename, -$delta) = $num;
15602 if ($warn && ! $warned) {
15604 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
15608 return $filename if $mutable;
15610 # If not changeable, must return the input name, but warn if needed to
15611 # change it beyond shortening it.
15612 if ($name ne $filename
15613 && substr($name, 0, length($filename)) ne $filename) {
15614 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
15620 # The pod file contains a very large table. Many of the lines in that table
15621 # would exceed a typical output window's size, and so need to be wrapped with
15622 # a hanging indent to make them look good. The pod language is really
15623 # insufficient here. There is no general construct to do that in pod, so it
15624 # is done here by beginning each such line with a space to cause the result to
15625 # be output without formatting, and doing all the formatting here. This leads
15626 # to the result that if the eventual display window is too narrow it won't
15627 # look good, and if the window is too wide, no advantage is taken of that
15628 # extra width. A further complication is that the output may be indented by
15629 # the formatter so that there is less space than expected. What I (khw) have
15630 # done is to assume that that indent is a particular number of spaces based on
15631 # what it is in my Linux system; people can always resize their windows if
15632 # necessary, but this is obviously less than desirable, but the best that can
15634 my $automatic_pod_indent = 8;
15636 # Try to format so that uses fewest lines, but few long left column entries
15637 # slide into the right column. An experiment on 5.1 data yielded the
15638 # following percentages that didn't cut into the other side along with the
15639 # associated first-column widths
15641 # 80% not too bad except for a few blocks
15642 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
15644 my $indent_info_column = 27; # 75% of lines didn't have overlap
15646 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
15647 # The 3 is because of:
15648 # 1 for the leading space to tell the pod formatter to
15651 # 1 for the space between the flag and the main data
15653 sub format_pod_line ($$$;$$) {
15654 # Take a pod line and return it, formatted properly
15656 my $first_column_width = shift;
15657 my $entry = shift; # Contents of left column
15658 my $info = shift; # Contents of right column
15660 my $status = shift || ""; # Any flag
15662 my $loose_match = shift; # Boolean.
15663 $loose_match = 1 unless defined $loose_match;
15665 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15668 $flags .= $STRICTER if ! $loose_match;
15670 $flags .= $status if $status;
15672 # There is a blank in the left column to cause the pod formatter to
15673 # output the line as-is.
15674 return sprintf " %-*s%-*s %s\n",
15675 # The first * in the format is replaced by this, the -1 is
15676 # to account for the leading blank. There isn't a
15677 # hard-coded blank after this to separate the flags from
15678 # the rest of the line, so that in the unlikely event that
15679 # multiple flags are shown on the same line, they both
15680 # will get displayed at the expense of that separation,
15681 # but since they are left justified, a blank will be
15682 # inserted in the normal case.
15686 # The other * in the format is replaced by this number to
15687 # cause the first main column to right fill with blanks.
15688 # The -1 is for the guaranteed blank following it.
15689 $first_column_width - $FILLER - 1,
15694 my @zero_match_tables; # List of tables that have no matches in this release
15696 sub make_re_pod_entries($) {
15697 # This generates the entries for the pod file for a given table.
15698 # Also done at this time are any children tables. The output looks like:
15699 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
15701 my $input_table = shift; # Table the entry is for
15702 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15704 # Generate parent and all its children at the same time.
15705 return if $input_table->parent != $input_table;
15707 my $property = $input_table->property;
15708 my $type = $property->type;
15709 my $full_name = $property->full_name;
15711 my $count = $input_table->count;
15713 my $non_unicode_string;
15714 if ($count > $MAX_UNICODE_CODEPOINTS) {
15715 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
15716 - $MAX_UNICODE_CODEPOINT);
15717 $non_unicode_string = " plus all above-Unicode code points";
15720 $unicode_count = $count;
15721 $non_unicode_string = "";
15723 my $string_count = clarify_number($unicode_count) . $non_unicode_string;
15724 my $status = $input_table->status;
15725 my $status_info = $input_table->status_info;
15726 my $caseless_equivalent = $input_table->caseless_equivalent;
15728 # Don't mention a placeholder equivalent as it isn't to be listed in the
15730 $caseless_equivalent = 0 if $caseless_equivalent != 0
15731 && $caseless_equivalent->fate > $ORDINARY;
15733 my $entry_for_first_table; # The entry for the first table output.
15734 # Almost certainly, it is the parent.
15736 # For each related table (including itself), we will generate a pod entry
15737 # for each name each table goes by
15738 foreach my $table ($input_table, $input_table->children) {
15740 # utf8_heavy.pl cannot deal with null string property values, so skip
15741 # any tables that have no non-null names.
15742 next if ! grep { $_->name ne "" } $table->aliases;
15744 # First, gather all the info that applies to this table as a whole.
15746 push @zero_match_tables, $table if $count == 0
15747 # Don't mention special tables
15748 # as being zero length
15749 && $table->fate == $ORDINARY;
15751 my $table_property = $table->property;
15753 # The short name has all the underscores removed, while the full name
15754 # retains them. Later, we decide whether to output a short synonym
15755 # for the full one, we need to compare apples to apples, so we use the
15756 # short name's length including underscores.
15757 my $table_property_short_name_length;
15758 my $table_property_short_name
15759 = $table_property->short_name(\$table_property_short_name_length);
15760 my $table_property_full_name = $table_property->full_name;
15762 # Get how much savings there is in the short name over the full one
15763 # (delta will always be <= 0)
15764 my $table_property_short_delta = $table_property_short_name_length
15765 - length($table_property_full_name);
15766 my @table_description = $table->description;
15767 my @table_note = $table->note;
15769 # Generate an entry for each alias in this table.
15770 my $entry_for_first_alias; # saves the first one encountered.
15771 foreach my $alias ($table->aliases) {
15773 # Skip if not to go in pod.
15774 next unless $alias->make_re_pod_entry;
15776 # Start gathering all the components for the entry
15777 my $name = $alias->name;
15779 # Skip if name is empty, as can't be accessed by regexes.
15780 next if $name eq "";
15782 my $entry; # Holds the left column, may include extras
15783 my $entry_ref; # To refer to the left column's contents from
15784 # another entry; has no extras
15786 # First the left column of the pod entry. Tables for the $perl
15787 # property always use the single form.
15788 if ($table_property == $perl) {
15789 $entry = "\\p{$name}";
15790 $entry .= " \\p$name" if length $name == 1; # Show non-braced
15792 $entry_ref = "\\p{$name}";
15794 else { # Compound form.
15796 # Only generate one entry for all the aliases that mean true
15797 # or false in binary properties. Append a '*' to indicate
15798 # some are missing. (The heading comment notes this.)
15800 if ($type == $BINARY) {
15801 next if $name ne 'N' && $name ne 'Y';
15804 elsif ($type != $FORCED_BINARY) {
15809 # Forced binary properties require special handling. It
15810 # has two sets of tables, one set is true/false; and the
15811 # other set is everything else. Entries are generated for
15812 # each set. Use the Bidi_Mirrored property (which appears
15813 # in all Unicode versions) to get a list of the aliases
15814 # for the true/false tables. Of these, only output the N
15815 # and Y ones, the same as, a regular binary property. And
15816 # output all the rest, same as a non-binary property.
15817 my $bm = property_ref("Bidi_Mirrored");
15818 if ($name eq 'N' || $name eq 'Y') {
15820 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
15821 $bm->table("N")->aliases)
15830 # Colon-space is used to give a little more space to be easier
15833 . $table_property_full_name
15836 # But for the reference to this entry, which will go in the
15837 # right column, where space is at a premium, use equals
15839 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
15842 # Then the right (info) column. This is stored as components of
15843 # an array for the moment, then joined into a string later. For
15844 # non-internal only properties, begin the info with the entry for
15845 # the first table we encountered (if any), as things are ordered
15846 # so that that one is the most descriptive. This leads to the
15847 # info column of an entry being a more descriptive version of the
15850 if ($name =~ /^_/) {
15852 '(For internal use by Perl, not necessarily stable)';
15854 elsif ($entry_for_first_alias) {
15855 push @info, $entry_for_first_alias;
15858 # If this entry is equivalent to another, add that to the info,
15859 # using the first such table we encountered
15860 if ($entry_for_first_table) {
15862 push @info, "(= $entry_for_first_table)";
15865 push @info, $entry_for_first_table;
15869 # If the name is a large integer, add an equivalent with an
15870 # exponent for better readability
15871 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
15872 push @info, sprintf "(= %.1e)", $name
15875 my $parenthesized = "";
15876 if (! $entry_for_first_alias) {
15878 # This is the first alias for the current table. The alias
15879 # array is ordered so that this is the fullest, most
15880 # descriptive alias, so it gets the fullest info. The other
15881 # aliases are mostly merely pointers to this one, using the
15882 # information already added above.
15884 # Display any status message, but only on the parent table
15885 if ($status && ! $entry_for_first_table) {
15886 push @info, $status_info;
15889 # Put out any descriptive info
15890 if (@table_description || @table_note) {
15891 push @info, join "; ", @table_description, @table_note;
15894 # Look to see if there is a shorter name we can point people
15896 my $standard_name = standardize($name);
15898 my $proposed_short = $table->short_name;
15899 if (defined $proposed_short) {
15900 my $standard_short = standardize($proposed_short);
15902 # If the short name is shorter than the standard one, or
15903 # even it it's not, but the combination of it and its
15904 # short property name (as in \p{prop=short} ($perl doesn't
15905 # have this form)) saves at least two characters, then,
15906 # cause it to be listed as a shorter synonym.
15907 if (length $standard_short < length $standard_name
15908 || ($table_property != $perl
15909 && (length($standard_short)
15910 - length($standard_name)
15911 + $table_property_short_delta) # (<= 0)
15914 $short_name = $proposed_short;
15915 if ($table_property != $perl) {
15916 $short_name = $table_property_short_name
15919 $short_name = "\\p{$short_name}";
15923 # And if this is a compound form name, see if there is a
15924 # single form equivalent
15926 if ($table_property != $perl) {
15928 # Special case the binary N tables, so that will print
15929 # \P{single}, but use the Y table values to populate
15930 # 'single', as we haven't likewise populated the N table.
15931 # For forced binary tables, we can't just look at the N
15932 # table, but must see if this table is equivalent to the N
15933 # one, as there are two equivalent beasts in these
15937 if ( ($type == $BINARY
15938 && $input_table == $property->table('No'))
15939 || ($type == $FORCED_BINARY
15940 && $property->table('No')->
15941 is_set_equivalent_to($input_table)))
15943 $test_table = $property->table('Yes');
15947 $test_table = $input_table;
15951 # Look for a single form amongst all the children.
15952 foreach my $table ($test_table->children) {
15953 next if $table->property != $perl;
15954 my $proposed_name = $table->short_name;
15955 next if ! defined $proposed_name;
15957 # Don't mention internal-only properties as a possible
15958 # single form synonym
15959 next if substr($proposed_name, 0, 1) eq '_';
15961 $proposed_name = "\\$p\{$proposed_name}";
15962 if (! defined $single_form
15963 || length($proposed_name) < length $single_form)
15965 $single_form = $proposed_name;
15967 # The goal here is to find a single form; not the
15968 # shortest possible one. We've already found a
15969 # short name. So, stop at the first single form
15970 # found, which is likely to be closer to the
15977 # Output both short and single in the same parenthesized
15978 # expression, but with only one of 'Single', 'Short' if there
15980 if ($short_name || $single_form || $table->conflicting) {
15981 $parenthesized .= "Short: $short_name" if $short_name;
15982 if ($short_name && $single_form) {
15983 $parenthesized .= ', ';
15985 elsif ($single_form) {
15986 $parenthesized .= 'Single: ';
15988 $parenthesized .= $single_form if $single_form;
15992 if ($caseless_equivalent != 0) {
15993 $parenthesized .= '; ' if $parenthesized ne "";
15994 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
15998 # Warn if this property isn't the same as one that a
15999 # semi-casual user might expect. The other components of this
16000 # parenthesized structure are calculated only for the first entry
16001 # for this table, but the conflicting is deemed important enough
16002 # to go on every entry.
16003 my $conflicting = join " NOR ", $table->conflicting;
16004 if ($conflicting) {
16005 $parenthesized .= '; ' if $parenthesized ne "";
16006 $parenthesized .= "NOT $conflicting";
16009 push @info, "($parenthesized)" if $parenthesized;
16011 if ($name =~ /_$/ && $alias->loose_match) {
16012 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16015 if ($table_property != $perl && $table->perl_extension) {
16016 push @info, '(Perl extension)';
16018 push @info, "($string_count)";
16020 # Now, we have both the entry and info so add them to the
16021 # list of all the properties.
16022 push @match_properties,
16023 format_pod_line($indent_info_column,
16027 $alias->loose_match);
16029 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16030 } # End of looping through the aliases for this table.
16032 if (! $entry_for_first_table) {
16033 $entry_for_first_table = $entry_for_first_alias;
16035 } # End of looping through all the related tables
16039 sub make_ucd_table_pod_entries {
16042 # Generate the entries for the UCD section of the pod for $table. This
16043 # also calculates if names are ambiguous, so has to be called even if the
16044 # pod is not being output
16046 my $short_name = $table->name;
16047 my $standard_short_name = standardize($short_name);
16048 my $full_name = $table->full_name;
16049 my $standard_full_name = standardize($full_name);
16051 my $full_info = ""; # Text of info column for full-name entries
16052 my $other_info = ""; # Text of info column for short-name entries
16053 my $short_info = ""; # Text of info column for other entries
16054 my $meaning = ""; # Synonym of this table
16056 my $property = ($table->isa('Property'))
16058 : $table->parent->property;
16060 my $perl_extension = $table->perl_extension;
16062 # Get the more official name for for perl extensions that aren't
16063 # stand-alone properties
16064 if ($perl_extension && $property != $table) {
16065 if ($property == $perl ||$property->type == $BINARY) {
16066 $meaning = $table->complete_name;
16069 $meaning = $property->full_name . "=$full_name";
16073 # There are three types of info column. One for the short name, one for
16074 # the full name, and one for everything else. They mostly are the same,
16075 # so initialize in the same loop.
16076 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16077 if ($perl_extension && $property != $table) {
16079 # Add the synonymous name for the non-full name entries; and to
16080 # the full-name entry if it adds extra information
16081 if ($info_ref == \$other_info
16082 || ($info_ref == \$short_info
16083 && $standard_short_name ne $standard_full_name)
16084 || standardize($meaning) ne $standard_full_name
16086 $$info_ref .= "$meaning.";
16089 elsif ($info_ref != \$full_info) {
16091 # Otherwise, the non-full name columns include the full name
16092 $$info_ref .= $full_name;
16095 # And the full-name entry includes the short name, if shorter
16096 if ($info_ref == \$full_info
16097 && length $standard_short_name < length $standard_full_name)
16099 $full_info =~ s/\.\Z//;
16100 $full_info .= " " if $full_info;
16101 $full_info .= "(Short: $short_name)";
16104 if ($table->perl_extension) {
16105 $$info_ref =~ s/\.\Z//;
16106 $$info_ref .= ". " if $$info_ref;
16107 $$info_ref .= "(Perl extension)";
16111 # Add any extra annotations to the full name entry
16112 foreach my $more_info ($table->description,
16114 $table->status_info)
16116 next unless $more_info;
16117 $full_info =~ s/\.\Z//;
16118 $full_info .= ". " if $full_info;
16119 $full_info .= $more_info;
16121 if ($table->property->type == $FORCED_BINARY) {
16123 $full_info =~ s/\.\Z//;
16124 $full_info .= ". ";
16126 $full_info .= "This is a combination property which has both:"
16127 . " 1) a map to various string values; and"
16128 . " 2) a map to boolean Y/N, where 'Y' means the"
16129 . " string value is non-empty. Add the prefix 'is'"
16130 . " to the prop_invmap() call to get the latter";
16133 # These keep track if have created full and short name pod entries for the
16136 my $done_short = 0;
16138 # Every possible name is kept track of, even those that aren't going to be
16139 # output. This way we can be sure to find the ambiguities.
16140 foreach my $alias ($table->aliases) {
16141 my $name = $alias->name;
16142 my $standard = standardize($name);
16144 my $output_this = $alias->ucd;
16146 # If the full and short names are the same, we want to output the full
16147 # one's entry, so it has priority.
16148 if ($standard eq $standard_full_name) {
16149 next if $done_full;
16151 $info = $full_info;
16153 elsif ($standard eq $standard_short_name) {
16154 next if $done_short;
16156 next if $standard_short_name eq $standard_full_name;
16157 $info = $short_info;
16160 $info = $other_info;
16163 $combination_property{$standard} = 1
16164 if $table->property->type == $FORCED_BINARY;
16166 # Here, we have set up the two columns for this entry. But if an
16167 # entry already exists for this name, we have to decide which one
16168 # we're going to later output.
16169 if (exists $ucd_pod{$standard}) {
16171 # If the two entries refer to the same property, it's not going to
16172 # be ambiguous. (Likely it's because the names when standardized
16173 # are the same.) But that means if they are different properties,
16174 # there is ambiguity.
16175 if ($ucd_pod{$standard}->{'property'} != $property) {
16177 # Here, we have an ambiguity. This code assumes that one is
16178 # scheduled to be output and one not and that one is a perl
16179 # extension (which is not to be output) and the other isn't.
16180 # If those assumptions are wrong, things have to be rethought.
16181 if ($ucd_pod{$standard}{'output_this'} == $output_this
16182 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16183 || $output_this == $perl_extension)
16185 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
16188 # We modifiy the info column of the one being output to
16189 # indicate the ambiguity. Set $which to point to that one's
16192 if ($ucd_pod{$standard}{'output_this'}) {
16193 $which = \$ucd_pod{$standard}->{'info'};
16197 $meaning = $ucd_pod{$standard}{'meaning'};
16201 $$which =~ s/\.\Z//;
16202 $$which .= "; NOT '$standard' meaning '$meaning'";
16204 $ambiguous_names{$standard} = 1;
16207 # Use the non-perl-extension variant
16208 next unless $ucd_pod{$standard}{'perl_extension'};
16211 # Store enough information about this entry that we can later look for
16212 # ambiguities, and output it properly.
16213 $ucd_pod{$standard} = { 'name' => $name,
16215 'meaning' => $meaning,
16216 'output_this' => $output_this,
16217 'perl_extension' => $perl_extension,
16218 'property' => $property,
16219 'status' => $alias->status,
16221 } # End of looping through all this table's aliases
16226 sub pod_alphanumeric_sort {
16227 # Sort pod entries alphanumerically.
16229 # The first few character columns are filler, plus the '\p{'; and get rid
16230 # of all the trailing stuff, starting with the trailing '}', so as to sort
16231 # on just 'Name=Value'
16232 (my $a = lc $a) =~ s/^ .*? \{ //x;
16234 (my $b = lc $b) =~ s/^ .*? \{ //x;
16237 # Determine if the two operands are both internal only or both not.
16238 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16239 # should be the underscore that begins internal only
16240 my $a_is_internal = (substr($a, 0, 1) eq '_');
16241 my $b_is_internal = (substr($b, 0, 1) eq '_');
16243 # Sort so the internals come last in the table instead of first (which the
16244 # leading underscore would otherwise indicate).
16245 if ($a_is_internal != $b_is_internal) {
16246 return 1 if $a_is_internal;
16250 # Determine if the two operands are numeric property values or not.
16251 # A numeric property will look like xyz: 3. But the number
16252 # can begin with an optional minus sign, and may have a
16253 # fraction or rational component, like xyz: 3/2. If either
16254 # isn't numeric, use alphabetic sort.
16255 my ($a_initial, $a_number) =
16256 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
16257 return $a cmp $b unless defined $a_number;
16258 my ($b_initial, $b_number) =
16259 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
16260 return $a cmp $b unless defined $b_number;
16262 # Here they are both numeric, but use alphabetic sort if the
16263 # initial parts don't match
16264 return $a cmp $b if $a_initial ne $b_initial;
16266 # Convert rationals to floating for the comparison.
16267 $a_number = eval $a_number if $a_number =~ qr{/};
16268 $b_number = eval $b_number if $b_number =~ qr{/};
16270 return $a_number <=> $b_number;
16274 # Create the .pod file. This generates the various subsections and then
16275 # combines them in one big HERE document.
16277 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16279 return unless defined $pod_directory;
16280 print "Making pod file\n" if $verbosity >= $PROGRESS;
16282 my $exception_message =
16283 '(Any exceptions are individually noted beginning with the word NOT.)';
16285 if (-e 'Blocks.txt') {
16287 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
16288 # if the global $has_In_conflicts indicates we have them.
16289 push @match_properties, format_pod_line($indent_info_column,
16292 . (($has_In_conflicts)
16293 ? " $exception_message"
16295 @block_warning = << "END";
16297 Matches in the Block property have shortcuts that begin with "In_". For
16298 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
16299 backward compatibility, if there is no conflict with another shortcut, these
16300 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
16301 are numerous such conflicting shortcuts. Use of these forms for Block is
16302 discouraged, and are flagged as such, not only because of the potential
16303 confusion as to what is meant, but also because a later release of Unicode may
16304 preempt the shortcut, and your program would no longer be correct. Use the
16305 "In_" form instead to avoid this, or even more clearly, use the compound form,
16306 e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
16310 my $text = $Is_flags_text;
16311 $text = "$exception_message $text" if $has_Is_conflicts;
16313 # And the 'Is_ line';
16314 push @match_properties, format_pod_line($indent_info_column,
16318 # Sort the properties array for output. It is sorted alphabetically
16319 # except numerically for numeric properties, and only output unique lines.
16320 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16322 my $formatted_properties = simple_fold(\@match_properties,
16324 # indent succeeding lines by two extra
16325 # which looks better
16326 $indent_info_column + 2,
16328 # shorten the line length by how much
16329 # the formatter indents, so the folded
16330 # line will fit in the space
16331 # presumably available
16332 $automatic_pod_indent);
16333 # Add column headings, indented to be a little more centered, but not
16335 $formatted_properties = format_pod_line($indent_info_column,
16339 . $formatted_properties;
16341 # Generate pod documentation lines for the tables that match nothing
16342 my $zero_matches = "";
16343 if (@zero_match_tables) {
16344 @zero_match_tables = uniques(@zero_match_tables);
16345 $zero_matches = join "\n\n",
16346 map { $_ = '=item \p{' . $_->complete_name . "}" }
16347 sort { $a->complete_name cmp $b->complete_name }
16348 @zero_match_tables;
16350 $zero_matches = <<END;
16352 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
16354 Unicode has some property-value pairs that currently don't match anything.
16355 This happens generally either because they are obsolete, or they exist for
16356 symmetry with other forms, but no language has yet been encoded that uses
16357 them. In this version of Unicode, the following match zero code points:
16368 # Generate list of properties that we don't accept, grouped by the reasons
16369 # why. This is so only put out the 'why' once, and then list all the
16370 # properties that have that reason under it.
16372 my %why_list; # The keys are the reasons; the values are lists of
16373 # properties that have the key as their reason
16375 # For each property, add it to the list that are suppressed for its reason
16376 # The sort will cause the alphabetically first properties to be added to
16377 # each list first, so each list will be sorted.
16378 foreach my $property (sort keys %why_suppressed) {
16379 next unless $why_suppressed{$property};
16380 push @{$why_list{$why_suppressed{$property}}}, $property;
16383 # For each reason (sorted by the first property that has that reason)...
16384 my @bad_re_properties;
16385 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
16388 # Add to the output, all the properties that have that reason.
16389 my $has_item = 0; # Flag if actually output anything.
16390 foreach my $name (@{$why_list{$why}}) {
16392 # Split compound names into $property and $table components
16393 my $property = $name;
16395 if ($property =~ / (.*) = (.*) /x) {
16400 # This release of Unicode may not have a property that is
16401 # suppressed, so don't reference a non-existent one.
16402 $property = property_ref($property);
16403 next if ! defined $property;
16405 # And since this list is only for match tables, don't list the
16406 # ones that don't have match tables.
16407 next if ! $property->to_create_match_tables;
16409 # Find any abbreviation, and turn it into a compound name if this
16410 # is a property=value pair.
16411 my $short_name = $property->name;
16412 $short_name .= '=' . $property->table($table)->name if $table;
16414 # Start with an empty line.
16415 push @bad_re_properties, "\n\n" unless $has_item;
16417 # And add the property as an item for the reason.
16418 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
16422 # And add the reason under the list of properties, if such a list
16423 # actually got generated. Note that the header got added
16424 # unconditionally before. But pod ignores extra blank lines, so no
16426 push @bad_re_properties, "\n$why\n" if $has_item;
16428 } # End of looping through each reason.
16430 if (! @bad_re_properties) {
16431 push @bad_re_properties,
16432 "*** This installation accepts ALL non-Unihan properties ***";
16435 # Add =over only if non-empty to avoid an empty =over/=back section,
16436 # which is considered bad form.
16437 unshift @bad_re_properties, "\n=over 4\n";
16438 push @bad_re_properties, "\n=back\n";
16441 # Similiarly, generate a list of files that we don't use, grouped by the
16442 # reasons why (Don't output if the reason is empty). First, create a hash
16443 # whose keys are the reasons, and whose values are anonymous arrays of all
16444 # the files that share that reason.
16445 my %grouped_by_reason;
16446 foreach my $file (keys %skipped_files) {
16447 next unless $skipped_files{$file};
16448 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
16451 # Then, sort each group.
16452 foreach my $group (keys %grouped_by_reason) {
16453 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
16454 @{$grouped_by_reason{$group}} ;
16457 # Finally, create the output text. For each reason (sorted by the
16458 # alphabetically first file that has that reason)...
16460 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
16461 cmp lc $grouped_by_reason{$b}->[0]
16463 keys %grouped_by_reason)
16465 # Add all the files that have that reason to the output. Start
16466 # with an empty line.
16467 push @unused_files, "\n\n";
16468 push @unused_files, map { "\n=item F<$_> \n" }
16469 @{$grouped_by_reason{$reason}};
16470 # And add the reason under the list of files
16471 push @unused_files, "\n$reason\n";
16474 # Similarly, create the output text for the UCD section of the pod
16476 foreach my $key (keys %ucd_pod) {
16477 next unless $ucd_pod{$key}->{'output_this'};
16478 push @ucd_pod, format_pod_line($indent_info_column,
16479 $ucd_pod{$key}->{'name'},
16480 $ucd_pod{$key}->{'info'},
16481 $ucd_pod{$key}->{'status'},
16485 # Sort alphabetically, and fold for output
16486 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
16487 my $ucd_pod = simple_fold(\@ucd_pod,
16489 $indent_info_column,
16490 $automatic_pod_indent);
16491 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
16496 # Everything is ready to assemble.
16497 my @OUT = << "END";
16502 To change this file, edit $0 instead.
16508 $pod_file - Index of Unicode Version $unicode_version character properties in Perl
16512 This document provides information about the portion of the Unicode database
16513 that deals with character properties, that is the portion that is defined on
16514 single code points. (L</Other information in the Unicode data base>
16515 below briefly mentions other data that Unicode provides.)
16517 Perl can provide access to all non-provisional Unicode character properties,
16518 though not all are enabled by default. The omitted ones are the Unihan
16519 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
16520 deprecated or Unicode-internal properties. (An installation may choose to
16521 recompile Perl's tables to change this. See L<Unicode character
16522 properties that are NOT accepted by Perl>.)
16524 For most purposes, access to Unicode properties from the Perl core is through
16525 regular expression matches, as described in the next section.
16526 For some special purposes, and to access the properties that are not suitable
16527 for regular expression matching, all the Unicode character properties that
16528 Perl handles are accessible via the standard L<Unicode::UCD> module, as
16529 described in the section L</Properties accessible through Unicode::UCD>.
16531 Perl also provides some additional extensions and short-cut synonyms
16532 for Unicode properties.
16534 This document merely lists all available properties and does not attempt to
16535 explain what each property really means. There is a brief description of each
16536 Perl extension; see L<perlunicode/Other Properties> for more information on
16537 these. There is some detail about Blocks, Scripts, General_Category,
16538 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
16539 official Unicode properties, refer to the Unicode standard. A good starting
16540 place is L<$unicode_reference_url>.
16542 Note that you can define your own properties; see
16543 L<perlunicode/"User-Defined Character Properties">.
16545 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
16547 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
16548 most of the Unicode character properties. The table below shows all these
16549 constructs, both single and compound forms.
16551 B<Compound forms> consist of two components, separated by an equals sign or a
16552 colon. The first component is the property name, and the second component is
16553 the particular value of the property to match against, for example,
16554 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
16555 whose Script property value is Greek.
16557 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
16558 their equivalent compound forms. The table shows these equivalences. (In our
16559 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
16560 There are also a few Perl-defined single forms that are not shortcuts for a
16561 compound form. One such is C<\\p{Word}>. These are also listed in the table.
16563 In parsing these constructs, Perl always ignores Upper/lower case differences
16564 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
16565 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
16566 the left brace completely changes the meaning of the construct, from "match"
16567 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
16568 for improved legibility.
16570 Also, white space, hyphens, and underscores are normally ignored
16571 everywhere between the {braces}, and hence can be freely added or removed
16572 even if the C</x> modifier hasn't been specified on the regular expression.
16573 But in the table below $a_bold_stricter at the beginning of an entry
16574 means that tighter (stricter) rules are used for that entry:
16580 =item Single form (C<\\p{name}>) tighter rules:
16582 White space, hyphens, and underscores ARE significant
16587 =item * white space adjacent to a non-word character
16589 =item * underscores separating digits in numbers
16593 That means, for example, that you can freely add or remove white space
16594 adjacent to (but within) the braces without affecting the meaning.
16596 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
16598 The tighter rules given above for the single form apply to everything to the
16599 right of the colon or equals; the looser rules still apply to everything to
16602 That means, for example, that you can freely add or remove white space
16603 adjacent to (but within) the braces and the colon or equal sign.
16609 Some properties are considered obsolete by Unicode, but still available.
16610 There are several varieties of obsolescence:
16618 A property may be stabilized. Such a determination does not indicate
16619 that the property should or should not be used; instead it is a declaration
16620 that the property will not be maintained nor extended for newly encoded
16621 characters. Such properties are marked with $a_bold_stabilized in the
16626 A property may be deprecated, perhaps because its original intent
16627 has been replaced by another property, or because its specification was
16628 somehow defective. This means that its use is strongly
16629 discouraged, so much so that a warning will be issued if used, unless the
16630 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
16631 statement. $A_bold_deprecated flags each such entry in the table, and
16632 the entry there for the longest, most descriptive version of the property will
16633 give the reason it is deprecated, and perhaps advice. Perl may issue such a
16634 warning, even for properties that aren't officially deprecated by Unicode,
16635 when there used to be characters or code points that were matched by them, but
16636 no longer. This is to warn you that your program may not work like it did on
16637 earlier Unicode releases.
16639 A deprecated property may be made unavailable in a future Perl version, so it
16640 is best to move away from them.
16642 A deprecated property may also be stabilized, but this fact is not shown.
16646 Properties marked with $a_bold_obsolete in the table are considered (plain)
16647 obsolete. Generally this designation is given to properties that Unicode once
16648 used for internal purposes (but not any longer).
16652 Some Perl extensions are present for backwards compatibility and are
16653 discouraged from being used, but are not obsolete. $A_bold_discouraged
16654 flags each such entry in the table. Future Unicode versions may force
16655 some of these extensions to be removed without warning, replaced by another
16656 property with the same name that means something different. Use the
16657 equivalent shown instead.
16663 The table below has two columns. The left column contains the C<\\p{}>
16664 constructs to look up, possibly preceded by the flags mentioned above; and
16665 the right column contains information about them, like a description, or
16666 synonyms. The table shows both the single and compound forms for each
16667 property that has them. If the left column is a short name for a property,
16668 the right column will give its longer, more descriptive name; and if the left
16669 column is the longest name, the right column will show any equivalent shortest
16670 name, in both single and compound forms if applicable.
16672 If braces are not needed to specify a property (e.g., C<\\pL>), the left
16673 column contains both forms, with and without braces.
16675 The right column will also caution you if a property means something different
16676 than what might normally be expected.
16678 All single forms are Perl extensions; a few compound forms are as well, and
16681 Numbers in (parentheses) indicate the total number of Unicode code points
16682 matched by the property. For emphasis, those properties that match no code
16683 points at all are listed as well in a separate section following the table.
16685 Most properties match the same code points regardless of whether C<"/i">
16686 case-insensitive matching is specified or not. But a few properties are
16687 affected. These are shown with the notation S<C<(/i= I<other_property>)>>
16688 in the second column. Under case-insensitive matching they match the
16689 same code pode points as the property I<other_property>.
16691 There is no description given for most non-Perl defined properties (See
16692 L<$unicode_reference_url> for that).
16694 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
16695 combinations. For example, entries like:
16697 \\p{Gc: *} \\p{General_Category: *}
16699 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
16700 for the latter is also valid for the former. Similarly,
16704 means that if and only if, for example, C<\\p{Foo}> exists, then
16705 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
16706 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
16707 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
16710 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
16711 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
16712 'N*' to indicate this, and doesn't have separate entries for the other
16713 possibilities. Note that not all properties which have values 'Yes' and 'No'
16714 are binary, and they have all their values spelled out without using this wild
16715 card, and a C<NOT> clause in their description that highlights their not being
16716 binary. These also require the compound form to match them, whereas true
16717 binary properties have both single and compound forms available.
16719 Note that all non-essential underscores are removed in the display of the
16728 B<*> is a wild-card
16732 B<(\\d+)> in the info column gives the number of Unicode code points matched
16737 B<$DEPRECATED> means this is deprecated.
16741 B<$OBSOLETE> means this is obsolete.
16745 B<$STABILIZED> means this is stabilized.
16749 B<$STRICTER> means tighter (stricter) name matching applies.
16753 B<$DISCOURAGED> means use of this form is discouraged, and may not be
16758 $formatted_properties
16762 =head1 Properties accessible through Unicode::UCD
16764 The value of any Unicode (not including Perl extensions) character
16765 property mentioned above for any single code point is available through
16766 L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
16767 values of all the Unicode properties for a given code point.
16769 Besides these, all the Unicode character properties mentioned above
16770 (except for those marked as for internal use by Perl) are also
16771 accessible by L<Unicode::UCD/prop_invlist()>.
16773 Due to their nature, not all Unicode character properties are suitable for
16774 regular expression matches, nor C<prop_invlist()>. The remaining
16775 non-provisional, non-internal ones are accessible via
16776 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
16777 hasn't included; see L<below for which those are|/Unicode character properties
16778 that are NOT accepted by Perl>).
16780 For compatibility with other parts of Perl, all the single forms given in the
16781 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
16782 are recognized. BUT, there are some ambiguities between some Perl extensions
16783 and the Unicode properties, all of which are silently resolved in favor of the
16784 official Unicode property. To avoid surprises, you should only use
16785 C<prop_invmap()> for forms listed in the table below, which omits the
16786 non-recommended ones. The affected forms are the Perl single form equivalents
16787 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
16788 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
16789 whose short name is C<sc>. The table indicates the current ambiguities in the
16790 INFO column, beginning with the word C<"NOT">.
16792 The standard Unicode properties listed below are documented in
16793 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
16794 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
16795 L<perlunicode/Other Properties>;
16797 The first column in the table is a name for the property; the second column is
16798 an alternative name, if any, plus possibly some annotations. The alternative
16799 name is the property's full name, unless that would simply repeat the first
16800 column, in which case the second column indicates the property's short name
16801 (if different). The annotations are given only in the entry for the full
16802 name. If a property is obsolete, etc, the entry will be flagged with the same
16803 characters used in the table in the L<section above|/Properties accessible
16804 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
16808 =head1 Properties accessible through other means
16810 Certain properties are accessible also via core function calls. These are:
16812 Lowercase_Mapping lc() and lcfirst()
16813 Titlecase_Mapping ucfirst()
16814 Uppercase_Mapping uc()
16816 Also, Case_Folding is accessible through the C</i> modifier in regular
16817 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
16820 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
16821 interpolation in double-quoted strings and regular expressions; and functions
16822 C<charnames::viacode()>, C<charnames::vianame()>, and
16823 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
16826 Finally, most properties related to decomposition are accessible via
16827 L<Unicode::Normalize>.
16829 =head1 Unicode character properties that are NOT accepted by Perl
16831 Perl will generate an error for a few character properties in Unicode when
16832 used in a regular expression. The non-Unihan ones are listed below, with the
16833 reasons they are not accepted, perhaps with work-arounds. The short names for
16834 the properties are listed enclosed in (parentheses).
16835 As described after the list, an installation can change the defaults and choose
16836 to accept any of these. The list is machine generated based on the
16837 choices made for the installation that generated this document.
16841 An installation can choose to allow any of these to be matched by downloading
16842 the Unicode database from L<http://www.unicode.org/Public/> to
16843 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
16844 controlling lists contained in the program
16845 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
16846 (C<\%Config> is available from the Config module).
16848 Also, perl can be recompiled to operate on an earlier version of the Unicode
16849 standard. Further information is at
16850 C<\$Config{privlib}>/F<unicore/README.perl>.
16852 =head1 Other information in the Unicode data base
16854 The Unicode data base is delivered in two different formats. The XML version
16855 is valid for more modern Unicode releases. The other version is a collection
16856 of files. The two are intended to give equivalent information. Perl uses the
16857 older form; this allows you to recompile Perl to use early Unicode releases.
16859 The only non-character property that Perl currently supports is Named
16860 Sequences, in which a sequence of code points
16861 is given a name and generally treated as a single entity. (Perl supports
16862 these via the C<\\N{...}> double-quotish construct,
16863 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
16865 Below is a list of the files in the Unicode data base that Perl doesn't
16866 currently use, along with very brief descriptions of their purposes.
16867 Some of the names of the files have been shortened from those that Unicode
16868 uses, in order to allow them to be distinguishable from similarly named files
16869 on file systems for which only the first 8 characters of a name are
16880 L<$unicode_reference_url>
16888 # And write it. The 0 means no utf8.
16889 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
16893 sub make_Heavy () {
16894 # Create and write Heavy.pl, which passes info about the tables to
16897 # Stringify structures for output
16898 my $loose_property_name_of
16899 = simple_dumper(\%loose_property_name_of, ' ' x 4);
16900 chomp $loose_property_name_of;
16902 my $strict_property_name_of
16903 = simple_dumper(\%strict_property_name_of, ' ' x 4);
16904 chomp $strict_property_name_of;
16906 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
16907 chomp $stricter_to_file_of;
16909 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
16910 chomp $inline_definitions;
16912 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
16913 chomp $loose_to_file_of;
16915 my $nv_floating_to_rational
16916 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
16917 chomp $nv_floating_to_rational;
16919 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
16920 chomp $why_deprecated;
16922 # We set the key to the file when we associated files with tables, but we
16923 # couldn't do the same for the value then, as we might not have the file
16924 # for the alternate table figured out at that time.
16925 foreach my $cased (keys %caseless_equivalent_to) {
16926 my @path = $caseless_equivalent_to{$cased}->file_path;
16928 if ($path[0] eq "#") { # Pseudo-directory '#'
16929 $path = join '/', @path;
16931 else { # Gets rid of lib/
16932 $path = join '/', @path[1, -1];
16934 $caseless_equivalent_to{$cased} = $path;
16936 my $caseless_equivalent_to
16937 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
16938 chomp $caseless_equivalent_to;
16940 my $loose_property_to_file_of
16941 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
16942 chomp $loose_property_to_file_of;
16944 my $strict_property_to_file_of
16945 = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
16946 chomp $strict_property_to_file_of;
16948 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
16949 chomp $file_to_swash_name;
16953 $INTERNAL_ONLY_HEADER
16955 # This file is for the use of utf8_heavy.pl and Unicode::UCD
16957 # Maps Unicode (not Perl single-form extensions) property names in loose
16958 # standard form to their corresponding standard names
16959 \%utf8::loose_property_name_of = (
16960 $loose_property_name_of
16963 # Same, but strict names
16964 \%utf8::strict_property_name_of = (
16965 $strict_property_name_of
16968 # Gives the definitions (in the form of inversion lists) for those properties
16969 # whose definitions aren't kept in files
16970 \@utf8::inline_definitions = (
16971 $inline_definitions
16974 # Maps property, table to file for those using stricter matching. For paths
16975 # whose directory is '#', the file is in the form of a numeric index into
16976 # \@inline_definitions
16977 \%utf8::stricter_to_file_of = (
16978 $stricter_to_file_of
16981 # Maps property, table to file for those using loose matching. For paths
16982 # whose directory is '#', the file is in the form of a numeric index into
16983 # \@inline_definitions
16984 \%utf8::loose_to_file_of = (
16988 # Maps floating point to fractional form
16989 \%utf8::nv_floating_to_rational = (
16990 $nv_floating_to_rational
16993 # If a floating point number doesn't have enough digits in it to get this
16994 # close to a fraction, it isn't considered to be that fraction even if all the
16995 # digits it does have match.
16996 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
16998 # Deprecated tables to generate a warning for. The key is the file containing
16999 # the table, so as to avoid duplication, as many property names can map to the
17000 # file, but we only need one entry for all of them.
17001 \%utf8::why_deprecated = (
17005 # A few properties have different behavior under /i matching. This maps
17006 # those to substitute files to use under /i.
17007 \%utf8::caseless_equivalent = (
17008 $caseless_equivalent_to
17011 # Property names to mapping files
17012 \%utf8::loose_property_to_file_of = (
17013 $loose_property_to_file_of
17016 # Property names to mapping files
17017 \%utf8::strict_property_to_file_of = (
17018 $strict_property_to_file_of
17021 # Files to the swash names within them.
17022 \%utf8::file_to_swash_name = (
17023 $file_to_swash_name
17029 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
17033 sub make_Name_pm () {
17034 # Create and write Name.pm, which contains subroutines and data to use in
17035 # conjunction with Name.pl
17037 # Maybe there's nothing to do.
17038 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17042 $INTERNAL_ONLY_HEADER
17045 # Convert these structures to output format.
17046 my $code_points_ending_in_code_point =
17047 main::simple_dumper(\@code_points_ending_in_code_point,
17049 my $names = main::simple_dumper(\%names_ending_in_code_point,
17051 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17054 # Do the same with the Hangul names,
17060 if ($has_hangul_syllables) {
17062 # Construct a regular expression of all the possible
17063 # combinations of the Hangul syllables.
17064 my @L_re; # Leading consonants
17065 for my $i ($LBase .. $LBase + $LCount - 1) {
17066 push @L_re, $Jamo{$i}
17068 my @V_re; # Middle vowels
17069 for my $i ($VBase .. $VBase + $VCount - 1) {
17070 push @V_re, $Jamo{$i}
17072 my @T_re; # Trailing consonants
17073 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17074 push @T_re, $Jamo{$i}
17077 # The whole re is made up of the L V T combination.
17079 . join ('|', sort @L_re)
17081 . join ('|', sort @V_re)
17083 . join ('|', sort @T_re)
17086 # These hashes needed by the algorithm were generated
17087 # during reading of the Jamo.txt file
17088 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17089 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17090 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17091 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17098 # This module contains machine-generated tables and code for the
17099 # algorithmically-determinable Unicode character names. The following
17100 # routines can be used to translate between name and code point and vice versa
17104 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
17105 # two must be 10; if there are 5, the first must not be a 0. Written this
17106 # way to decrease backtracking. The first regex allows the code point to
17107 # be at the end of a word, but to work properly, the word shouldn't end
17108 # with a valid hex character. The second one won't match a code point at
17109 # the end of a word, and doesn't have the run-on issue
17110 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17111 my \$code_point_re = qr/$code_point_re/;
17113 # In the following hash, the keys are the bases of names which include
17114 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
17115 # of each key is another hash which is used to get the low and high ends
17116 # for each range of code points that apply to the name.
17117 my %names_ending_in_code_point = (
17121 # The following hash is a copy of the previous one, except is for loose
17122 # matching, so each name has blanks and dashes squeezed out
17123 my %loose_names_ending_in_code_point = (
17127 # And the following array gives the inverse mapping from code points to
17128 # names. Lowest code points are first
17129 my \@code_points_ending_in_code_point = (
17130 $code_points_ending_in_code_point
17133 # Earlier releases didn't have Jamos. No sense outputting
17134 # them unless will be used.
17135 if ($has_hangul_syllables) {
17138 # Convert from code point to Jamo short name for use in composing Hangul
17144 # Leading consonant (can be null)
17154 # Optional trailing consonant
17159 # Computed re that splits up a Hangul name into LVT or LV syllables
17160 my \$syllable_re = qr/$jamo_re/;
17162 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17163 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17165 # These constants names and values were taken from the Unicode standard,
17166 # version 5.1, section 3.12. They are used in conjunction with Hangul
17168 my \$SBase = $SBase_string;
17169 my \$LBase = $LBase_string;
17170 my \$VBase = $VBase_string;
17171 my \$TBase = $TBase_string;
17172 my \$SCount = $SCount;
17173 my \$LCount = $LCount;
17174 my \$VCount = $VCount;
17175 my \$TCount = $TCount;
17176 my \$NCount = \$VCount * \$TCount;
17178 } # End of has Jamos
17180 push @name, << 'END';
17182 sub name_to_code_point_special {
17183 my ($name, $loose) = @_;
17185 # Returns undef if not one of the specially handled names; otherwise
17186 # returns the code point equivalent to the input name
17187 # $loose is non-zero if to use loose matching, 'name' in that case
17188 # must be input as upper case with all blanks and dashes squeezed out.
17190 if ($has_hangul_syllables) {
17191 push @name, << 'END';
17193 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17194 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17196 return if $name !~ qr/^$syllable_re$/;
17197 my $L = $Jamo_L{$1};
17198 my $V = $Jamo_V{$2};
17199 my $T = (defined $3) ? $Jamo_T{$3} : 0;
17200 return ($L * $VCount + $V) * $TCount + $T + $SBase;
17204 push @name, << 'END';
17206 # Name must end in 'code_point' for this to handle.
17207 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17208 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17211 my $code_point = CORE::hex $2;
17215 $names_ref = \%loose_names_ending_in_code_point;
17218 return if $base !~ s/-$//;
17219 $names_ref = \%names_ending_in_code_point;
17222 # Name must be one of the ones which has the code point in it.
17223 return if ! $names_ref->{$base};
17225 # Look through the list of ranges that apply to this name to see if
17226 # the code point is in one of them.
17227 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17228 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17229 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17231 # Here, the code point is in the range.
17232 return $code_point;
17235 # Here, looked like the name had a code point number in it, but
17236 # did not match one of the valid ones.
17240 sub code_point_to_name_special {
17241 my $code_point = shift;
17243 # Returns the name of a code point if algorithmically determinable;
17246 if ($has_hangul_syllables) {
17247 push @name, << 'END';
17249 # If in the Hangul range, calculate the name based on Unicode's
17251 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17253 my $SIndex = $code_point - $SBase;
17254 my $L = $LBase + $SIndex / $NCount;
17255 my $V = $VBase + ($SIndex % $NCount) / $TCount;
17256 my $T = $TBase + $SIndex % $TCount;
17257 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17258 $name .= $Jamo{$T} if $T != $TBase;
17263 push @name, << 'END';
17265 # Look through list of these code points for one in range.
17266 foreach my $hash (@code_points_ending_in_code_point) {
17267 return if $code_point < $hash->{'low'};
17268 if ($code_point <= $hash->{'high'}) {
17269 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17272 return; # None found
17279 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
17284 # Create and write UCD.pl, which passes info about the tables to
17287 # Create a mapping from each alias of Perl single-form extensions to all
17288 # its equivalent aliases, for quick look-up.
17289 my %perlprop_to_aliases;
17290 foreach my $table ($perl->tables) {
17292 # First create the list of the aliases of each extension
17293 my @aliases_list; # List of legal aliases for this extension
17295 my $table_name = $table->name;
17296 my $standard_table_name = standardize($table_name);
17297 my $table_full_name = $table->full_name;
17298 my $standard_table_full_name = standardize($table_full_name);
17300 # Make sure that the list has both the short and full names
17301 push @aliases_list, $table_name, $table_full_name;
17303 my $found_ucd = 0; # ? Did we actually get an alias that should be
17304 # output for this table
17306 # Go through all the aliases (including the two just added), and add
17307 # any new unique ones to the list
17308 foreach my $alias ($table->aliases) {
17310 # Skip non-legal names
17311 next unless $alias->ok_as_filename;
17312 next unless $alias->ucd;
17314 $found_ucd = 1; # have at least one legal name
17316 my $name = $alias->name;
17317 my $standard = standardize($name);
17319 # Don't repeat a name that is equivalent to one already on the
17321 next if $standard eq $standard_table_name;
17322 next if $standard eq $standard_table_full_name;
17324 push @aliases_list, $name;
17327 # If there were no legal names, don't output anything.
17328 next unless $found_ucd;
17330 # To conserve memory in the program reading these in, omit full names
17331 # that are identical to the short name, when those are the only two
17332 # aliases for the property.
17333 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
17337 # Here, @aliases_list is the list of all the aliases that this
17338 # extension legally has. Now can create a map to it from each legal
17339 # standardized alias
17340 foreach my $alias ($table->aliases) {
17341 next unless $alias->ucd;
17342 next unless $alias->ok_as_filename;
17343 push @{$perlprop_to_aliases{standardize($alias->name)}},
17348 # Make a list of all combinations of properties/values that are suppressed.
17350 if (! $debug_skip) { # This tends to fail in this debug mode
17351 foreach my $property_name (keys %why_suppressed) {
17354 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
17356 # The hash may contain properties not in this release of Unicode
17357 next unless defined (my $property = property_ref($property_name));
17359 # Find all combinations
17360 foreach my $prop_alias ($property->aliases) {
17361 my $prop_alias_name = standardize($prop_alias->name);
17363 # If no =value, there's just one combination possibe for this
17364 if (! $value_name) {
17366 # The property may be suppressed, but there may be a proxy
17367 # for it, so it shouldn't be listed as suppressed
17368 next if $prop_alias->ucd;
17369 push @suppressed, $prop_alias_name;
17372 foreach my $value_alias
17373 ($property->table($value_name)->aliases)
17375 next if $value_alias->ucd;
17377 push @suppressed, "$prop_alias_name="
17378 . standardize($value_alias->name);
17384 @suppressed = sort @suppressed; # So doesn't change between runs of this
17387 # Convert the structure below (designed for Name.pm) to a form that UCD
17388 # wants, so it doesn't have to modify it at all; i.e. so that it includes
17389 # an element for the Hangul syllables in the appropriate place, and
17390 # otherwise changes the name to include the "-<code point>" suffix.
17391 my @algorithm_names;
17392 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
17393 # along in this version
17394 # Copy it linearly.
17395 for my $i (0 .. @code_points_ending_in_code_point - 1) {
17397 # Insert the hanguls in the correct place.
17399 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
17402 push @algorithm_names, { low => $SBase,
17403 high => $SBase + $SCount - 1,
17404 name => '<hangul syllable>',
17408 # Copy the current entry, modified.
17409 push @algorithm_names, {
17410 low => $code_points_ending_in_code_point[$i]->{'low'},
17411 high => $code_points_ending_in_code_point[$i]->{'high'},
17413 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
17417 # Serialize these structures for output.
17418 my $loose_to_standard_value
17419 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
17420 chomp $loose_to_standard_value;
17422 my $string_property_loose_to_name
17423 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
17424 chomp $string_property_loose_to_name;
17426 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
17427 chomp $perlprop_to_aliases;
17429 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
17430 chomp $prop_aliases;
17432 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
17433 chomp $prop_value_aliases;
17435 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
17438 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
17439 chomp $algorithm_names;
17441 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
17442 chomp $ambiguous_names;
17444 my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
17445 chomp $combination_property;
17447 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
17448 chomp $loose_defaults;
17452 $INTERNAL_ONLY_HEADER
17454 # This file is for the use of Unicode::UCD
17456 # Highest legal Unicode code point
17457 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
17460 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
17461 \$Unicode::UCD::HANGUL_COUNT = $SCount;
17463 # Keys are all the possible "prop=value" combinations, in loose form; values
17464 # are the standard loose name for the 'value' part of the key
17465 \%Unicode::UCD::loose_to_standard_value = (
17466 $loose_to_standard_value
17469 # String property loose names to standard loose name
17470 \%Unicode::UCD::string_property_loose_to_name = (
17471 $string_property_loose_to_name
17474 # Keys are Perl extensions in loose form; values are each one's list of
17476 \%Unicode::UCD::loose_perlprop_to_name = (
17477 $perlprop_to_aliases
17480 # Keys are standard property name; values are each one's aliases
17481 \%Unicode::UCD::prop_aliases = (
17485 # Keys of top level are standard property name; values are keys to another
17486 # hash, Each one is one of the property's values, in standard form. The
17487 # values are that prop-val's aliases. If only one specified, the short and
17488 # long alias are identical.
17489 \%Unicode::UCD::prop_value_aliases = (
17490 $prop_value_aliases
17493 # Ordered (by code point ordinal) list of the ranges of code points whose
17494 # names are algorithmically determined. Each range entry is an anonymous hash
17495 # of the start and end points and a template for the names within it.
17496 \@Unicode::UCD::algorithmic_named_code_points = (
17500 # The properties that as-is have two meanings, and which must be disambiguated
17501 \%Unicode::UCD::ambiguous_names = (
17505 # Keys are the prop-val combinations which are the default values for the
17506 # given property, expressed in standard loose form
17507 \%Unicode::UCD::loose_defaults = (
17511 # The properties that are combinations, in that they have both a map table and
17512 # a match table. This is actually for UCD.t, so it knows how to test for
17514 \%Unicode::UCD::combination_property = (
17515 $combination_property
17518 # All combinations of names that are suppressed.
17519 # This is actually for UCD.t, so it knows which properties shouldn't have
17520 # entries. If it got any bigger, would probably want to put it in its own
17521 # file to use memory only when it was needed, in testing.
17522 \@Unicode::UCD::suppressed_properties = (
17529 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
17533 sub write_all_tables() {
17534 # Write out all the tables generated by this program to files, as well as
17535 # the supporting data structures, pod file, and .t file.
17537 my @writables; # List of tables that actually get written
17538 my %match_tables_to_write; # Used to collapse identical match tables
17539 # into one file. Each key is a hash function
17540 # result to partition tables into buckets.
17541 # Each value is an array of the tables that
17542 # fit in the bucket.
17544 # For each property ...
17545 # (sort so that if there is an immutable file name, it has precedence, so
17546 # some other property can't come in and take over its file name. (We
17547 # don't care if both defined, as they had better be different anyway.)
17548 # The property named 'Perl' needs to be first (it doesn't have any
17549 # immutable file name) because empty properties are defined in terms of
17550 # its table named 'All' under the -annotate option.) We also sort by
17551 # the property's name. This is just for repeatability of the outputs
17552 # between runs of this program, but does not affect correctness.
17554 foreach my $property ($perl,
17555 sort { return -1 if defined $a->file;
17556 return 1 if defined $b->file;
17557 return $a->name cmp $b->name;
17558 } grep { $_ != $perl } property_ref('*'))
17560 my $type = $property->type;
17562 # And for each table for that property, starting with the mapping
17565 foreach my $table($property,
17567 # and all the match tables for it (if any), sorted so
17568 # the ones with the shortest associated file name come
17569 # first. The length sorting prevents problems of a
17570 # longer file taking a name that might have to be used
17571 # by a shorter one. The alphabetic sorting prevents
17572 # differences between releases
17573 sort { my $ext_a = $a->external_name;
17574 return 1 if ! defined $ext_a;
17575 my $ext_b = $b->external_name;
17576 return -1 if ! defined $ext_b;
17578 # But return the non-complement table before
17579 # the complement one, as the latter is defined
17580 # in terms of the former, and needs to have
17581 # the information for the former available.
17582 return 1 if $a->complement != 0;
17583 return -1 if $b->complement != 0;
17585 # Similarly, return a subservient table after
17587 return 1 if $a->leader != $a;
17588 return -1 if $b->leader != $b;
17590 my $cmp = length $ext_a <=> length $ext_b;
17592 # Return result if lengths not equal
17593 return $cmp if $cmp;
17595 # Alphabetic if lengths equal
17596 return $ext_a cmp $ext_b
17597 } $property->tables
17601 # Here we have a table associated with a property. It could be
17602 # the map table (done first for each property), or one of the
17603 # other tables. Determine which type.
17604 my $is_property = $table->isa('Property');
17606 my $name = $table->name;
17607 my $complete_name = $table->complete_name;
17609 # See if should suppress the table if is empty, but warn if it
17610 # contains something.
17611 my $suppress_if_empty_warn_if_not
17612 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
17614 # Calculate if this table should have any code points associated
17616 my $expected_empty =
17618 # $perl should be empty
17619 ($is_property && ($table == $perl))
17621 # Match tables in properties we skipped populating should be
17623 || (! $is_property && ! $property->to_create_match_tables)
17625 # Tables and properties that are expected to have no code
17626 # points should be empty
17627 || $suppress_if_empty_warn_if_not
17630 # Set a boolean if this table is the complement of an empty binary
17632 my $is_complement_of_empty_binary =
17633 $type == $BINARY &&
17634 (($table == $property->table('Y')
17635 && $property->table('N')->is_empty)
17636 || ($table == $property->table('N')
17637 && $property->table('Y')->is_empty));
17639 if ($table->is_empty) {
17641 if ($suppress_if_empty_warn_if_not) {
17642 $table->set_fate($SUPPRESSED,
17643 $suppress_if_empty_warn_if_not);
17646 # Suppress (by skipping them) expected empty tables.
17647 next TABLE if $expected_empty;
17649 # And setup to later output a warning for those that aren't
17650 # known to be allowed to be empty. Don't do the warning if
17651 # this table is a child of another one to avoid duplicating
17652 # the warning that should come from the parent one.
17653 if (($table == $property || $table->parent == $table)
17654 && $table->fate != $SUPPRESSED
17655 && $table->fate != $MAP_PROXIED
17656 && ! grep { $complete_name =~ /^$_$/ }
17657 @tables_that_may_be_empty)
17659 push @unhandled_properties, "$table";
17662 # The old way of expressing an empty match list was to
17663 # complement the list that matches everything. The new way is
17664 # to create an empty inversion list, but this doesn't work for
17665 # annotating, so use the old way then.
17666 $table->set_complement($All) if $annotate
17667 && $table != $property;
17669 elsif ($expected_empty) {
17671 if ($suppress_if_empty_warn_if_not) {
17672 $because = " because $suppress_if_empty_warn_if_not";
17675 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
17678 # Some tables should match everything
17679 my $expected_full =
17680 ($table->fate == $SUPPRESSED)
17683 ? # All these types of map tables will be full because
17684 # they will have been populated with defaults
17687 : # A match table should match everything if its method
17689 ($table->matches_all
17691 # The complement of an empty binary table will match
17693 || $is_complement_of_empty_binary
17697 my $count = $table->count;
17698 if ($expected_full) {
17699 if ($count != $MAX_WORKING_CODEPOINTS) {
17700 Carp::my_carp("$table matches only "
17701 . clarify_number($count)
17702 . " Unicode code points but should match "
17703 . clarify_number($MAX_WORKING_CODEPOINTS)
17705 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
17706 . "). Proceeding anyway.");
17709 # Here is expected to be full. If it is because it is the
17710 # complement of an (empty) binary table that is to be
17711 # suppressed, then suppress this one as well.
17712 if ($is_complement_of_empty_binary) {
17713 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
17714 my $opposing = $property->table($opposing_name);
17715 my $opposing_status = $opposing->status;
17716 if ($opposing_status) {
17717 $table->set_status($opposing_status,
17718 $opposing->status_info);
17722 elsif ($count == $MAX_UNICODE_CODEPOINTS
17724 && ($table == $property || $table->leader == $table)
17725 && $table->property->status ne $NORMAL)
17727 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
17730 if ($table->fate >= $SUPPRESSED) {
17731 if (! $is_property) {
17732 my @children = $table->children;
17733 foreach my $child (@children) {
17734 if ($child->fate < $SUPPRESSED) {
17735 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
17743 if (! $is_property) {
17745 make_ucd_table_pod_entries($table) if $table->property == $perl;
17747 # Several things need to be done just once for each related
17748 # group of match tables. Do them on the parent.
17749 if ($table->parent == $table) {
17751 # Add an entry in the pod file for the table; it also does
17753 make_re_pod_entries($table) if defined $pod_directory;
17755 # See if the the table matches identical code points with
17756 # something that has already been output. In that case,
17757 # no need to have two files with the same code points in
17758 # them. We use the table's hash() method to store these
17759 # in buckets, so that it is quite likely that if two
17760 # tables are in the same bucket they will be identical, so
17761 # don't have to compare tables frequently. The tables
17762 # have to have the same status to share a file, so add
17763 # this to the bucket hash. (The reason for this latter is
17764 # that Heavy.pl associates a status with a file.)
17765 # We don't check tables that are inverses of others, as it
17766 # would lead to some coding complications, and checking
17767 # all the regular ones should find everything.
17768 if ($table->complement == 0) {
17769 my $hash = $table->hash . ';' . $table->status;
17771 # Look at each table that is in the same bucket as
17772 # this one would be.
17773 foreach my $comparison
17774 (@{$match_tables_to_write{$hash}})
17776 if ($table->matches_identically_to($comparison)) {
17777 $table->set_equivalent_to($comparison,
17783 # Here, not equivalent, add this table to the bucket.
17784 push @{$match_tables_to_write{$hash}}, $table;
17790 # Here is the property itself.
17791 # Don't write out or make references to the $perl property
17792 next if $table == $perl;
17794 make_ucd_table_pod_entries($table);
17796 # There is a mapping stored of the various synonyms to the
17797 # standardized name of the property for utf8_heavy.pl.
17798 # Also, the pod file contains entries of the form:
17799 # \p{alias: *} \p{full: *}
17800 # rather than show every possible combination of things.
17802 my @property_aliases = $property->aliases;
17804 my $full_property_name = $property->full_name;
17805 my $property_name = $property->name;
17806 my $standard_property_name = standardize($property_name);
17807 my $standard_property_full_name
17808 = standardize($full_property_name);
17810 # We also create for Unicode::UCD a list of aliases for
17811 # the property. The list starts with the property name;
17812 # then its full name. Legacy properties are not listed in
17816 if ( $property->fate <= $MAP_PROXIED) {
17817 @property_list = ($property_name, $full_property_name);
17818 @standard_list = ($standard_property_name,
17819 $standard_property_full_name);
17822 # For each synonym ...
17823 for my $i (0 .. @property_aliases - 1) {
17824 my $alias = $property_aliases[$i];
17825 my $alias_name = $alias->name;
17826 my $alias_standard = standardize($alias_name);
17829 # Add other aliases to the list of property aliases
17830 if ($property->fate <= $MAP_PROXIED
17831 && ! grep { $alias_standard eq $_ } @standard_list)
17833 push @property_list, $alias_name;
17834 push @standard_list, $alias_standard;
17837 # For utf8_heavy, set the mapping of the alias to the
17839 if ($type == $STRING) {
17840 if ($property->fate <= $MAP_PROXIED) {
17841 $string_property_loose_to_name{$alias_standard}
17842 = $standard_property_name;
17846 my $hash_ref = ($alias_standard =~ /^_/)
17847 ? \%strict_property_name_of
17848 : \%loose_property_name_of;
17849 if (exists $hash_ref->{$alias_standard}) {
17850 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
17853 $hash_ref->{$alias_standard}
17854 = $standard_property_name;
17857 # Now for the re pod entry for this alias. Skip if not
17858 # outputting a pod; skip the first one, which is the
17859 # full name so won't have an entry like: '\p{full: *}
17860 # \p{full: *}', and skip if don't want an entry for
17863 || ! defined $pod_directory
17864 || ! $alias->make_re_pod_entry;
17866 my $rhs = "\\p{$full_property_name: *}";
17867 if ($property != $perl && $table->perl_extension) {
17868 $rhs .= ' (Perl extension)';
17870 push @match_properties,
17871 format_pod_line($indent_info_column,
17872 '\p{' . $alias->name . ': *}',
17878 # The list of all possible names is attached to each alias, so
17880 if (@property_list) {
17881 push @{$prop_aliases{$standard_list[0]}}, @property_list;
17884 if ($property->fate <= $MAP_PROXIED) {
17886 # Similarly, we create for Unicode::UCD a list of
17887 # property-value aliases.
17889 # Look at each table in the property...
17890 foreach my $table ($property->tables) {
17892 my $table_full_name = $table->full_name;
17893 my $standard_table_full_name
17894 = standardize($table_full_name);
17895 my $table_name = $table->name;
17896 my $standard_table_name = standardize($table_name);
17898 # The list starts with the table name and its full
17900 push @values_list, $table_name, $table_full_name;
17902 # We add to the table each unique alias that isn't
17903 # discouraged from use.
17904 foreach my $alias ($table->aliases) {
17905 next if $alias->status
17906 && $alias->status eq $DISCOURAGED;
17907 my $name = $alias->name;
17908 my $standard = standardize($name);
17909 next if $standard eq $standard_table_name;
17910 next if $standard eq $standard_table_full_name;
17911 push @values_list, $name;
17914 # Here @values_list is a list of all the aliases for
17915 # the table. That is, all the property-values given
17916 # by this table. By agreement with Unicode::UCD,
17917 # if the name and full name are identical, and there
17918 # are no other names, drop the duplcate entry to save
17920 if (@values_list == 2
17921 && $values_list[0] eq $values_list[1])
17926 # To save memory, unlike the similar list for property
17927 # aliases above, only the standard forms have the list.
17928 # This forces an extra step of converting from input
17929 # name to standard name, but the savings are
17930 # considerable. (There is only marginal savings if we
17931 # did this with the property aliases.)
17932 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
17936 # Don't write out a mapping file if not desired.
17937 next if ! $property->to_output_map;
17940 # Here, we know we want to write out the table, but don't do it
17941 # yet because there may be other tables that come along and will
17942 # want to share the file, and the file's comments will change to
17943 # mention them. So save for later.
17944 push @writables, $table;
17946 } # End of looping through the property and all its tables.
17947 } # End of looping through all properties.
17949 # Now have all the tables that will have files written for them. Do it.
17950 foreach my $table (@writables) {
17953 my $property = $table->property;
17954 my $is_property = ($table == $property);
17956 # For very short tables, instead of writing them out to actual files,
17957 # we in-line their inversion list definitions into Heavy.pl. The
17958 # definition replaces the file name, and the special pseudo-directory
17959 # '#' is used to signal this. This significantly cuts down the number
17960 # of files written at little extra cost to the hashes in Heavy.pl.
17961 # And it means, no run-time files to read to get the definitions.
17963 && ! $annotate # For annotation, we want to explicitly show
17964 # everything, so keep in files
17965 && $table->ranges <= 3)
17967 my @ranges = $table->ranges;
17968 my $count = @ranges;
17969 if ($count == 0) { # 0th index reserved for 0-length lists
17972 elsif ($table->leader != $table) {
17974 # Here, is a table that is equivalent to another; code
17975 # in register_file_for_name() causes its leader's definition
17980 else { # No equivalent table so far.
17982 # Build up its definition range-by-range.
17983 my $definition = "";
17984 while (defined (my $range = shift @ranges)) {
17985 my $end = $range->end;
17986 if ($end < $MAX_WORKING_CODEPOINT) {
17988 $end = "\n" . ($end + 1);
17990 else { # Extends to infinity, hence no 'end'
17993 $definition .= "\n" . $range->start . $end;
17995 $definition = "V$count" . $definition;
17996 $filename = @inline_definitions;
17997 push @inline_definitions, $definition;
18000 register_file_for_name($table, \@directory, $filename);
18004 if (! $is_property) {
18005 # Match tables for the property go in lib/$subdirectory, which is
18006 # the property's name. Don't use the standard file name for this,
18007 # as may get an unfamiliar alias
18008 @directory = ($matches_directory, $property->external_name);
18012 @directory = $table->directory;
18013 $filename = $table->file;
18016 # Use specified filename if available, or default to property's
18017 # shortest name. We need an 8.3 safe filename (which means "an 8
18018 # safe" filename, since after the dot is only 'pl', which is < 3)
18019 # The 2nd parameter is if the filename shouldn't be changed, and
18020 # it shouldn't iff there is a hard-coded name for this table.
18021 $filename = construct_filename(
18022 $filename || $table->external_name,
18023 ! $filename, # mutable if no filename
18026 register_file_for_name($table, \@directory, $filename);
18028 # Only need to write one file when shared by more than one
18030 next if ! $is_property
18031 && ($table->leader != $table || $table->complement != 0);
18033 # Construct a nice comment to add to the file
18034 $table->set_final_comment;
18040 # Write out the pod file
18043 # And Heavy.pl, Name.pm, UCD.pl
18048 make_property_test_script() if $make_test_script;
18049 make_normalization_test_script() if $make_norm_test_script;
18053 my @white_space_separators = ( # This used only for making the test script.
18060 sub generate_separator($) {
18061 # This used only for making the test script. It generates the colon or
18062 # equal separator between the property and property value, with random
18063 # white space surrounding the separator
18067 return "" if $lhs eq ""; # No separator if there's only one (the r) side
18069 # Choose space before and after randomly
18070 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18071 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18073 # And return the whole complex, half the time using a colon, half the
18075 return $spaces_before
18076 . (rand() < 0.5) ? '=' : ':'
18080 sub generate_tests($$$$$) {
18081 # This used only for making the test script. It generates test cases that
18082 # are expected to compile successfully in perl. Note that the lhs and
18083 # rhs are assumed to already be as randomized as the caller wants.
18085 my $lhs = shift; # The property: what's to the left of the colon
18086 # or equals separator
18087 my $rhs = shift; # The property value; what's to the right
18088 my $valid_code = shift; # A code point that's known to be in the
18089 # table given by lhs=rhs; undef if table is
18091 my $invalid_code = shift; # A code point known to not be in the table;
18092 # undef if the table is all code points
18093 my $warning = shift;
18095 # Get the colon or equal
18096 my $separator = generate_separator($lhs);
18098 # The whole 'property=value'
18099 my $name = "$lhs$separator$rhs";
18102 # Create a complete set of tests, with complements.
18103 if (defined $valid_code) {
18104 push @output, <<"EOC"
18105 Expect(1, $valid_code, '\\p{$name}', $warning);
18106 Expect(0, $valid_code, '\\p{^$name}', $warning);
18107 Expect(0, $valid_code, '\\P{$name}', $warning);
18108 Expect(1, $valid_code, '\\P{^$name}', $warning);
18111 if (defined $invalid_code) {
18112 push @output, <<"EOC"
18113 Expect(0, $invalid_code, '\\p{$name}', $warning);
18114 Expect(1, $invalid_code, '\\p{^$name}', $warning);
18115 Expect(1, $invalid_code, '\\P{$name}', $warning);
18116 Expect(0, $invalid_code, '\\P{^$name}', $warning);
18122 sub generate_error($$$) {
18123 # This used only for making the test script. It generates test cases that
18124 # are expected to not only not match, but to be syntax or similar errors
18126 my $lhs = shift; # The property: what's to the left of the
18127 # colon or equals separator
18128 my $rhs = shift; # The property value; what's to the right
18129 my $already_in_error = shift; # Boolean; if true it's known that the
18130 # unmodified lhs and rhs will cause an error.
18131 # This routine should not force another one
18132 # Get the colon or equal
18133 my $separator = generate_separator($lhs);
18135 # Since this is an error only, don't bother to randomly decide whether to
18136 # put the error on the left or right side; and assume that the rhs is
18137 # loosely matched, again for convenience rather than rigor.
18138 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18140 my $property = $lhs . $separator . $rhs;
18143 Error('\\p{$property}');
18144 Error('\\P{$property}');
18148 # These are used only for making the test script
18149 # XXX Maybe should also have a bad strict seps, which includes underscore.
18151 my @good_loose_seps = (
18158 my @bad_loose_seps = (
18163 sub randomize_stricter_name {
18164 # This used only for making the test script. Take the input name and
18165 # return a randomized, but valid version of it under the stricter matching
18169 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18171 # If the name looks like a number (integer, floating, or rational), do
18173 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18176 my $separator = $3;
18178 # If there isn't a sign, part of the time add a plus
18179 # Note: Not testing having any denominator having a minus sign
18181 $sign = '+' if rand() <= .3;
18184 # And add 0 or more leading zeros.
18185 $name = $sign . ('0' x int rand(10)) . $number;
18187 if (defined $separator) {
18188 my $extra_zeros = '0' x int rand(10);
18190 if ($separator eq '.') {
18192 # Similarly, add 0 or more trailing zeros after a decimal
18194 $name .= $extra_zeros;
18198 # Or, leading zeros before the denominator
18199 $name =~ s,/,/$extra_zeros,;
18204 # For legibility of the test, only change the case of whole sections at a
18205 # time. To do this, first split into sections. The split returns the
18208 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18209 trace $section if main::DEBUG && $to_trace;
18211 if (length $section > 1 && $section !~ /\D/) {
18213 # If the section is a sequence of digits, about half the time
18214 # randomly add underscores between some of them.
18217 # Figure out how many underscores to add. max is 1 less than
18218 # the number of digits. (But add 1 at the end to make sure
18219 # result isn't 0, and compensate earlier by subtracting 2
18221 my $num_underscores = int rand(length($section) - 2) + 1;
18223 # And add them evenly throughout, for convenience, not rigor
18225 my $spacing = (length($section) - 1)/ $num_underscores;
18226 my $temp = $section;
18228 for my $i (1 .. $num_underscores) {
18229 $section .= substr($temp, 0, $spacing, "") . '_';
18233 push @sections, $section;
18237 # Here not a sequence of digits. Change the case of the section
18239 my $switch = int rand(4);
18240 if ($switch == 0) {
18241 push @sections, uc $section;
18243 elsif ($switch == 1) {
18244 push @sections, lc $section;
18246 elsif ($switch == 2) {
18247 push @sections, ucfirst $section;
18250 push @sections, $section;
18254 trace "returning", join "", @sections if main::DEBUG && $to_trace;
18255 return join "", @sections;
18258 sub randomize_loose_name($;$) {
18259 # This used only for making the test script
18262 my $want_error = shift; # if true, make an error
18263 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18265 $name = randomize_stricter_name($name);
18268 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18270 # Preserve trailing ones for the sake of not stripping the underscore from
18272 for my $part (split /[-\s_]+ (?= . )/, $name) {
18274 if ($want_error and rand() < 0.3) {
18275 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18279 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18282 push @parts, $part;
18284 my $new = join("", @parts);
18285 trace "$name => $new" if main::DEBUG && $to_trace;
18288 if (rand() >= 0.5) {
18289 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18292 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18298 # Used to make sure don't generate duplicate test cases.
18299 my %test_generated;
18301 sub make_property_test_script() {
18302 # This used only for making the test script
18303 # this written directly -- it's huge.
18305 print "Making test script\n" if $verbosity >= $PROGRESS;
18307 # This uses randomness to test different possibilities without testing all
18308 # possibilities. To ensure repeatability, set the seed to 0. But if
18309 # tests are added, it will perturb all later ones in the .t file
18312 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18314 # Keep going down an order of magnitude
18315 # until find that adding this quantity to
18316 # 1 remains 1; but put an upper limit on
18317 # this so in case this algorithm doesn't
18318 # work properly on some platform, that we
18319 # won't loop forever.
18321 my $min_floating_slop = 1;
18322 while (1+ $min_floating_slop != 1
18325 my $next = $min_floating_slop / 10;
18326 last if $next == 0; # If underflows,
18328 $min_floating_slop = $next;
18331 # It doesn't matter whether the elements of this array contain single lines
18332 # or multiple lines. main::write doesn't count the lines.
18335 # Sort these so get results in same order on different runs of this
18337 foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
18338 foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
18340 # Find code points that match, and don't match this table.
18341 my $valid = $table->get_valid_code_point;
18342 my $invalid = $table->get_invalid_code_point;
18343 my $warning = ($table->status eq $DEPRECATED)
18347 # Test each possible combination of the property's aliases with
18348 # the table's. If this gets to be too many, could do what is done
18349 # in the set_final_comment() for Tables
18350 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
18351 next unless @table_aliases;
18352 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
18353 next unless @property_aliases;
18355 # Every property can be optionally be prefixed by 'Is_', so test
18356 # that those work, by creating such a new alias for each
18357 # pre-existing one.
18358 push @property_aliases, map { Alias->new("Is_" . $_->name,
18360 $_->make_re_pod_entry,
18361 $_->ok_as_filename,
18365 } @property_aliases;
18366 my $max = max(scalar @table_aliases, scalar @property_aliases);
18367 for my $j (0 .. $max - 1) {
18369 # The current alias for property is the next one on the list,
18370 # or if beyond the end, start over. Similarly for table
18372 = $property_aliases[$j % @property_aliases]->name;
18374 $property_name = "" if $table->property == $perl;
18375 my $table_alias = $table_aliases[$j % @table_aliases];
18376 my $table_name = $table_alias->name;
18377 my $loose_match = $table_alias->loose_match;
18379 # If the table doesn't have a file, any test for it is
18380 # already guaranteed to be in error
18381 my $already_error = ! $table->file_path;
18383 # Generate error cases for this alias.
18384 push @output, generate_error($property_name,
18388 # If the table is guaranteed to always generate an error,
18389 # quit now without generating success cases.
18390 next if $already_error;
18392 # Now for the success cases.
18394 if ($loose_match) {
18396 # For loose matching, create an extra test case for the
18398 my $standard = standardize($table_name);
18400 # $test_name should be a unique combination for each test
18401 # case; used just to avoid duplicate tests
18402 my $test_name = "$property_name=$standard";
18404 # Don't output duplicate test cases.
18405 if (! exists $test_generated{$test_name}) {
18406 $test_generated{$test_name} = 1;
18407 push @output, generate_tests($property_name,
18414 $random = randomize_loose_name($table_name)
18416 else { # Stricter match
18417 $random = randomize_stricter_name($table_name);
18420 # Now for the main test case for this alias.
18421 my $test_name = "$property_name=$random";
18422 if (! exists $test_generated{$test_name}) {
18423 $test_generated{$test_name} = 1;
18424 push @output, generate_tests($property_name,
18431 # If the name is a rational number, add tests for the
18432 # floating point equivalent.
18433 if ($table_name =~ qr{/}) {
18435 # Calculate the float, and find just the fraction.
18436 my $float = eval $table_name;
18437 my ($whole, $fraction)
18438 = $float =~ / (.*) \. (.*) /x;
18440 # Starting with one digit after the decimal point,
18441 # create a test for each possible precision (number of
18442 # digits past the decimal point) until well beyond the
18443 # native number found on this machine. (If we started
18444 # with 0 digits, it would be an integer, which could
18445 # well match an unrelated table)
18447 for my $i (1 .. $min_floating_slop + 3) {
18448 my $table_name = sprintf("%.*f", $i, $float);
18449 if ($i < $MIN_FRACTION_LENGTH) {
18451 # If the test case has fewer digits than the
18452 # minimum acceptable precision, it shouldn't
18453 # succeed, so we expect an error for it.
18454 # E.g., 2/3 = .7 at one decimal point, and we
18455 # shouldn't say it matches .7. We should make
18456 # it be .667 at least before agreeing that the
18457 # intent was to match 2/3. But at the
18458 # less-than- acceptable level of precision, it
18459 # might actually match an unrelated number.
18460 # So don't generate a test case if this
18461 # conflating is possible. In our example, we
18462 # don't want 2/3 matching 7/10, if there is
18463 # a 7/10 code point.
18465 (keys %nv_floating_to_rational)
18468 if abs($table_name - $existing)
18469 < $MAX_FLOATING_SLOP;
18471 push @output, generate_error($property_name,
18473 1 # 1 => already an error
18478 # Here the number of digits exceeds the
18479 # minimum we think is needed. So generate a
18480 # success test case for it.
18481 push @output, generate_tests($property_name,
18500 (map {"Test_GCB('$_');\n"} @backslash_X_tests),
18501 (map {"Test_SB('$_');\n"} @SB_tests),
18502 (map {"Test_WB('$_');\n"} @WB_tests),
18509 sub make_normalization_test_script() {
18510 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
18512 my $n_path = 'TestNorm.pl';
18514 unshift @normalization_tests, <<'END';
18518 sub ord_string { # Convert packed ords to printable string
18520 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
18521 unpack "U*", shift) . "'";
18522 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
18526 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
18527 my $display_source = ord_string($source);
18528 my $display_nfc = ord_string($nfc);
18529 my $display_nfd = ord_string($nfd);
18530 my $display_nfkc = ord_string($nfkc);
18531 my $display_nfkd = ord_string($nfkd);
18533 use Unicode::Normalize;
18535 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
18536 # nfkc == toNFC(nfkc) == toNFC(nfkd)
18539 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
18540 # nfkd == toNFD(nfkc) == toNFD(nfkd)
18543 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
18544 # toNFKC(nfkc) == toNFKC(nfkd)
18547 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
18548 # toNFKD(nfkc) == toNFKD(nfkd)
18550 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
18551 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
18552 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
18553 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
18554 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
18556 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
18557 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
18558 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
18559 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
18560 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
18562 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
18563 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
18564 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
18565 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
18566 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
18568 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
18569 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
18570 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
18571 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
18572 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
18579 @normalization_tests,
18585 # Skip reasons, so will be exact same text and hence the files with each
18586 # reason will get grouped together in perluniprops.
18587 my $Documentation = "Documentation";
18589 = "Provisional; for the analysis and processing of Indic scripts";
18590 my $Validation = "Validation Tests";
18591 my $Validation_Documentation = "Documentation of validation Tests";
18593 # This is a list of the input files and how to handle them. The files are
18594 # processed in their order in this list. Some reordering is possible if
18595 # desired, but the PropertyAliases and PropValueAliases files should be first,
18596 # and the extracted before the others (as data in an extracted file can be
18597 # over-ridden by the non-extracted. Some other files depend on data derived
18598 # from an earlier file, like UnicodeData requires data from Jamo, and the case
18599 # changing and folding requires data from Unicode. Mostly, it is safest to
18600 # order by first version releases in (except the Jamo).
18602 # The version strings allow the program to know whether to expect a file or
18603 # not, but if a file exists in the directory, it will be processed, even if it
18604 # is in a version earlier than expected, so you can copy files from a later
18605 # release into an earlier release's directory.
18606 my @input_file_objects = (
18607 Input_file->new('PropertyAliases.txt', v3.2,
18608 Handler => \&process_PropertyAliases,
18609 Early => [ \&substitute_PropertyAliases ],
18610 Required_Even_in_Debug_Skip => 1,
18612 Input_file->new(undef, v0, # No file associated with this
18613 Progress_Message => 'Finishing property setup',
18614 Handler => \&finish_property_setup,
18616 Input_file->new('PropValueAliases.txt', v3.2,
18617 Handler => \&process_PropValueAliases,
18618 Early => [ \&substitute_PropValueAliases ],
18619 Has_Missings_Defaults => $NOT_IGNORED,
18620 Required_Even_in_Debug_Skip => 1,
18622 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
18623 Property => 'General_Category',
18625 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
18626 Property => 'Canonical_Combining_Class',
18627 Has_Missings_Defaults => $NOT_IGNORED,
18629 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
18630 Property => 'Numeric_Type',
18631 Has_Missings_Defaults => $NOT_IGNORED,
18633 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
18634 Property => 'East_Asian_Width',
18635 Has_Missings_Defaults => $NOT_IGNORED,
18637 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
18638 Property => 'Line_Break',
18639 Has_Missings_Defaults => $NOT_IGNORED,
18641 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
18642 Property => 'Bidi_Class',
18643 Has_Missings_Defaults => $NOT_IGNORED,
18645 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
18646 Property => 'Decomposition_Type',
18647 Has_Missings_Defaults => $NOT_IGNORED,
18649 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
18650 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
18651 Property => 'Numeric_Value',
18652 Each_Line_Handler => \&filter_numeric_value_line,
18653 Has_Missings_Defaults => $NOT_IGNORED,
18655 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
18656 Property => 'Joining_Group',
18657 Has_Missings_Defaults => $NOT_IGNORED,
18660 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
18661 Property => 'Joining_Type',
18662 Has_Missings_Defaults => $NOT_IGNORED,
18664 Input_file->new('Jamo.txt', v2.0.0,
18665 Property => 'Jamo_Short_Name',
18666 Each_Line_Handler => \&filter_jamo_line,
18668 Input_file->new('UnicodeData.txt', v1.1.5,
18669 Pre_Handler => \&setup_UnicodeData,
18671 # We clean up this file for some early versions.
18672 Each_Line_Handler => [ (($v_version lt v2.0.0 )
18674 : ($v_version eq v2.1.5)
18675 ? \&filter_v2_1_5_ucd
18677 # And for 5.14 Perls with 6.0,
18678 # have to also make changes
18679 : ($v_version ge v6.0.0
18684 # Early versions did not have the
18685 # proper Unicode_1 names for the
18687 (($v_version lt v3.0.0)
18688 ? \&filter_early_U1_names
18691 # Early versions did not correctly
18692 # use the later method for giving
18693 # decimal digit values
18694 (($v_version le v3.2.0)
18695 ? \&filter_bad_Nd_ucd
18698 # And the main filter
18699 \&filter_UnicodeData_line,
18701 EOF_Handler => \&EOF_UnicodeData,
18703 Input_file->new('CJKXREF.TXT', v1.1.5,
18704 Withdrawn => v2.0.0,
18705 Skip => 'Gives the mapping of CJK code points '
18706 . 'between Unicode and various other standards',
18708 Input_file->new('ArabicShaping.txt', v2.0.0,
18709 Each_Line_Handler =>
18710 ($v_version lt 4.1.0)
18711 ? \&filter_old_style_arabic_shaping
18713 # The first field after the range is a "schematic name"
18715 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
18716 Has_Missings_Defaults => $NOT_IGNORED,
18718 Input_file->new('Blocks.txt', v2.0.0,
18719 Property => 'Block',
18720 Has_Missings_Defaults => $NOT_IGNORED,
18721 Each_Line_Handler => \&filter_blocks_lines
18723 Input_file->new('Index.txt', v2.0.0,
18724 Skip => 'Alphabetical index of Unicode characters',
18726 Input_file->new('NamesList.txt', v2.0.0,
18727 Skip => 'Annotated list of characters',
18729 Input_file->new('PropList.txt', v2.0.0,
18730 Each_Line_Handler => (($v_version lt v3.1.0)
18731 ? \&filter_old_style_proplist
18734 Input_file->new('Props.txt', v2.0.0,
18735 Withdrawn => v3.0.0,
18736 Skip => 'A subset of F<PropList.txt> (which is used instead)',
18738 Input_file->new('ReadMe.txt', v2.0.0,
18739 Skip => $Documentation,
18741 Input_file->new('Unihan.txt', v2.0.0,
18742 Withdrawn => v5.2.0,
18743 Construction_Time_Handler => \&construct_unihan,
18744 Pre_Handler => \&setup_unihan,
18746 'Unicode_Radical_Stroke'
18748 Each_Line_Handler => \&filter_unihan_line,
18750 Input_file->new('SpecialCasing.txt', v2.1.8,
18751 Each_Line_Handler => ($v_version eq 2.1.8)
18752 ? \&filter_2_1_8_special_casing_line
18753 : \&filter_special_casing_line,
18754 Pre_Handler => \&setup_special_casing,
18755 Has_Missings_Defaults => $IGNORED,
18758 'LineBreak.txt', v3.0.0,
18759 Has_Missings_Defaults => $NOT_IGNORED,
18760 Property => 'Line_Break',
18761 # Early versions had problematic syntax
18762 Each_Line_Handler => (($v_version lt v3.1.0)
18763 ? \&filter_early_ea_lb
18766 Input_file->new('EastAsianWidth.txt', v3.0.0,
18767 Property => 'East_Asian_Width',
18768 Has_Missings_Defaults => $NOT_IGNORED,
18769 # Early versions had problematic syntax
18770 Each_Line_Handler => (($v_version lt v3.1.0)
18771 ? \&filter_early_ea_lb
18774 Input_file->new('CompositionExclusions.txt', v3.0.0,
18775 Property => 'Composition_Exclusion',
18777 Input_file->new('UnicodeData.html', v3.0.0,
18778 Withdrawn => v4.0.1,
18779 Skip => $Documentation,
18781 Input_file->new('BidiMirroring.txt', v3.0.1,
18782 Property => 'Bidi_Mirroring_Glyph',
18783 Has_Missings_Defaults => ($v_version lt v6.2.0)
18785 # Is <none> which doesn't mean
18786 # anything to us, we will use the
18790 Input_file->new('NamesList.html', v3.0.0,
18791 Skip => 'Describes the format and contents of '
18792 . 'F<NamesList.txt>',
18794 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
18796 Skip => $Documentation,
18798 Input_file->new('CaseFolding.txt', v3.0.1,
18799 Pre_Handler => \&setup_case_folding,
18800 Each_Line_Handler =>
18801 [ ($v_version lt v3.1.0)
18802 ? \&filter_old_style_case_folding
18804 \&filter_case_folding_line
18806 Has_Missings_Defaults => $IGNORED,
18808 Input_file->new("NormTest.txt", v3.0.1,
18809 Handler => \&process_NormalizationsTest,
18810 Skip => ($make_norm_test_script) ? 0 : $Validation,
18812 Input_file->new('DCoreProperties.txt', v3.1.0,
18813 # 5.2 changed this file
18814 Has_Missings_Defaults => (($v_version ge v5.2.0)
18818 Input_file->new('DProperties.html', v3.1.0,
18819 Withdrawn => v3.2.0,
18820 Skip => $Documentation,
18822 Input_file->new('PropList.html', v3.1.0,
18824 Skip => $Documentation,
18826 Input_file->new('Scripts.txt', v3.1.0,
18827 Property => 'Script',
18828 Each_Line_Handler => (($v_version le v4.0.0)
18829 ? \&filter_all_caps_script_names
18831 Has_Missings_Defaults => $NOT_IGNORED,
18833 Input_file->new('DNormalizationProps.txt', v3.1.0,
18834 Has_Missings_Defaults => $NOT_IGNORED,
18835 Each_Line_Handler => (($v_version lt v4.0.1)
18836 ? \&filter_old_style_normalization_lines
18839 Input_file->new('DerivedProperties.html', v3.1.1,
18841 Skip => $Documentation,
18843 Input_file->new('DAge.txt', v3.2.0,
18844 Has_Missings_Defaults => $NOT_IGNORED,
18847 Input_file->new('HangulSyllableType.txt', v4.0,
18848 Has_Missings_Defaults => $NOT_IGNORED,
18849 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
18850 Property => 'Hangul_Syllable_Type'
18852 Input_file->new('NormalizationCorrections.txt', v3.2.0,
18853 # This documents the cumulative fixes to erroneous
18854 # normalizations in earlier Unicode versions. Its main
18855 # purpose is so that someone running on an earlier
18856 # version can use this file to override what got
18857 # published in that earlier release. It would be easy
18858 # for mktables to handle this file. But all the
18859 # corrections in it should already be in the other files
18860 # for the release it is. To get it to actually mean
18861 # something useful, someone would have to be using an
18862 # earlier Unicode release, and copy it into the directory
18863 # for that release and recomplile. So far there has been
18864 # no demand to do that, so this hasn't been implemented.
18865 Skip => 'Documentation of corrections already '
18866 . 'incorporated into the Unicode data base',
18868 Input_file->new('StandardizedVariants.html', v3.2.0,
18869 Skip => 'Provides a visual display of the standard '
18870 . 'variant sequences derived from '
18871 . 'F<StandardizedVariants.txt>.',
18872 # I don't know why the html came earlier than the
18873 # .txt, but both are skipped anyway, so it doesn't
18876 Input_file->new('StandardizedVariants.txt', v4.0.0,
18877 Skip => 'Certain glyph variations for character display '
18878 . 'are standardized. This lists the non-Unihan '
18879 . 'ones; the Unihan ones are also not used by '
18880 . 'Perl, and are in a separate Unicode data base '
18881 . 'L<http://www.unicode.org/ivd>',
18883 Input_file->new('UCD.html', v4.0.0,
18885 Skip => $Documentation,
18887 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
18888 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
18889 Property => 'Word_Break',
18890 Has_Missings_Defaults => $NOT_IGNORED,
18892 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
18893 Early => [ \&generate_GCB, '_Perl_GCB' ],
18894 Property => 'Grapheme_Cluster_Break',
18895 Has_Missings_Defaults => $NOT_IGNORED,
18897 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
18898 Handler => \&process_GCB_test,
18900 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
18901 Skip => $Validation_Documentation,
18903 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
18904 Handler => \&process_SB_test,
18906 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
18907 Skip => $Validation_Documentation,
18909 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
18910 Handler => \&process_WB_test,
18912 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
18913 Skip => $Validation_Documentation,
18915 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
18916 Property => 'Sentence_Break',
18917 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
18918 Has_Missings_Defaults => $NOT_IGNORED,
18920 Input_file->new('NamedSequences.txt', v4.1.0,
18921 Handler => \&process_NamedSequences
18923 Input_file->new('Unihan.html', v4.1.0,
18925 Skip => $Documentation,
18927 Input_file->new('NameAliases.txt', v5.0,
18928 Property => 'Name_Alias',
18929 Each_Line_Handler => ($v_version le v6.0.0)
18930 ? \&filter_early_version_name_alias_line
18931 : \&filter_later_version_name_alias_line,
18933 # NameAliases.txt came along in v5.0. The above constructor handles
18934 # this. But until 6.1, it was lacking some information needed by core
18935 # perl. The constructor below handles that. It is either a kludge or
18936 # clever, depending on your point of view. The 'Withdrawn' parameter
18937 # indicates not to use it at all starting in 6.1 (so the above
18938 # constructor applies), and the 'v6.1' parameter indicates to use the
18939 # Early parameter before 6.1. Therefore 'Early" is always used,
18940 # yielding the internal-only property '_Perl_Name_Alias', which it
18941 # gets from a NameAliases.txt from 6.1 or later stored in
18942 # N_Asubst.txt. In combination with the above constructor,
18943 # 'Name_Alias' is publicly accessible starting with v5.0, and the
18944 # better 6.1 version is accessible to perl core in all releases.
18945 Input_file->new("NameAliases.txt", v6.1,
18947 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
18948 Property => 'Name_Alias',
18949 EOF_Handler => \&fixup_early_perl_name_alias,
18950 Each_Line_Handler =>
18951 \&filter_later_version_name_alias_line,
18953 Input_file->new('NamedSqProv.txt', v5.0.0,
18954 Skip => 'Named sequences proposed for inclusion in a '
18955 . 'later version of the Unicode Standard; if you '
18956 . 'need them now, you can append this file to '
18957 . 'F<NamedSequences.txt> and recompile perl',
18959 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
18960 Skip => $Validation,
18962 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
18963 Skip => $Validation_Documentation,
18965 Input_file->new("BidiTest.txt", v5.2.0,
18966 Skip => $Validation,
18968 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
18970 Each_Line_Handler => \&filter_unihan_line,
18972 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
18974 Each_Line_Handler => \&filter_unihan_line,
18976 Input_file->new('UnihanIRGSources.txt', v5.2.0,
18978 'kCompatibilityVariant',
18990 Pre_Handler => \&setup_unihan,
18991 Each_Line_Handler => \&filter_unihan_line,
18993 Input_file->new('UnihanNumericValues.txt', v5.2.0,
18995 'kAccountingNumeric',
18999 Each_Line_Handler => \&filter_unihan_line,
19001 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19003 Each_Line_Handler => \&filter_unihan_line,
19005 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19007 'Unicode_Radical_Stroke'
19009 Each_Line_Handler => \&filter_unihan_line,
19011 Input_file->new('UnihanReadings.txt', v5.2.0,
19013 Each_Line_Handler => \&filter_unihan_line,
19015 Input_file->new('UnihanVariants.txt', v5.2.0,
19017 Each_Line_Handler => \&filter_unihan_line,
19019 Input_file->new('CJKRadicals.txt', v5.2.0,
19020 Skip => 'Maps the kRSUnicode property values to '
19021 . 'corresponding code points',
19023 Input_file->new('EmojiSources.txt', v6.0.0,
19024 Skip => 'Maps certain Unicode code points to their '
19025 . 'legacy Japanese cell-phone values',
19027 Input_file->new('ScriptExtensions.txt', v6.0.0,
19028 Property => 'Script_Extensions',
19029 Pre_Handler => \&setup_script_extensions,
19030 Each_Line_Handler => \&filter_script_extensions_line,
19031 Has_Missings_Defaults => (($v_version le v6.0.0)
19035 # These two Indic files are actually not usable as-is until 6.1.0,
19036 # because their property values are missing from PropValueAliases.txt
19037 # until that release, so that further work would have to be done to get
19038 # them to work properly, which isn't worth it because of them being
19040 Input_file->new('IndicMatraCategory.txt', v6.0.0,
19041 Withdrawn => v8.0.0,
19042 Property => 'Indic_Matra_Category',
19043 Has_Missings_Defaults => $NOT_IGNORED,
19044 Skip => $Indic_Skip,
19046 Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19047 Property => 'Indic_Syllabic_Category',
19048 Has_Missings_Defaults => $NOT_IGNORED,
19049 Skip => (($v_version lt v8.0.0)
19053 Input_file->new('USourceData.txt', v6.2.0,
19054 Skip => 'Documentation of status and cross reference of '
19055 . 'proposals for encoding by Unicode of Unihan '
19058 Input_file->new('USourceGlyphs.pdf', v6.2.0,
19059 Skip => 'Pictures of the characters in F<USourceData.txt>',
19061 Input_file->new('BidiBrackets.txt', v6.3.0,
19062 Properties => [ 'Bidi_Paired_Bracket',
19063 'Bidi_Paired_Bracket_Type'
19065 Has_Missings_Defaults => $NO_DEFAULTS,
19067 Input_file->new("BidiCharacterTest.txt", v6.3.0,
19068 Skip => $Validation,
19070 Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19071 Property => 'Indic_Positional_Category',
19072 Has_Missings_Defaults => $NOT_IGNORED,
19076 # End of all the preliminaries.
19079 if (@missing_early_files) {
19080 print simple_fold(join_lines(<<END
19082 The compilation cannot be completed because one or more required input files,
19083 listed below, are missing. This is because you are compiling Unicode version
19084 $unicode_version, which predates the existence of these file(s). To fully
19085 function, perl needs the data that these files would have contained if they
19086 had been in this release. To work around this, create copies of later
19087 versions of the missing files in the directory containing '$0'. (Perl will
19088 make the necessary adjustments to the data to compensate for it not being the
19089 same version as is being compiled.) The files are available from unicode.org,
19090 via either ftp or http. If using http, they will be under
19091 www.unicode.org/versions/. Below are listed the source file name of each
19092 missing file, the Unicode version to copy it from, and the name to store it
19093 as. (Note that the listed source file name may not be exactly the one that
19094 Unicode calls it. If you don't find it, you can look it up in 'README.perl'
19095 to get the correct name.)
19098 print simple_fold(join_lines("\n$_")) for @missing_early_files;
19102 if ($compare_versions) {
19103 Carp::my_carp(<<END
19104 Warning. \$compare_versions is set. Output is not suitable for production
19109 # Put into %potential_files a list of all the files in the directory structure
19110 # that could be inputs to this program
19113 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the
19115 my $full = lc(File::Spec->rel2abs($_));
19116 $potential_files{$full} = 1;
19119 }, File::Spec->curdir());
19121 my @mktables_list_output_files;
19122 my $old_start_time = 0;
19123 my $old_options = "";
19125 if (! -e $file_list) {
19126 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19127 $write_unchanged_files = 1;
19128 } elsif ($write_unchanged_files) {
19129 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19132 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19134 if (! open $file_handle, "<", $file_list) {
19135 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19141 # Read and parse mktables.lst, placing the results from the first part
19142 # into @input, and the second part into @mktables_list_output_files
19143 for my $list ( \@input, \@mktables_list_output_files ) {
19144 while (<$file_handle>) {
19145 s/^ \s+ | \s+ $//xg;
19146 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
19147 $old_start_time = $1;
19150 if (/^ \s* \# \s* From\ options\ (.+) /x) {
19154 next if /^ \s* (?: \# .* )? $/x;
19156 my ( $file ) = split /\t/;
19157 push @$list, $file;
19159 @$list = uniques(@$list);
19163 # Look through all the input files
19164 foreach my $input (@input) {
19165 next if $input eq 'version'; # Already have checked this.
19167 # Ignore if doesn't exist. The checking about whether we care or
19168 # not is done via the Input_file object.
19169 next if ! file_exists($input);
19171 # The paths are stored with relative names, and with '/' as the
19172 # delimiter; convert to absolute on this machine
19173 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
19174 $potential_files{lc $full} = 1;
19178 close $file_handle;
19183 # Here wants to process all .txt files in the directory structure.
19184 # Convert them to full path names. They are stored in the platform's
19187 foreach my $object (@input_file_objects) {
19188 my $file = $object->file;
19189 next unless defined $file;
19190 push @known_files, File::Spec->rel2abs($file);
19193 my @unknown_input_files;
19194 foreach my $file (keys %potential_files) { # The keys are stored in lc
19195 next if grep { $file eq lc($_) } @known_files;
19197 # Here, the file is unknown to us. Get relative path name
19198 $file = File::Spec->abs2rel($file);
19199 push @unknown_input_files, $file;
19201 # What will happen is we create a data structure for it, and add it to
19202 # the list of input files to process. First get the subdirectories
19204 my (undef, $directories, undef) = File::Spec->splitpath($file);
19205 $directories =~ s;/$;;; # Can have extraneous trailing '/'
19206 my @directories = File::Spec->splitdir($directories);
19208 # If the file isn't extracted (meaning none of the directories is the
19209 # extracted one), just add it to the end of the list of inputs.
19210 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
19211 push @input_file_objects, Input_file->new($file, v0);
19215 # Here, the file is extracted. It needs to go ahead of most other
19216 # processing. Search for the first input file that isn't a
19217 # special required property (that is, find one whose first_release
19218 # is non-0), and isn't extracted. Also, the Age property file is
19219 # processed before the extracted ones, just in case
19220 # $compare_versions is set.
19221 for (my $i = 0; $i < @input_file_objects; $i++) {
19222 if ($input_file_objects[$i]->first_released ne v0
19223 && lc($input_file_objects[$i]->file) ne 'dage.txt'
19224 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
19226 splice @input_file_objects, $i, 0,
19227 Input_file->new($file, v0);
19234 if (@unknown_input_files) {
19235 print STDERR simple_fold(join_lines(<<END
19237 The following files are unknown as to how to handle. Assuming they are
19238 typical property files. You'll know by later error messages if it worked or
19241 ) . " " . join(", ", @unknown_input_files) . "\n\n");
19243 } # End of looking through directory structure for more .txt files.
19245 # Create the list of input files from the objects we have defined, plus
19247 my @input_files = qw(version Makefile);
19248 foreach my $object (@input_file_objects) {
19249 my $file = $object->file;
19250 next if ! defined $file; # Not all objects have files
19251 next if defined $object->skip;;
19252 push @input_files, $file;
19255 if ( $verbosity >= $VERBOSE ) {
19256 print "Expecting ".scalar( @input_files )." input files. ",
19257 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
19260 # We set $most_recent to be the most recently changed input file, including
19261 # this program itself (done much earlier in this file)
19262 foreach my $in (@input_files) {
19263 next unless -e $in; # Keep going even if missing a file
19264 my $mod_time = (stat $in)[9];
19265 $most_recent = $mod_time if $mod_time > $most_recent;
19267 # See that the input files have distinct names, to warn someone if they
19268 # are adding a new one
19270 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
19271 $directories =~ s;/$;;; # Can have extraneous trailing '/'
19272 my @directories = File::Spec->splitdir($directories);
19273 construct_filename($file, 'mutable', \@directories);
19277 # We use 'Makefile' just to see if it has changed since the last time we
19278 # rebuilt. Now discard it.
19279 @input_files = grep { $_ ne 'Makefile' } @input_files;
19281 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
19282 || ! scalar @mktables_list_output_files # or if no outputs known
19283 || $old_start_time < $most_recent # or out-of-date
19284 || $old_options ne $command_line_arguments; # or with different
19287 # Now we check to see if any output files are older than youngest, if
19288 # they are, we need to continue on, otherwise we can presumably bail.
19290 foreach my $out (@mktables_list_output_files) {
19291 if ( ! file_exists($out)) {
19292 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
19296 #local $to_trace = 1 if main::DEBUG;
19297 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
19298 if ( (stat $out)[9] <= $most_recent ) {
19299 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
19300 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
19307 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
19310 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
19312 # Ready to do the major processing. First create the perl pseudo-property.
19313 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
19315 # Process each input file
19316 foreach my $file (@input_file_objects) {
19320 # Finish the table generation.
19322 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
19325 # For the very specialized case of comparing two Unicode versions...
19326 if (DEBUG && $compare_versions) {
19327 handle_compare_versions();
19330 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
19333 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
19334 add_perl_synonyms();
19336 print "Writing tables\n" if $verbosity >= $PROGRESS;
19337 write_all_tables();
19339 # Write mktables.lst
19340 if ( $file_list and $make_list ) {
19342 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
19343 foreach my $file (@input_files, @files_actually_output) {
19344 my (undef, $directories, $file) = File::Spec->splitpath($file);
19345 my @directories = File::Spec->splitdir($directories);
19346 $file = join '/', @directories, $file;
19350 if (! open $ofh,">",$file_list) {
19351 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
19355 my $localtime = localtime $start_time;
19356 print $ofh <<"END";
19358 # $file_list -- File list for $0.
19360 # Autogenerated starting on $start_time ($localtime)
19361 # From options $command_line_arguments
19363 # - First section is input files
19364 # ($0 itself is not listed but is automatically considered an input)
19365 # - Section separator is /^=+\$/
19366 # - Second section is a list of output files.
19367 # - Lines matching /^\\s*#/ are treated as comments
19368 # which along with blank lines are ignored.
19374 print $ofh "$_\n" for sort(@input_files);
19375 print $ofh "\n=================================\n# Output files:\n\n";
19376 print $ofh "$_\n" for sort @files_actually_output;
19377 print $ofh "\n# ",scalar(@input_files)," input files\n",
19378 "# ",scalar(@files_actually_output)+1," output files\n\n",
19381 or Carp::my_carp("Failed to close $ofh: $!");
19383 print "Filelist has ",scalar(@input_files)," input files and ",
19384 scalar(@files_actually_output)+1," output files\n"
19385 if $verbosity >= $VERBOSE;
19389 # Output these warnings unless -q explicitly specified.
19390 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
19391 if (@unhandled_properties) {
19392 print "\nProperties and tables that unexpectedly have no code points\n";
19393 foreach my $property (sort @unhandled_properties) {
19394 print $property, "\n";
19398 if (%potential_files) {
19399 print "\nInput files that are not considered:\n";
19400 foreach my $file (sort keys %potential_files) {
19401 print File::Spec->abs2rel($file), "\n";
19404 print "\nAll done\n" if $verbosity >= $VERBOSE;
19408 # TRAILING CODE IS USED BY make_property_test_script()
19414 # Test qr/\X/ and the \p{} regular expression constructs. This file is
19415 # constructed by mktables from the tables it generates, so if mktables is
19416 # buggy, this won't necessarily catch those bugs. Tests are generated for all
19417 # feasible properties; a few aren't currently feasible; see
19418 # is_code_point_usable() in mktables for details.
19420 # Standard test packages are not used because this manipulates SIG_WARN. It
19421 # exits 0 if every non-skipped test succeeded; -1 if any failed.
19426 # loc_tools.pl requires this function to be defined
19428 my ($pass, @msg) = @_;
19429 print "not " unless $pass;
19432 print " - ", join "", @msg if @msg;
19437 my $expected = shift;
19440 my $warning_type = shift; # Type of warning message, like 'deprecated'
19442 my $line = (caller)[2];
19444 # Convert the code point to hex form
19445 my $string = sprintf "\"\\x{%04X}\"", $ord;
19449 # The first time through, use all warnings. If the input should generate
19450 # a warning, add another time through with them turned off
19451 push @tests, "no warnings '$warning_type';" if $warning_type;
19453 foreach my $no_warnings (@tests) {
19455 # Store any warning messages instead of outputting them
19456 local $SIG{__WARN__} = $SIG{__WARN__};
19457 my $warning_message;
19458 $SIG{__WARN__} = sub { $warning_message = $_[0] };
19462 # A string eval is needed because of the 'no warnings'.
19463 # Assumes no parens in the regular expression
19464 my $result = eval "$no_warnings
19465 my \$RegObj = qr($regex);
19466 $string =~ \$RegObj ? 1 : 0";
19467 if (not defined $result) {
19468 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
19471 elsif ($result ^ $expected) {
19472 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
19475 elsif ($warning_message) {
19476 if (! $warning_type || ($warning_type && $no_warnings)) {
19477 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
19481 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
19484 elsif ($warning_type && ! $no_warnings) {
19485 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
19489 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
19498 if (eval { 'x' =~ qr/$regex/; 1 }) {
19500 my $line = (caller)[2];
19501 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
19504 my $line = (caller)[2];
19505 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
19510 # Break test files (e.g. GCBTest.txt) character that break allowed here
19511 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
19512 utf8::upgrade($breakable_utf8);
19514 # Break test files (e.g. GCBTest.txt) character that indicates can't break
19516 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
19517 utf8::upgrade($nobreak_utf8);
19521 chdir 't' if -d 't';
19522 eval { require "./loc_tools.pl" };
19523 $utf8_locale = &find_utf8_ctype_locale if defined &find_utf8_ctype_locale;
19525 sub _test_break($$) {
19526 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
19527 # Each such line is a sequence of code points given by their hex numbers,
19528 # separated by the two characters defined just before this subroutine that
19529 # indicate that either there can or cannot be a break between the adjacent
19530 # code points. If there isn't a break, that means the sequence forms an
19531 # extended grapheme cluster, which means that \X should match the whole
19532 # thing. If there is a break, \X should stop there. This is all
19533 # converted by this routine into a match:
19534 # $string =~ /(\X)/,
19535 # Each \X should match the next cluster; and that is what is checked.
19537 my $template = shift;
19538 my $break_type = shift;
19540 my $line = (caller 1)[2]; # Line number
19542 # The line contains characters above the ASCII range, but in Latin1. It
19543 # may or may not be in utf8, and if it is, it may or may not know it. So,
19544 # convert these characters to 8 bits. If knows is in utf8, simply
19546 if (utf8::is_utf8($template)) {
19547 utf8::downgrade($template);
19550 # Otherwise, if it is in utf8, but doesn't know it, the next lines
19551 # convert the two problematic characters to their 8-bit equivalents.
19552 # If it isn't in utf8, they don't harm anything.
19554 $template =~ s/$nobreak_utf8/$nobreak/g;
19555 $template =~ s/$breakable_utf8/$breakable/g;
19558 # The input is just the break/no-break symbols and sequences of Unicode
19559 # code points as hex digits separated by spaces for legibility. e.g.:
19560 # ÷ 0020 × 0308 ÷ 0020 ÷
19561 # Convert to native \x format
19562 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
19563 $template =~ s/ \s* //gx; # Probably the line above removed all spaces;
19566 # Make a copy of the input with the symbols replaced by \b{} and \B{} as
19568 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
19569 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
19571 my $display_string = $template =~ s/[$breakable$nobreak]//gr;
19572 my $string = eval "\"$display_string\"";
19574 # The remaining massaging of the input is for the \X tests. Get rid of
19575 # the leading and trailing breakables
19576 $template =~ s/^ \s* $breakable \s* //x;
19577 $template =~ s/ \s* $breakable \s* $ //x;
19580 $template =~ s/ \s* $nobreak \s* //xg;
19582 # Split the input into segments that are breakable between them.
19583 my @should_display = split /\s*$breakable\s*/, $template;
19584 my @should_match = map { eval "\"$_\"" } @should_display;
19586 # If a string can be represented in both non-ut8 and utf8, test both cases
19587 my $display_upgrade = "";
19589 for my $to_upgrade (0 .. 1) {
19593 # If already in utf8, would just be a repeat
19594 next UPGRADE if utf8::is_utf8($string);
19596 utf8::upgrade($string);
19597 $display_upgrade = " (utf8-upgraded)";
19600 # The /l modifier has C after it to indicate the locale to try
19601 my @modifiers = qw(a aa d lC u i);
19602 push @modifiers, "l$utf8_locale" if defined $utf8_locale;
19604 # Test for each of the regex modifiers.
19605 for my $modifier (@modifiers) {
19606 my $display_locale = "";
19608 # For /l, set the locale to what it says to.
19609 if ($modifier =~ / ^ l (.*) /x) {
19611 $display_locale = "(locale = $locale)";
19613 if (defined $Config{d_setlocale}) {
19614 eval { require POSIX; import POSIX 'locale_h'; };
19615 if (defined &POSIX::LC_CTYPE) {
19616 POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
19622 no warnings qw(locale regexp surrogate);
19623 my $pattern = "(?$modifier:$break_pattern)";
19625 # Actually do the test
19626 my $matched = $string =~ qr/$pattern/;
19627 print "not " unless $matched;
19629 # Fancy display of test results
19630 $matched = ($matched) ? "matched" : "failed to match";
19631 print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
19633 # Repeat with the first \B{} in the pattern. This makes sure the
19634 # code in regexec.c:find_byclass() for \B gets executed
19635 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
19636 my $B_pattern = "$1$2";
19637 $matched = $string =~ qr/$B_pattern/;
19638 print "not " unless $matched;
19639 print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
19643 next if $break_type ne 'gcb';
19645 # Finally, do the \X match.
19646 my @matches = $string =~ /(\X)/g;
19648 # Look through each matched cluster to verify that it matches what we
19650 my $min = (@matches < @should_match) ? @matches : @should_match;
19651 for my $i (0 .. $min - 1) {
19653 if ($matches[$i] eq $should_match[$i]) {
19654 print "ok $Tests - ";
19656 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
19658 print "And \\X #", $i + 1,
19660 print " correctly matched $should_display[$i]; line $line\n";
19662 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
19663 split "", $matches[$i]);
19664 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
19666 " should have matched $should_display[$i]",
19667 " but instead matched $matches[$i]",
19668 ". Abandoning rest of line $line\n";
19673 # And the number of matches should equal the number of expected matches.
19675 if (@matches == @should_match) {
19676 print "ok $Tests - Nothing was left over; line $line\n";
19678 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
19686 _test_break(shift, 'gcb');
19690 _test_break(shift, 'sb');
19694 _test_break(shift, 'wb');
19698 print "1..$Tests\n";
19699 exit($Fails ? -1 : 0);
19702 Error('\p{Script=InGreek}'); # Bug #69018
19703 Test_GCB("1100 $nobreak 1161"); # Bug #70940
19704 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
19705 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
19706 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726