3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr = do { no overloading; pack 'J', $self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted the change to main::objaddr, and subsequent commits that
17 # changed 0+$self to pack 'J', $self.)
20 BEGIN { # Get the time the script started running; do it at compilation to
21 # get it as close as possible
36 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
39 ##########################################################################
41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
43 # a pod file and a .t file
45 # The structure of this file is:
46 # First these introductory comments; then
47 # code needed for everywhere, such as debugging stuff; then
48 # code to handle input parameters; then
49 # data structures likely to be of external interest (some of which depend on
50 # the input parameters, so follows them; then
51 # more data structures and subroutine and package (class) definitions; then
52 # the small actual loop to process the input files and finish up; then
53 # a __DATA__ section, for the .t tests
55 # This program works on all releases of Unicode through at least 6.0. The
56 # outputs have been scrutinized most intently for release 5.1. The others
57 # have been checked for somewhat more than just sanity. It can handle all
58 # existing Unicode character properties in those releases.
60 # This program is mostly about Unicode character (or code point) properties.
61 # A property describes some attribute or quality of a code point, like if it
62 # is lowercase or not, its name, what version of Unicode it was first defined
63 # in, or what its uppercase equivalent is. Unicode deals with these disparate
64 # possibilities by making all properties into mappings from each code point
65 # into some corresponding value. In the case of it being lowercase or not,
66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67 # property maps each Unicode code point to a single value, called a "property
68 # value". (Hence each Unicode property is a true mathematical function with
69 # exactly one value per code point.)
71 # When using a property in a regular expression, what is desired isn't the
72 # mapping of the code point to its property's value, but the reverse (or the
73 # mathematical "inverse relation"): starting with the property value, "Does a
74 # code point map to it?" These are written in a "compound" form:
75 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
76 # files containing the lists of code points that map to each such regular
77 # expression property value, one file per list
79 # There is also a single form shortcut that Perl adds for many of the commonly
80 # used properties. This happens for all binary properties, plus script,
81 # general_category, and block properties.
83 # Thus the outputs of this program are files. There are map files, mostly in
84 # the 'To' directory; and there are list files for use in regular expression
85 # matching, all in subdirectories of the 'lib' directory, with each
86 # subdirectory being named for the property that the lists in it are for.
87 # Bookkeeping, test, and documentation files are also generated.
89 my $matches_directory = 'lib'; # Where match (\p{}) files go.
90 my $map_directory = 'To'; # Where map files go.
94 # The major data structures of this program are Property, of course, but also
95 # Table. There are two kinds of tables, very similar to each other.
96 # "Match_Table" is the data structure giving the list of code points that have
97 # a particular property value, mentioned above. There is also a "Map_Table"
98 # data structure which gives the property's mapping from code point to value.
99 # There are two structures because the match tables need to be combined in
100 # various ways, such as constructing unions, intersections, complements, etc.,
101 # and the map ones don't. And there would be problems, perhaps subtle, if
102 # a map table were inadvertently operated on in some of those ways.
103 # The use of separate classes with operations defined on one but not the other
104 # prevents accidentally confusing the two.
106 # At the heart of each table's data structure is a "Range_List", which is just
107 # an ordered list of "Ranges", plus ancillary information, and methods to
108 # operate on them. A Range is a compact way to store property information.
109 # Each range has a starting code point, an ending code point, and a value that
110 # is meant to apply to all the code points between the two end points,
111 # inclusive. For a map table, this value is the property value for those
112 # code points. Two such ranges could be written like this:
113 # 0x41 .. 0x5A, 'Upper',
114 # 0x61 .. 0x7A, 'Lower'
116 # Each range also has a type used as a convenience to classify the values.
117 # Most ranges in this program will be Type 0, or normal, but there are some
118 # ranges that have a non-zero type. These are used only in map tables, and
119 # are for mappings that don't fit into the normal scheme of things. Mappings
120 # that require a hash entry to communicate with utf8.c are one example;
121 # another example is mappings for charnames.pm to use which indicate a name
122 # that is algorithmically determinable from its code point (and vice-versa).
123 # These are used to significantly compact these tables, instead of listing
124 # each one of the tens of thousands individually.
126 # In a match table, the value of a range is irrelevant (and hence the type as
127 # well, which will always be 0), and arbitrarily set to the null string.
128 # Using the example above, there would be two match tables for those two
129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
130 # named Lower would contain 0x61..0x7A.
132 # Actually, there are two types of range lists, "Range_Map" is the one
133 # associated with map tables, and "Range_List" with match tables.
134 # Again, this is so that methods can be defined on one and not the other so as
135 # to prevent operating on them in incorrect ways.
137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
138 # in the perl core. All tables could in theory be written, but some are
139 # suppressed because there is no current practical use for them. It is easy
140 # to change which get written by changing various lists that are near the top
141 # of the actual code in this file. The table data structures contain enough
142 # ancillary information to allow them to be treated as separate entities for
143 # writing, such as the path to each one's file. There is a heading in each
144 # map table that gives the format of its entries, and what the map is for all
145 # the code points missing from it. (This allows tables to be more compact.)
147 # The Property data structure contains one or more tables. All properties
148 # contain a map table (except the $perl property which is a
149 # pseudo-property containing only match tables), and any properties that
150 # are usable in regular expression matches also contain various matching
151 # tables, one for each value the property can have. A binary property can
152 # have two values, True and False (or Y and N, which are preferred by Unicode
153 # terminology). Thus each of these properties will have a map table that
154 # takes every code point and maps it to Y or N (but having ranges cuts the
155 # number of entries in that table way down), and two match tables, one
156 # which has a list of all the code points that map to Y, and one for all the
157 # code points that map to N. (For each of these, a third table is also
158 # generated for the pseudo Perl property. It contains the identical code
159 # points as the Y table, but can be written, not in the compound form, but in
160 # a "single" form like \p{IsUppercase}.) Many properties are binary, but some
161 # properties have several possible values, some have many, and properties like
162 # Name have a different value for every named code point. Those will not,
163 # unless the controlling lists are changed, have their match tables written
164 # out. But all the ones which can be used in regular expression \p{} and \P{}
165 # constructs will. Prior to 5.14, generally a property would have either its
166 # map table or its match tables written but not both. Again, what gets
167 # written is controlled by lists which can easily be changed. Starting in
168 # 5.14, advantage was taken of this, and all the map tables needed to
169 # reconstruct the Unicode db are now written out, while suppressing the
170 # Unicode .txt files that contain the data. Our tables are much more compact
171 # than the .txt files, so a significant space savings was achieved.
173 # Properties have a 'Type', like binary, or string, or enum depending on how
174 # many match tables there are and the content of the maps. This 'Type' is
175 # different than a range 'Type', so don't get confused by the two concepts
176 # having the same name.
178 # For information about the Unicode properties, see Unicode's UAX44 document:
180 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
182 # As stated earlier, this program will work on any release of Unicode so far.
183 # Most obvious problems in earlier data have NOT been corrected except when
184 # necessary to make Perl or this program work reasonably. For example, no
185 # folding information was given in early releases, so this program substitutes
186 # lower case instead, just so that a regular expression with the /i option
187 # will do something that actually gives the right results in many cases.
188 # There are also a couple other corrections for version 1.1.5, commented at
189 # the point they are made. As an example of corrections that weren't made
190 # (but could be) is this statement from DerivedAge.txt: "The supplementary
191 # private use code points and the non-character code points were assigned in
192 # version 2.0, but not specifically listed in the UCD until versions 3.0 and
193 # 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
194 # on Unicode version glitches is further down in these introductory comments.
196 # This program works on all non-provisional properties as of 6.0, though the
197 # files for some are suppressed from apparent lack of demand for them. You
198 # can change which are output by changing lists in this program.
200 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
201 # loose matchings rules (from Unicode TR18):
203 # The recommended names for UCD properties and property values are in
204 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
205 # [PropValue]. There are both abbreviated names and longer, more
206 # descriptive names. It is strongly recommended that both names be
207 # recognized, and that loose matching of property names be used,
208 # whereby the case distinctions, whitespace, hyphens, and underbar
210 # The program still allows Fuzzy to override its determination of if loose
211 # matching should be used, but it isn't currently used, as it is no longer
212 # needed; the calculations it makes are good enough.
214 # SUMMARY OF HOW IT WORKS:
218 # A list is constructed containing each input file that is to be processed
220 # Each file on the list is processed in a loop, using the associated handler
222 # The PropertyAliases.txt and PropValueAliases.txt files are processed
223 # first. These files name the properties and property values.
224 # Objects are created of all the property and property value names
225 # that the rest of the input should expect, including all synonyms.
226 # The other input files give mappings from properties to property
227 # values. That is, they list code points and say what the mapping
228 # is under the given property. Some files give the mappings for
229 # just one property; and some for many. This program goes through
230 # each file and populates the properties from them. Some properties
231 # are listed in more than one file, and Unicode has set up a
232 # precedence as to which has priority if there is a conflict. Thus
233 # the order of processing matters, and this program handles the
234 # conflict possibility by processing the overriding input files
235 # last, so that if necessary they replace earlier values.
236 # After this is all done, the program creates the property mappings not
237 # furnished by Unicode, but derivable from what it does give.
238 # The tables of code points that match each property value in each
239 # property that is accessible by regular expressions are created.
240 # The Perl-defined properties are created and populated. Many of these
241 # require data determined from the earlier steps
242 # Any Perl-defined synonyms are created, and name clashes between Perl
243 # and Unicode are reconciled and warned about.
244 # All the properties are written to files
245 # Any other files are written, and final warnings issued.
247 # For clarity, a number of operators have been overloaded to work on tables:
248 # ~ means invert (take all characters not in the set). The more
249 # conventional '!' is not used because of the possibility of confusing
250 # it with the actual boolean operation.
252 # - means subtraction
253 # & means intersection
254 # The precedence of these is the order listed. Parentheses should be
255 # copiously used. These are not a general scheme. The operations aren't
256 # defined for a number of things, deliberately, to avoid getting into trouble.
257 # Operations are done on references and affect the underlying structures, so
258 # that the copy constructors for them have been overloaded to not return a new
259 # clone, but the input object itself.
261 # The bool operator is deliberately not overloaded to avoid confusion with
262 # "should it mean if the object merely exists, or also is non-empty?".
264 # WHY CERTAIN DESIGN DECISIONS WERE MADE
266 # This program needs to be able to run under miniperl. Therefore, it uses a
267 # minimum of other modules, and hence implements some things itself that could
268 # be gotten from CPAN
270 # This program uses inputs published by the Unicode Consortium. These can
271 # change incompatibly between releases without the Perl maintainers realizing
272 # it. Therefore this program is now designed to try to flag these. It looks
273 # at the directories where the inputs are, and flags any unrecognized files.
274 # It keeps track of all the properties in the files it handles, and flags any
275 # that it doesn't know how to handle. It also flags any input lines that
276 # don't match the expected syntax, among other checks.
278 # It is also designed so if a new input file matches one of the known
279 # templates, one hopefully just needs to add it to a list to have it
282 # As mentioned earlier, some properties are given in more than one file. In
283 # particular, the files in the extracted directory are supposedly just
284 # reformattings of the others. But they contain information not easily
285 # derivable from the other files, including results for Unihan, which this
286 # program doesn't ordinarily look at, and for unassigned code points. They
287 # also have historically had errors or been incomplete. In an attempt to
288 # create the best possible data, this program thus processes them first to
289 # glean information missing from the other files; then processes those other
290 # files to override any errors in the extracted ones. Much of the design was
291 # driven by this need to store things and then possibly override them.
293 # It tries to keep fatal errors to a minimum, to generate something usable for
294 # testing purposes. It always looks for files that could be inputs, and will
295 # warn about any that it doesn't know how to handle (the -q option suppresses
298 # Why is there more than one type of range?
299 # This simplified things. There are some very specialized code points that
300 # have to be handled specially for output, such as Hangul syllable names.
301 # By creating a range type (done late in the development process), it
302 # allowed this to be stored with the range, and overridden by other input.
303 # Originally these were stored in another data structure, and it became a
304 # mess trying to decide if a second file that was for the same property was
305 # overriding the earlier one or not.
307 # Why are there two kinds of tables, match and map?
308 # (And there is a base class shared by the two as well.) As stated above,
309 # they actually are for different things. Development proceeded much more
310 # smoothly when I (khw) realized the distinction. Map tables are used to
311 # give the property value for every code point (actually every code point
312 # that doesn't map to a default value). Match tables are used for regular
313 # expression matches, and are essentially the inverse mapping. Separating
314 # the two allows more specialized methods, and error checks so that one
315 # can't just take the intersection of two map tables, for example, as that
320 # This program is written so it will run under miniperl. Occasionally changes
321 # will cause an error where the backtrace doesn't work well under miniperl.
322 # To diagnose the problem, you can instead run it under regular perl, if you
325 # There is a good trace facility. To enable it, first sub DEBUG must be set
326 # to return true. Then a line like
328 # local $to_trace = 1 if main::DEBUG;
330 # can be added to enable tracing in its lexical scope or until you insert
333 # local $to_trace = 0 if main::DEBUG;
335 # then use a line like "trace $a, @b, %c, ...;
337 # Some of the more complex subroutines already have trace statements in them.
338 # Permanent trace statements should be like:
340 # trace ... if main::DEBUG && $to_trace;
342 # If there is just one or a few files that you're debugging, you can easily
343 # cause most everything else to be skipped. Change the line
345 # my $debug_skip = 0;
347 # to 1, and every file whose object is in @input_file_objects and doesn't have
348 # a, 'non_skip => 1,' in its constructor will be skipped.
350 # To compare the output tables, it may be useful to specify the -annotate
351 # flag. This causes the tables to expand so there is one entry for each
352 # non-algorithmically named code point giving, currently its name, and its
353 # graphic representation if printable (and you have a font that knows about
354 # it). This makes it easier to see what the particular code points are in
355 # each output table. The tables are usable, but because they don't have
356 # ranges (for the most part), a Perl using them will run slower. Non-named
357 # code points are annotated with a description of their status, and contiguous
358 # ones with the same description will be output as a range rather than
359 # individually. Algorithmically named characters are also output as ranges,
360 # except when there are just a few contiguous ones.
364 # The program would break if Unicode were to change its names so that
365 # interior white space, underscores, or dashes differences were significant
366 # within property and property value names.
368 # It might be easier to use the xml versions of the UCD if this program ever
369 # would need heavy revision, and the ability to handle old versions was not
372 # There is the potential for name collisions, in that Perl has chosen names
373 # that Unicode could decide it also likes. There have been such collisions in
374 # the past, with mostly Perl deciding to adopt the Unicode definition of the
375 # name. However in the 5.2 Unicode beta testing, there were a number of such
376 # collisions, which were withdrawn before the final release, because of Perl's
377 # and other's protests. These all involved new properties which began with
378 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
379 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
380 # Unicode document, so they are unlikely to be used by Unicode for another
381 # purpose. However, they might try something beginning with 'In', or use any
382 # of the other Perl-defined properties. This program will warn you of name
383 # collisions, and refuse to generate tables with them, but manual intervention
384 # will be required in this event. One scheme that could be implemented, if
385 # necessary, would be to have this program generate another file, or add a
386 # field to mktables.lst that gives the date of first definition of a property.
387 # Each new release of Unicode would use that file as a basis for the next
388 # iteration. And the Perl synonym addition code could sort based on the age
389 # of the property, so older properties get priority, and newer ones that clash
390 # would be refused; hence existing code would not be impacted, and some other
391 # synonym would have to be used for the new property. This is ugly, and
392 # manual intervention would certainly be easier to do in the short run; lets
393 # hope it never comes to this.
397 # This program can generate tables from the Unihan database. But it doesn't
398 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
399 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
400 # database was split into 8 different files, all beginning with the letters
401 # 'Unihan'. This program will read those file(s) if present, but it needs to
402 # know which of the many properties in the file(s) should have tables created
403 # for them. It will create tables for any properties listed in
404 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
405 # @cjk_properties array and the @cjk_property_values array. Thus, if a
406 # property you want is not in those files of the release you are building
407 # against, you must add it to those two arrays. Starting in 4.0, the
408 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
409 # is present in the directory, a table will be generated for that property.
410 # In 5.2, several more properties were added. For your convenience, the two
411 # arrays are initialized with all the 6.0 listed properties that are also in
412 # earlier releases. But these are commented out. You can just uncomment the
413 # ones you want, or use them as a template for adding entries for other
416 # You may need to adjust the entries to suit your purposes. setup_unihan(),
417 # and filter_unihan_line() are the functions where this is done. This program
418 # already does some adjusting to make the lines look more like the rest of the
419 # Unicode DB; You can see what that is in filter_unihan_line()
421 # There is a bug in the 3.2 data file in which some values for the
422 # kPrimaryNumeric property have commas and an unexpected comment. A filter
423 # could be added for these; or for a particular installation, the Unihan.txt
424 # file could be edited to fix them.
426 # HOW TO ADD A FILE TO BE PROCESSED
428 # A new file from Unicode needs to have an object constructed for it in
429 # @input_file_objects, probably at the end or at the end of the extracted
430 # ones. The program should warn you if its name will clash with others on
431 # restrictive file systems, like DOS. If so, figure out a better name, and
432 # add lines to the README.perl file giving that. If the file is a character
433 # property, it should be in the format that Unicode has by default
434 # standardized for such files for the more recently introduced ones.
435 # If so, the Input_file constructor for @input_file_objects can just be the
436 # file name and release it first appeared in. If not, then it should be
437 # possible to construct an each_line_handler() to massage the line into the
440 # For non-character properties, more code will be needed. You can look at
441 # the existing entries for clues.
443 # UNICODE VERSIONS NOTES
445 # The Unicode UCD has had a number of errors in it over the versions. And
446 # these remain, by policy, in the standard for that version. Therefore it is
447 # risky to correct them, because code may be expecting the error. So this
448 # program doesn't generally make changes, unless the error breaks the Perl
449 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
450 # for U+1105, which causes real problems for the algorithms for Jamo
451 # calculations, so it is changed here.
453 # But it isn't so clear cut as to what to do about concepts that are
454 # introduced in a later release; should they extend back to earlier releases
455 # where the concept just didn't exist? It was easier to do this than to not,
456 # so that's what was done. For example, the default value for code points not
457 # in the files for various properties was probably undefined until changed by
458 # some version. No_Block for blocks is such an example. This program will
459 # assign No_Block even in Unicode versions that didn't have it. This has the
460 # benefit that code being written doesn't have to special case earlier
461 # versions; and the detriment that it doesn't match the Standard precisely for
462 # the affected versions.
464 # Here are some observations about some of the issues in early versions:
466 # The number of code points in \p{alpha} halved in 2.1.9. It turns out that
467 # the reason is that the CJK block starting at 4E00 was removed from PropList,
468 # and was not put back in until 3.1.0
470 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
471 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
472 # reason is that 3.2 introduced U+205F=medium math space, which was not
473 # classed as white space, but Perl figured out that it should have been. 4.0
474 # reclassified it correctly.
476 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
477 # this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
478 # was left with no code points, as all the ones that mapped to 202 stayed
479 # mapped to 202. Thus if your program used the numeric name for the class,
480 # it would not have been affected, but if it used the mnemonic, it would have
483 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
484 # points which eventually came to have this script property value, instead
485 # mapped to "Unknown". But in the next release all these code points were
486 # moved to \p{sc=common} instead.
488 # The default for missing code points for BidiClass is complicated. Starting
489 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
490 # tries to do the best it can for earlier releases. It is done in
491 # process_PropertyAliases()
493 ##############################################################################
495 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
497 my $MAX_LINE_WIDTH = 78;
499 # Debugging aid to skip most files so as to not be distracted by them when
500 # concentrating on the ones being debugged. Add
502 # to the constructor for those files you want processed when you set this.
503 # Files with a first version number of 0 are special: they are always
504 # processed regardless of the state of this flag. Generally, Jamo.txt and
505 # UnicodeData.txt must not be skipped if you want this program to not die
506 # before normal completion.
509 # Set to 1 to enable tracing.
512 { # Closure for trace: debugging aid
513 my $print_caller = 1; # ? Include calling subroutine name
514 my $main_with_colon = 'main::';
515 my $main_colon_length = length($main_with_colon);
518 return unless $to_trace; # Do nothing if global flag not set
522 local $DB::trace = 0;
523 $DB::trace = 0; # Quiet 'used only once' message
527 # Loop looking up the stack to get the first non-trace caller
532 $line_number = $caller_line;
533 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
534 $caller = $main_with_colon unless defined $caller;
536 $caller_name = $caller;
539 $caller_name =~ s/.*:://;
540 if (substr($caller_name, 0, $main_colon_length)
543 $caller_name = substr($caller_name, $main_colon_length);
546 } until ($caller_name ne 'trace');
548 # If the stack was empty, we were called from the top level
549 $caller_name = 'main' if ($caller_name eq ""
550 || $caller_name eq 'trace');
553 foreach my $string (@input) {
554 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
555 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
556 $output .= simple_dumper($string);
559 $string = "$string" if ref $string;
560 $string = $UNDEF unless defined $string;
562 $string = '""' if $string eq "";
563 $output .= " " if $output ne ""
565 && substr($output, -1, 1) ne " "
566 && substr($string, 0, 1) ne " ";
571 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
572 print STDERR "$caller_name: " if $print_caller;
573 print STDERR $output, "\n";
578 # This is for a rarely used development feature that allows you to compare two
579 # versions of the Unicode standard without having to deal with changes caused
580 # by the code points introduced in the later version. Change the 0 to a
581 # string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
582 # code points introduced in that release and earlier will be used; later ones
583 # are thrown away. You use the version number of the earliest one you want to
584 # compare; then run this program on directory structures containing each
585 # release, and compare the outputs. These outputs will therefore include only
586 # the code points common to both releases, and you can see the changes caused
587 # just by the underlying release semantic changes. For versions earlier than
588 # 3.2, you must copy a version of DAge.txt into the directory.
589 my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
590 my $compare_versions = DEBUG
591 && $string_compare_versions
592 && pack "C*", split /\./, $string_compare_versions;
595 # Returns non-duplicated input values. From "Perl Best Practices:
596 # Encapsulated Cleverness". p. 455 in first edition.
599 # Arguably this breaks encapsulation, if the goal is to permit multiple
600 # distinct objects to stringify to the same value, and be interchangeable.
601 # However, for this program, no two objects stringify identically, and all
602 # lists passed to this function are either objects or strings. So this
603 # doesn't affect correctness, but it does give a couple of percent speedup.
605 return grep { ! $seen{$_}++ } @_;
608 $0 = File::Spec->canonpath($0);
610 my $make_test_script = 0; # ? Should we output a test script
611 my $write_unchanged_files = 0; # ? Should we update the output files even if
612 # we don't think they have changed
613 my $use_directory = ""; # ? Should we chdir somewhere.
614 my $pod_directory; # input directory to store the pod file.
615 my $pod_file = 'perluniprops';
616 my $t_path; # Path to the .t test file
617 my $file_list = 'mktables.lst'; # File to store input and output file names.
618 # This is used to speed up the build, by not
619 # executing the main body of the program if
620 # nothing on the list has changed since the
622 my $make_list = 1; # ? Should we write $file_list. Set to always
623 # make a list so that when the pumpking is
624 # preparing a release, s/he won't have to do
626 my $glob_list = 0; # ? Should we try to include unknown .txt files
628 my $output_range_counts = $debugging_build; # ? Should we include the number
629 # of code points in ranges in
631 my $annotate = 0; # ? Should character names be in the output
633 # Verbosity levels; 0 is quiet
634 my $NORMAL_VERBOSITY = 1;
638 my $verbosity = $NORMAL_VERBOSITY;
642 my $arg = shift @ARGV;
644 $verbosity = $VERBOSE;
646 elsif ($arg eq '-p') {
647 $verbosity = $PROGRESS;
648 $| = 1; # Flush buffers as we go.
650 elsif ($arg eq '-q') {
653 elsif ($arg eq '-w') {
654 $write_unchanged_files = 1; # update the files even if havent changed
656 elsif ($arg eq '-check') {
657 my $this = shift @ARGV;
658 my $ok = shift @ARGV;
660 print "Skipping as check params are not the same.\n";
664 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
665 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
667 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
669 $make_test_script = 1;
671 elsif ($arg eq '-makelist') {
674 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
675 -d $use_directory or croak "Unknown directory '$use_directory'";
677 elsif ($arg eq '-L') {
679 # Existence not tested until have chdir'd
682 elsif ($arg eq '-globlist') {
685 elsif ($arg eq '-c') {
686 $output_range_counts = ! $output_range_counts
688 elsif ($arg eq '-annotate') {
690 $debugging_build = 1;
691 $output_range_counts = 1;
695 $with_c .= 'out' if $output_range_counts; # Complements the state
697 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
698 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
700 -c : Output comments $with_c number of code points in ranges
701 -q : Quiet Mode: Only output serious warnings.
702 -p : Set verbosity level to normal plus show progress.
703 -v : Set Verbosity level high: Show progress and non-serious
705 -w : Write files regardless
706 -C dir : Change to this directory before proceeding. All relative paths
707 except those specified by the -P and -T options will be done
708 with respect to this directory.
709 -P dir : Output $pod_file file to directory 'dir'.
710 -T path : Create a test script as 'path'; overrides -maketest
711 -L filelist : Use alternate 'filelist' instead of standard one
712 -globlist : Take as input all non-Test *.txt files in current and sub
714 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
716 -makelist : Rewrite the file list $file_list based on current setup
717 -annotate : Output an annotation for each character in the table files;
718 useful for debugging mktables, looking at diffs; but is slow,
719 memory intensive; resulting tables are usable but slow and
721 -check A B : Executes $0 only if A and B are the same
726 # Stores the most-recently changed file. If none have changed, can skip the
728 my $most_recent = (stat $0)[9]; # Do this before the chdir!
730 # Change directories now, because need to read 'version' early.
731 if ($use_directory) {
732 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
733 $pod_directory = File::Spec->rel2abs($pod_directory);
735 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
736 $t_path = File::Spec->rel2abs($t_path);
738 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
739 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
740 $pod_directory = File::Spec->abs2rel($pod_directory);
742 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
743 $t_path = File::Spec->abs2rel($t_path);
747 # Get Unicode version into regular and v-string. This is done now because
748 # various tables below get populated based on it. These tables are populated
749 # here to be near the top of the file, and so easily seeable by those needing
751 open my $VERSION, "<", "version"
752 or croak "$0: can't open required file 'version': $!\n";
753 my $string_version = <$VERSION>;
755 chomp $string_version;
756 my $v_version = pack "C*", split /\./, $string_version; # v string
758 # The following are the complete names of properties with property values that
759 # are known to not match any code points in some versions of Unicode, but that
760 # may change in the future so they should be matchable, hence an empty file is
761 # generated for them.
762 my @tables_that_may_be_empty = (
763 'Joining_Type=Left_Joining',
765 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
766 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
767 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
768 if $v_version ge v4.1.0;
769 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
770 if $v_version ge v6.0.0;
772 # The lists below are hashes, so the key is the item in the list, and the
773 # value is the reason why it is in the list. This makes generation of
774 # documentation easier.
776 my %why_suppressed; # No file generated for these.
778 # Files aren't generated for empty extraneous properties. This is arguable.
779 # Extraneous properties generally come about because a property is no longer
780 # used in a newer version of Unicode. If we generated a file without code
781 # points, programs that used to work on that property will still execute
782 # without errors. It just won't ever match (or will always match, with \P{}).
783 # This means that the logic is now likely wrong. I (khw) think its better to
784 # find this out by getting an error message. Just move them to the table
785 # above to change this behavior
786 my %why_suppress_if_empty_warn_if_not = (
788 # It is the only property that has ever officially been removed from the
789 # Standard. The database never contained any code points for it.
790 'Special_Case_Condition' => 'Obsolete',
792 # Apparently never official, but there were code points in some versions of
793 # old-style PropList.txt
794 'Non_Break' => 'Obsolete',
797 # These would normally go in the warn table just above, but they were changed
798 # a long time before this program was written, so warnings about them are
800 if ($v_version gt v3.2.0) {
801 push @tables_that_may_be_empty,
802 'Canonical_Combining_Class=Attached_Below_Left'
805 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
806 # unless explicitly added.
807 if ($v_version ge v5.2.0) {
808 my $unihan = 'Unihan; remove from list if using Unihan';
809 foreach my $table (qw (
813 kCompatibilityVariant
827 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
831 # Enum values for to_output_map() method in the Map_Table package.
832 my $EXTERNAL_MAP = 1;
833 my $INTERNAL_MAP = 2;
835 # To override computed values for writing the map tables for these properties.
836 # The default for enum map tables is to write them out, so that the Unicode
837 # .txt files can be removed, but all the data to compute any property value
838 # for any code point is available in a more compact form.
839 my %global_to_output_map = (
840 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
841 # get stuck supporting it if things change. Since it is a STRING
842 # property, it normally would be listed in the pod, but INTERNAL_MAP
844 Unicode_1_Name => $INTERNAL_MAP,
846 Present_In => 0, # Suppress, as easily computed from Age
847 Block => 0, # Suppress, as Blocks.txt is retained.
849 # Suppress, as mapping can be found instead from the
850 # Perl_Decomposition_Mapping file
851 Decomposition_Type => 0,
854 # Properties that this program ignores.
855 my @unimplemented_properties;
857 # With this release, it is automatically handled if the Unihan db is
859 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
861 # There are several types of obsolete properties defined by Unicode. These
862 # must be hand-edited for every new Unicode release.
863 my %why_deprecated; # Generates a deprecated warning message if used.
864 my %why_stabilized; # Documentation only
865 my %why_obsolete; # Documentation only
868 my $simple = 'Perl uses the more complete version of this property';
869 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
871 my $other_properties = 'other properties';
872 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
873 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
876 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
877 'Jamo_Short_Name' => $contributory,
878 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
879 'Other_Alphabetic' => $contributory,
880 'Other_Default_Ignorable_Code_Point' => $contributory,
881 'Other_Grapheme_Extend' => $contributory,
882 'Other_ID_Continue' => $contributory,
883 'Other_ID_Start' => $contributory,
884 'Other_Lowercase' => $contributory,
885 'Other_Math' => $contributory,
886 'Other_Uppercase' => $contributory,
887 'Expands_On_NFC' => $why_no_expand,
888 'Expands_On_NFD' => $why_no_expand,
889 'Expands_On_NFKC' => $why_no_expand,
890 'Expands_On_NFKD' => $why_no_expand,
894 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
895 # contains the same information, but without the algorithmically
896 # determinable Hangul syllables'. This file is not published, so it's
897 # existence is not noted in the comment.
898 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
900 'Indic_Matra_Category' => "Provisional",
901 'Indic_Syllabic_Category' => "Provisional",
903 # Don't suppress ISO_Comment, as otherwise special handling is needed
904 # to differentiate between it and gc=c, which can be written as 'isc',
905 # which is the same characters as ISO_Comment's short name.
907 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
909 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
910 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
911 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
912 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
914 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
917 foreach my $property (
919 # The following are suppressed because they were made contributory
920 # or deprecated by Unicode before Perl ever thought about
929 # The following are suppressed because they have been marked
930 # as deprecated for a sufficient amount of time
932 'Other_Default_Ignorable_Code_Point',
933 'Other_Grapheme_Extend',
940 $why_suppressed{$property} = $why_deprecated{$property};
943 # Customize the message for all the 'Other_' properties
944 foreach my $property (keys %why_deprecated) {
945 next if (my $main_property = $property) !~ s/^Other_//;
946 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
950 if ($v_version ge 4.0.0) {
951 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
952 if ($v_version ge 6.0.0) {
953 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
956 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
957 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
958 if ($v_version ge 6.0.0) {
959 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
963 # Probably obsolete forever
964 if ($v_version ge v4.1.0) {
965 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
967 if ($v_version ge v6.0.0) {
968 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
969 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
972 # This program can create files for enumerated-like properties, such as
973 # 'Numeric_Type'. This file would be the same format as for a string
974 # property, with a mapping from code point to its value, so you could look up,
975 # for example, the script a code point is in. But no one so far wants this
976 # mapping, or they have found another way to get it since this is a new
977 # feature. So no file is generated except if it is in this list.
978 my @output_mapped_properties = split "\n", <<END;
981 # If you are using the Unihan database in a Unicode version before 5.2, you
982 # need to add the properties that you want to extract from it to this table.
983 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
984 # listed, commented out
985 my @cjk_properties = split "\n", <<'END';
986 #cjkAccountingNumeric; kAccountingNumeric
987 #cjkOtherNumeric; kOtherNumeric
988 #cjkPrimaryNumeric; kPrimaryNumeric
989 #cjkCompatibilityVariant; kCompatibilityVariant
991 #cjkIRG_GSource; kIRG_GSource
992 #cjkIRG_HSource; kIRG_HSource
993 #cjkIRG_JSource; kIRG_JSource
994 #cjkIRG_KPSource; kIRG_KPSource
995 #cjkIRG_KSource; kIRG_KSource
996 #cjkIRG_TSource; kIRG_TSource
997 #cjkIRG_USource; kIRG_USource
998 #cjkIRG_VSource; kIRG_VSource
999 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1002 # Similarly for the property values. For your convenience, the lines in the
1003 # 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
1004 # '#' marks (for Unicode versions before 5.2)
1005 my @cjk_property_values = split "\n", <<'END';
1006 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1007 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1008 ## @missing: 0000..10FFFF; cjkIICore; <none>
1009 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1010 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1011 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1012 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1013 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1014 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1015 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1016 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1017 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1018 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1019 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1022 # The input files don't list every code point. Those not listed are to be
1023 # defaulted to some value. Below are hard-coded what those values are for
1024 # non-binary properties as of 5.1. Starting in 5.0, there are
1025 # machine-parsable comment lines in the files the give the defaults; so this
1026 # list shouldn't have to be extended. The claim is that all missing entries
1027 # for binary properties will default to 'N'. Unicode tried to change that in
1028 # 5.2, but the beta period produced enough protest that they backed off.
1030 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1031 # be in the form that it expects. The others may be synonyms.
1032 my $CODE_POINT = '<code point>';
1033 my %default_mapping = (
1034 Age => "Unassigned",
1035 # Bidi_Class => Complicated; set in code
1036 Bidi_Mirroring_Glyph => "",
1037 Block => 'No_Block',
1038 Canonical_Combining_Class => 0,
1039 Case_Folding => $CODE_POINT,
1040 Decomposition_Mapping => $CODE_POINT,
1041 Decomposition_Type => 'None',
1042 East_Asian_Width => "Neutral",
1043 FC_NFKC_Closure => $CODE_POINT,
1044 General_Category => 'Cn',
1045 Grapheme_Cluster_Break => 'Other',
1046 Hangul_Syllable_Type => 'NA',
1048 Jamo_Short_Name => "",
1049 Joining_Group => "No_Joining_Group",
1050 # Joining_Type => Complicated; set in code
1051 kIICore => 'N', # Is converted to binary
1052 #Line_Break => Complicated; set in code
1053 Lowercase_Mapping => $CODE_POINT,
1060 Numeric_Type => 'None',
1061 Numeric_Value => 'NaN',
1062 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1063 Sentence_Break => 'Other',
1064 Simple_Case_Folding => $CODE_POINT,
1065 Simple_Lowercase_Mapping => $CODE_POINT,
1066 Simple_Titlecase_Mapping => $CODE_POINT,
1067 Simple_Uppercase_Mapping => $CODE_POINT,
1068 Titlecase_Mapping => $CODE_POINT,
1069 Unicode_1_Name => "",
1070 Unicode_Radical_Stroke => "",
1071 Uppercase_Mapping => $CODE_POINT,
1072 Word_Break => 'Other',
1075 # Below are files that Unicode furnishes, but this program ignores, and why
1076 my %ignored_files = (
1077 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1078 'Index.txt' => 'Alphabetical index of Unicode characters',
1079 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1080 'NamesList.txt' => 'Annotated list of characters',
1081 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1082 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1083 'ReadMe.txt' => 'Documentation',
1084 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1085 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1086 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1087 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1088 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1089 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1092 my %skipped_files; # List of files that we skip
1094 ### End of externally interesting definitions, except for @input_file_objects
1097 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1098 # This file is machine-generated by $0 from the Unicode
1099 # database, Version $string_version. Any changes made here will be lost!
1102 my $INTERNAL_ONLY_HEADER = <<"EOF";
1104 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1105 # This file is for internal use by core Perl only. The format and even the
1106 # name or existence of this file are subject to change without notice. Don't
1110 my $DEVELOPMENT_ONLY=<<"EOF";
1111 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1112 # This file contains information artificially constrained to code points
1113 # present in Unicode release $string_compare_versions.
1114 # IT CANNOT BE RELIED ON. It is for use during development only and should
1115 # not be used for production.
1119 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1120 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1121 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1123 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1124 # two must be 10; if there are 5, the first must not be a 0. Written this way
1125 # to decrease backtracking. The first regex allows the code point to be at
1126 # the end of a word, but to work properly, the word shouldn't end with a valid
1127 # hex character. The second one won't match a code point at the end of a
1128 # word, and doesn't have the run-on issue
1129 my $run_on_code_point_re =
1130 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1131 my $code_point_re = qr/\b$run_on_code_point_re/;
1133 # This matches the beginning of the line in the Unicode db files that give the
1134 # defaults for code points not listed (i.e., missing) in the file. The code
1135 # depends on this ending with a semi-colon, so it can assume it is a valid
1136 # field when the line is split() by semi-colons
1137 my $missing_defaults_prefix =
1138 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1140 # Property types. Unicode has more types, but these are sufficient for our
1142 my $UNKNOWN = -1; # initialized to illegal value
1143 my $NON_STRING = 1; # Either binary or enum
1145 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1146 # tables, additional true and false tables are
1147 # generated so that false is anything matching the
1148 # default value, and true is everything else.
1149 my $ENUM = 4; # Include catalog
1150 my $STRING = 5; # Anything else: string or misc
1152 # Some input files have lines that give default values for code points not
1153 # contained in the file. Sometimes these should be ignored.
1154 my $NO_DEFAULTS = 0; # Must evaluate to false
1155 my $NOT_IGNORED = 1;
1158 # Range types. Each range has a type. Most ranges are type 0, for normal,
1159 # and will appear in the main body of the tables in the output files, but
1160 # there are other types of ranges as well, listed below, that are specially
1161 # handled. There are pseudo-types as well that will never be stored as a
1162 # type, but will affect the calculation of the type.
1164 # 0 is for normal, non-specials
1165 my $MULTI_CP = 1; # Sequence of more than code point
1166 my $HANGUL_SYLLABLE = 2;
1167 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1168 my $NULL = 4; # The map is to the null string; utf8.c can't
1169 # handle these, nor is there an accepted syntax
1170 # for them in \p{} constructs
1171 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1172 # otherwise be $MULTI_CP type are instead type 0
1174 # process_generic_property_file() can accept certain overrides in its input.
1175 # Each of these must begin AND end with $CMD_DELIM.
1176 my $CMD_DELIM = "\a";
1177 my $REPLACE_CMD = 'replace'; # Override the Replace
1178 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1183 # Values for the Replace argument to add_range.
1184 # $NO # Don't replace; add only the code points not
1186 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1187 # the comments at the subroutine definition.
1188 my $UNCONDITIONALLY = 2; # Replace without conditions.
1189 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1191 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1193 my $CROAK = 6; # Die with an error if is already there
1195 # Flags to give property statuses. The phrases are to remind maintainers that
1196 # if the flag is changed, the indefinite article referring to it in the
1197 # documentation may need to be as well.
1199 my $DEPRECATED = 'D';
1200 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1201 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1202 my $DISCOURAGED = 'X';
1203 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1204 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1206 my $a_bold_stricter = "a 'B<$STRICTER>'";
1207 my $A_bold_stricter = "A 'B<$STRICTER>'";
1208 my $STABILIZED = 'S';
1209 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1210 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1212 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1213 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1215 my %status_past_participles = (
1216 $DISCOURAGED => 'discouraged',
1217 $STABILIZED => 'stabilized',
1218 $OBSOLETE => 'obsolete',
1219 $DEPRECATED => 'deprecated',
1222 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1223 # externally documented.
1224 my $ORDINARY = 0; # The normal fate.
1225 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1226 # but there is a file written that can be used to
1227 # reconstruct this table
1228 my $SUPPRESSED = 3; # The file for this table is not written out.
1229 my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
1230 # for Perl's internal use only
1231 my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
1232 # Unicode version that doesn't have it, but we need it
1233 # to be defined, if empty, to have things work.
1234 # Implies no pod entry generated
1236 # The format of the values of the tables:
1237 my $EMPTY_FORMAT = "";
1238 my $BINARY_FORMAT = 'b';
1239 my $DECIMAL_FORMAT = 'd';
1240 my $FLOAT_FORMAT = 'f';
1241 my $INTEGER_FORMAT = 'i';
1242 my $HEX_FORMAT = 'x';
1243 my $RATIONAL_FORMAT = 'r';
1244 my $STRING_FORMAT = 's';
1245 my $DECOMP_STRING_FORMAT = 'c';
1246 my $STRING_WHITE_SPACE_LIST = 'sw';
1248 my %map_table_formats = (
1249 $BINARY_FORMAT => 'binary',
1250 $DECIMAL_FORMAT => 'single decimal digit',
1251 $FLOAT_FORMAT => 'floating point number',
1252 $INTEGER_FORMAT => 'integer',
1253 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1254 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1255 $STRING_FORMAT => 'string',
1256 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1257 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1260 # Unicode didn't put such derived files in a separate directory at first.
1261 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1262 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1263 my $AUXILIARY = 'auxiliary';
1265 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1266 # and into UCD.pl for the use of UCD.pm
1267 my %loose_to_file_of; # loosely maps table names to their respective
1269 my %stricter_to_file_of; # same; but for stricter mapping.
1270 my %loose_property_to_file_of; # Maps a loose property name to its map file
1271 my %file_to_swash_name; # Maps the file name to its corresponding key name
1272 # in the hash %utf8::SwashInfo
1273 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1274 # their rational equivalent
1275 my %loose_property_name_of; # Loosely maps (non_string) property names to
1277 my %string_property_loose_to_name; # Same, for string properties.
1278 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1279 # the property name in standard loose form, and
1280 # 'value' is the default value for that property,
1281 # also in standard loose form.
1282 my %loose_to_standard_value; # loosely maps table names to the canonical
1284 my %ambiguous_names; # keys are alias names (in standard form) that
1285 # have more than one possible meaning.
1286 my %prop_aliases; # Keys are standard property name; values are each
1288 my %prop_value_aliases; # Keys of top level are standard property name;
1289 # values are keys to another hash, Each one is
1290 # one of the property's values, in standard form.
1291 # The values are that prop-val's aliases.
1292 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1294 # Most properties are immune to caseless matching, otherwise you would get
1295 # nonsensical results, as properties are a function of a code point, not
1296 # everything that is caselessly equivalent to that code point. For example,
1297 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1298 # be true because 's' and 'S' are equivalent caselessly. However,
1299 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1300 # extend that concept to those very few properties that are like this. Each
1301 # such property will match the full range caselessly. They are hard-coded in
1302 # the program; it's not worth trying to make it general as it's extremely
1303 # unlikely that they will ever change.
1304 my %caseless_equivalent_to;
1306 # These constants names and values were taken from the Unicode standard,
1307 # version 5.1, section 3.12. They are used in conjunction with Hangul
1308 # syllables. The '_string' versions are so generated tables can retain the
1309 # hex format, which is the more familiar value
1310 my $SBase_string = "0xAC00";
1311 my $SBase = CORE::hex $SBase_string;
1312 my $LBase_string = "0x1100";
1313 my $LBase = CORE::hex $LBase_string;
1314 my $VBase_string = "0x1161";
1315 my $VBase = CORE::hex $VBase_string;
1316 my $TBase_string = "0x11A7";
1317 my $TBase = CORE::hex $TBase_string;
1322 my $NCount = $VCount * $TCount;
1324 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1325 # with the above published constants.
1327 my %Jamo_L; # Leading consonants
1328 my %Jamo_V; # Vowels
1329 my %Jamo_T; # Trailing consonants
1331 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1332 # The key is the base name of the code point, and the value is an
1333 # array giving all the ranges that use this base name. Each range
1334 # is actually a hash giving the 'low' and 'high' values of it.
1335 my %names_ending_in_code_point;
1336 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1337 # removed from the names
1338 # Inverse mapping. The list of ranges that have these kinds of
1339 # names. Each element contains the low, high, and base names in an
1341 my @code_points_ending_in_code_point;
1343 # Boolean: does this Unicode version have the hangul syllables, and are we
1344 # writing out a table for them?
1345 my $has_hangul_syllables = 0;
1347 # Does this Unicode version have code points whose names end in their
1348 # respective code points, and are we writing out a table for them? 0 for no;
1349 # otherwise points to first property that a table is needed for them, so that
1350 # if multiple tables are needed, we don't create duplicates
1351 my $needing_code_points_ending_in_code_point = 0;
1353 my @backslash_X_tests; # List of tests read in for testing \X
1354 my @unhandled_properties; # Will contain a list of properties found in
1355 # the input that we didn't process.
1356 my @match_properties; # Properties that have match tables, to be
1358 my @map_properties; # Properties that get map files written
1359 my @named_sequences; # NamedSequences.txt contents.
1360 my %potential_files; # Generated list of all .txt files in the directory
1361 # structure so we can warn if something is being
1363 my @files_actually_output; # List of files we generated.
1364 my @more_Names; # Some code point names are compound; this is used
1365 # to store the extra components of them.
1366 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1367 # the minimum before we consider it equivalent to a
1368 # candidate rational
1369 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1371 # These store references to certain commonly used property objects
1380 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1381 my $has_In_conflicts = 0;
1382 my $has_Is_conflicts = 0;
1384 sub internal_file_to_platform ($) {
1385 # Convert our file paths which have '/' separators to those of the
1389 return undef unless defined $file;
1391 return File::Spec->join(split '/', $file);
1394 sub file_exists ($) { # platform independent '-e'. This program internally
1395 # uses slash as a path separator.
1397 return 0 if ! defined $file;
1398 return -e internal_file_to_platform($file);
1402 # Returns the address of the blessed input object.
1403 # It doesn't check for blessedness because that would do a string eval
1404 # every call, and the program is structured so that this is never called
1405 # for a non-blessed object.
1407 no overloading; # If overloaded, numifying below won't work.
1409 # Numifying a ref gives its address.
1410 return pack 'J', $_[0];
1413 # These are used only if $annotate is true.
1414 # The entire range of Unicode characters is examined to populate these
1415 # after all the input has been processed. But most can be skipped, as they
1416 # have the same descriptive phrases, such as being unassigned
1417 my @viacode; # Contains the 1 million character names
1418 my @printable; # boolean: And are those characters printable?
1419 my @annotate_char_type; # Contains a type of those characters, specifically
1420 # for the purposes of annotation.
1421 my $annotate_ranges; # A map of ranges of code points that have the same
1422 # name for the purposes of annotation. They map to the
1423 # upper edge of the range, so that the end point can
1424 # be immediately found. This is used to skip ahead to
1425 # the end of a range, and avoid processing each
1426 # individual code point in it.
1427 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1428 # characters, but excluding those which are
1429 # also noncharacter code points
1431 # The annotation types are an extension of the regular range types, though
1432 # some of the latter are folded into one. Make the new types negative to
1433 # avoid conflicting with the regular types
1434 my $SURROGATE_TYPE = -1;
1435 my $UNASSIGNED_TYPE = -2;
1436 my $PRIVATE_USE_TYPE = -3;
1437 my $NONCHARACTER_TYPE = -4;
1438 my $CONTROL_TYPE = -5;
1439 my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1441 sub populate_char_info ($) {
1442 # Used only with the $annotate option. Populates the arrays with the
1443 # input code point's info that are needed for outputting more detailed
1444 # comments. If calling context wants a return, it is the end point of
1445 # any contiguous range of characters that share essentially the same info
1448 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1450 $viacode[$i] = $perl_charname->value_of($i) || "";
1452 # A character is generally printable if Unicode says it is,
1453 # but below we make sure that most Unicode general category 'C' types
1455 $printable[$i] = $print->contains($i);
1457 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1459 # Only these two regular types are treated specially for annotations
1461 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1462 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1464 # Give a generic name to all code points that don't have a real name.
1465 # We output ranges, if applicable, for these. Also calculate the end
1466 # point of the range.
1468 if (! $viacode[$i]) {
1469 if ($gc-> table('Surrogate')->contains($i)) {
1470 $viacode[$i] = 'Surrogate';
1471 $annotate_char_type[$i] = $SURROGATE_TYPE;
1473 $end = $gc->table('Surrogate')->containing_range($i)->end;
1475 elsif ($gc-> table('Private_use')->contains($i)) {
1476 $viacode[$i] = 'Private Use';
1477 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1479 $end = $gc->table('Private_Use')->containing_range($i)->end;
1481 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1484 $viacode[$i] = 'Noncharacter';
1485 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1487 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1488 containing_range($i)->end;
1490 elsif ($gc-> table('Control')->contains($i)) {
1491 $viacode[$i] = 'Control';
1492 $annotate_char_type[$i] = $CONTROL_TYPE;
1494 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1496 elsif ($gc-> table('Unassigned')->contains($i)) {
1497 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1498 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1501 # Because we name the unassigned by the blocks they are in, it
1502 # can't go past the end of that block, and it also can't go past
1503 # the unassigned range it is in. The special table makes sure
1504 # that the non-characters, which are unassigned, are separated
1506 $end = min($block->containing_range($i)->end,
1507 $unassigned_sans_noncharacters-> containing_range($i)->
1511 Carp::my_carp_bug("Can't figure out how to annotate "
1512 . sprintf("U+%04X", $i)
1513 . ". Proceeding anyway.");
1514 $viacode[$i] = 'UNKNOWN';
1515 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1520 # Here, has a name, but if it's one in which the code point number is
1521 # appended to the name, do that.
1522 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1523 $viacode[$i] .= sprintf("-%04X", $i);
1524 $end = $perl_charname->containing_range($i)->end;
1527 # And here, has a name, but if it's a hangul syllable one, replace it with
1528 # the correct name from the Unicode algorithm
1529 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1531 my $SIndex = $i - $SBase;
1532 my $L = $LBase + $SIndex / $NCount;
1533 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1534 my $T = $TBase + $SIndex % $TCount;
1535 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1536 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1537 $end = $perl_charname->containing_range($i)->end;
1540 return if ! defined wantarray;
1541 return $i if ! defined $end; # If not a range, return the input
1543 # Save this whole range so can find the end point quickly
1544 $annotate_ranges->add_map($i, $end, $end);
1549 # Commented code below should work on Perl 5.8.
1550 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1551 ## the native perl version of it (which is what would operate under miniperl)
1552 ## is extremely slow, as it does a string eval every call.
1553 #my $has_fast_scalar_util = $
\18 !~ /miniperl/
1554 # && defined eval "require Scalar::Util";
1557 # # Returns the address of the blessed input object. Uses the XS version if
1558 # # available. It doesn't check for blessedness because that would do a
1559 # # string eval every call, and the program is structured so that this is
1560 # # never called for a non-blessed object.
1562 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1564 # # Check at least that is a ref.
1565 # my $pkg = ref($_[0]) or return undef;
1567 # # Change to a fake package to defeat any overloaded stringify
1568 # bless $_[0], 'main::Fake';
1570 # # Numifying a ref gives its address.
1571 # my $addr = pack 'J', $_[0];
1573 # # Return to original class
1574 # bless $_[0], $pkg;
1581 return $a if $a >= $b;
1588 return $a if $a <= $b;
1592 sub clarify_number ($) {
1593 # This returns the input number with underscores inserted every 3 digits
1594 # in large (5 digits or more) numbers. Input must be entirely digits, not
1598 my $pos = length($number) - 3;
1599 return $number if $pos <= 1;
1601 substr($number, $pos, 0) = '_';
1610 # These routines give a uniform treatment of messages in this program. They
1611 # are placed in the Carp package to cause the stack trace to not include them,
1612 # although an alternative would be to use another package and set @CARP_NOT
1615 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1617 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1618 # and overload trying to load Scalar:Util under miniperl. See
1619 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1620 undef $overload::VERSION;
1623 my $message = shift || "";
1624 my $nofold = shift || 0;
1627 $message = main::join_lines($message);
1628 $message =~ s/^$0: *//; # Remove initial program name
1629 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1630 $message = "\n$0: $message;";
1632 # Fold the message with program name, semi-colon end punctuation
1633 # (which looks good with the message that carp appends to it), and a
1634 # hanging indent for continuation lines.
1635 $message = main::simple_fold($message, "", 4) unless $nofold;
1636 $message =~ s/\n$//; # Remove the trailing nl so what carp
1637 # appends is to the same line
1640 return $message if defined wantarray; # If a caller just wants the msg
1647 # This is called when it is clear that the problem is caused by a bug in
1650 my $message = shift;
1651 $message =~ s/^$0: *//;
1652 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1657 sub carp_too_few_args {
1659 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1663 my $args_ref = shift;
1666 my_carp_bug("Need at least $count arguments to "
1668 . ". Instead got: '"
1669 . join ', ', @$args_ref
1670 . "'. No action taken.");
1674 sub carp_extra_args {
1675 my $args_ref = shift;
1676 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1678 unless (ref $args_ref) {
1679 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1682 my ($package, $file, $line) = caller;
1683 my $subroutine = (caller 1)[3];
1686 if (ref $args_ref eq 'HASH') {
1687 foreach my $key (keys %$args_ref) {
1688 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1690 $list = join ', ', each %{$args_ref};
1692 elsif (ref $args_ref eq 'ARRAY') {
1693 foreach my $arg (@$args_ref) {
1694 $arg = $UNDEF unless defined $arg;
1696 $list = join ', ', @$args_ref;
1699 my_carp_bug("Can't cope with ref "
1701 . " . argument to 'carp_extra_args'. Not checking arguments.");
1705 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1713 # This program uses the inside-out method for objects, as recommended in
1714 # "Perl Best Practices". This closure aids in generating those. There
1715 # are two routines. setup_package() is called once per package to set
1716 # things up, and then set_access() is called for each hash representing a
1717 # field in the object. These routines arrange for the object to be
1718 # properly destroyed when no longer used, and for standard accessor
1719 # functions to be generated. If you need more complex accessors, just
1720 # write your own and leave those accesses out of the call to set_access().
1721 # More details below.
1723 my %constructor_fields; # fields that are to be used in constructors; see
1726 # The values of this hash will be the package names as keys to other
1727 # hashes containing the name of each field in the package as keys, and
1728 # references to their respective hashes as values.
1732 # Sets up the package, creating standard DESTROY and dump methods
1733 # (unless already defined). The dump method is used in debugging by
1735 # The optional parameters are:
1736 # a) a reference to a hash, that gets populated by later
1737 # set_access() calls with one of the accesses being
1738 # 'constructor'. The caller can then refer to this, but it is
1739 # not otherwise used by these two routines.
1740 # b) a reference to a callback routine to call during destruction
1741 # of the object, before any fields are actually destroyed
1744 my $constructor_ref = delete $args{'Constructor_Fields'};
1745 my $destroy_callback = delete $args{'Destroy_Callback'};
1746 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1749 my $package = (caller)[0];
1751 $package_fields{$package} = \%fields;
1752 $constructor_fields{$package} = $constructor_ref;
1754 unless ($package->can('DESTROY')) {
1755 my $destroy_name = "${package}::DESTROY";
1758 # Use typeglob to give the anonymous subroutine the name we want
1759 *$destroy_name = sub {
1761 my $addr = do { no overloading; pack 'J', $self; };
1763 $self->$destroy_callback if $destroy_callback;
1764 foreach my $field (keys %{$package_fields{$package}}) {
1765 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1766 delete $package_fields{$package}{$field}{$addr};
1772 unless ($package->can('dump')) {
1773 my $dump_name = "${package}::dump";
1777 return dump_inside_out($self, $package_fields{$package}, @_);
1784 # Arrange for the input field to be garbage collected when no longer
1785 # needed. Also, creates standard accessor functions for the field
1786 # based on the optional parameters-- none if none of these parameters:
1787 # 'addable' creates an 'add_NAME()' accessor function.
1788 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1790 # 'settable' creates a 'set_NAME()' accessor function.
1791 # 'constructor' doesn't create an accessor function, but adds the
1792 # field to the hash that was previously passed to
1794 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1795 # 'add' etc. all mean 'addable'.
1796 # The read accessor function will work on both array and scalar
1797 # values. If another accessor in the parameter list is 'a', the read
1798 # access assumes an array. You can also force it to be array access
1799 # by specifying 'readable_array' instead of 'readable'
1801 # A sort-of 'protected' access can be set-up by preceding the addable,
1802 # readable or settable with some initial portion of 'protected_' (but,
1803 # the underscore is required), like 'p_a', 'pro_set', etc. The
1804 # "protection" is only by convention. All that happens is that the
1805 # accessor functions' names begin with an underscore. So instead of
1806 # calling set_foo, the call is _set_foo. (Real protection could be
1807 # accomplished by having a new subroutine, end_package, called at the
1808 # end of each package, and then storing the __LINE__ ranges and
1809 # checking them on every accessor. But that is way overkill.)
1811 # We create anonymous subroutines as the accessors and then use
1812 # typeglobs to assign them to the proper package and name
1814 my $name = shift; # Name of the field
1815 my $field = shift; # Reference to the inside-out hash containing the
1818 my $package = (caller)[0];
1820 if (! exists $package_fields{$package}) {
1821 croak "$0: Must call 'setup_package' before 'set_access'";
1824 # Stash the field so DESTROY can get it.
1825 $package_fields{$package}{$name} = $field;
1827 # Remaining arguments are the accessors. For each...
1828 foreach my $access (@_) {
1829 my $access = lc $access;
1833 # Match the input as far as it goes.
1834 if ($access =~ /^(p[^_]*)_/) {
1836 if (substr('protected_', 0, length $protected)
1840 # Add 1 for the underscore not included in $protected
1841 $access = substr($access, length($protected) + 1);
1849 if (substr('addable', 0, length $access) eq $access) {
1850 my $subname = "${package}::${protected}add_$name";
1853 # add_ accessor. Don't add if already there, which we
1854 # determine using 'eq' for scalars and '==' otherwise.
1857 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1860 my $addr = do { no overloading; pack 'J', $self; };
1861 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1863 return if grep { $value == $_ } @{$field->{$addr}};
1866 return if grep { $value eq $_ } @{$field->{$addr}};
1868 push @{$field->{$addr}}, $value;
1872 elsif (substr('constructor', 0, length $access) eq $access) {
1874 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1877 $constructor_fields{$package}{$name} = $field;
1880 elsif (substr('readable_array', 0, length $access) eq $access) {
1882 # Here has read access. If one of the other parameters for
1883 # access is array, or this one specifies array (by being more
1884 # than just 'readable_'), then create a subroutine that
1885 # assumes the data is an array. Otherwise just a scalar
1886 my $subname = "${package}::${protected}$name";
1887 if (grep { /^a/i } @_
1888 or length($access) > length('readable_'))
1893 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1894 my $addr = do { no overloading; pack 'J', $_[0]; };
1895 if (ref $field->{$addr} ne 'ARRAY') {
1896 my $type = ref $field->{$addr};
1897 $type = 'scalar' unless $type;
1898 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1901 return scalar @{$field->{$addr}} unless wantarray;
1903 # Make a copy; had problems with caller modifying the
1904 # original otherwise
1905 my @return = @{$field->{$addr}};
1911 # Here not an array value, a simpler function.
1915 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1917 return $field->{pack 'J', $_[0]};
1921 elsif (substr('settable', 0, length $access) eq $access) {
1922 my $subname = "${package}::${protected}set_$name";
1927 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1928 Carp::carp_extra_args(\@_) if @_ > 2;
1930 # $self is $_[0]; $value is $_[1]
1932 $field->{pack 'J', $_[0]} = $_[1];
1937 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1946 # All input files use this object, which stores various attributes about them,
1947 # and provides for convenient, uniform handling. The run method wraps the
1948 # processing. It handles all the bookkeeping of opening, reading, and closing
1949 # the file, returning only significant input lines.
1951 # Each object gets a handler which processes the body of the file, and is
1952 # called by run(). Most should use the generic, default handler, which has
1953 # code scrubbed to handle things you might not expect. A handler should
1954 # basically be a while(next_line()) {...} loop.
1956 # You can also set up handlers to
1957 # 1) call before the first line is read for pre processing
1958 # 2) call to adjust each line of the input before the main handler gets them
1959 # 3) call upon EOF before the main handler exits its loop
1960 # 4) call at the end for post processing
1962 # $_ is used to store the input line, and is to be filtered by the
1963 # each_line_handler()s. So, if the format of the line is not in the desired
1964 # format for the main handler, these are used to do that adjusting. They can
1965 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1966 # so the $_ output of one is used as the input to the next. None of the other
1967 # handlers are stackable, but could easily be changed to be so.
1969 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1970 # which insert the parameters as lines to be processed before the next input
1971 # file line is read. This allows the EOF handler to flush buffers, for
1972 # example. The difference between the two routines is that the lines inserted
1973 # by insert_lines() are subjected to the each_line_handler()s. (So if you
1974 # called it from such a handler, you would get infinite recursion.) Lines
1975 # inserted by insert_adjusted_lines() go directly to the main handler without
1976 # any adjustments. If the post-processing handler calls any of these, there
1977 # will be no effect. Some error checking for these conditions could be added,
1978 # but it hasn't been done.
1980 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1981 # to prevent further processing of the line. This routine will output the
1982 # message as a warning once, and then keep a count of the lines that have the
1983 # same message, and output that count at the end of the file's processing.
1984 # This keeps the number of messages down to a manageable amount.
1986 # get_missings() should be called to retrieve any @missing input lines.
1987 # Messages will be raised if this isn't done if the options aren't to ignore
1990 sub trace { return main::trace(@_); }
1993 # Keep track of fields that are to be put into the constructor.
1994 my %constructor_fields;
1996 main::setup_package(Constructor_Fields => \%constructor_fields);
1998 my %file; # Input file name, required
1999 main::set_access('file', \%file, qw{ c r });
2001 my %first_released; # Unicode version file was first released in, required
2002 main::set_access('first_released', \%first_released, qw{ c r });
2004 my %handler; # Subroutine to process the input file, defaults to
2005 # 'process_generic_property_file'
2006 main::set_access('handler', \%handler, qw{ c });
2009 # name of property this file is for. defaults to none, meaning not
2010 # applicable, or is otherwise determinable, for example, from each line.
2011 main::set_access('property', \%property, qw{ c });
2014 # If this is true, the file is optional. If not present, no warning is
2015 # output. If it is present, the string given by this parameter is
2016 # evaluated, and if false the file is not processed.
2017 main::set_access('optional', \%optional, 'c', 'r');
2020 # This is used for debugging, to skip processing of all but a few input
2021 # files. Add 'non_skip => 1' to the constructor for those files you want
2022 # processed when you set the $debug_skip global.
2023 main::set_access('non_skip', \%non_skip, 'c');
2026 # This is used to skip processing of this input file semi-permanently,
2027 # when it evaluates to true. The value should be the reason the file is
2028 # being skipped. It is used for files that we aren't planning to process
2029 # anytime soon, but want to allow to be in the directory and not raise a
2030 # message that we are not handling. Mostly for test files. This is in
2031 # contrast to the non_skip element, which is supposed to be used very
2032 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2033 # pretty much will never look at can be placed in the global
2034 # %ignored_files instead. Ones used here will be added to %skipped files
2035 main::set_access('skip', \%skip, 'c');
2037 my %each_line_handler;
2038 # list of subroutines to look at and filter each non-comment line in the
2039 # file. defaults to none. The subroutines are called in order, each is
2040 # to adjust $_ for the next one, and the final one adjusts it for
2042 main::set_access('each_line_handler', \%each_line_handler, 'c');
2044 my %has_missings_defaults;
2045 # ? Are there lines in the file giving default values for code points
2046 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2047 # the norm, but IGNORED means it has such lines, but the handler doesn't
2048 # use them. Having these three states allows us to catch changes to the
2049 # UCD that this program should track
2050 main::set_access('has_missings_defaults',
2051 \%has_missings_defaults, qw{ c r });
2054 # Subroutine to call before doing anything else in the file. If undef, no
2055 # such handler is called.
2056 main::set_access('pre_handler', \%pre_handler, qw{ c });
2059 # Subroutine to call upon getting an EOF on the input file, but before
2060 # that is returned to the main handler. This is to allow buffers to be
2061 # flushed. The handler is expected to call insert_lines() or
2062 # insert_adjusted() with the buffered material
2063 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2066 # Subroutine to call after all the lines of the file are read in and
2067 # processed. If undef, no such handler is called.
2068 main::set_access('post_handler', \%post_handler, qw{ c });
2070 my %progress_message;
2071 # Message to print to display progress in lieu of the standard one
2072 main::set_access('progress_message', \%progress_message, qw{ c });
2075 # cache open file handle, internal. Is undef if file hasn't been
2076 # processed at all, empty if has;
2077 main::set_access('handle', \%handle);
2080 # cache of lines added virtually to the file, internal
2081 main::set_access('added_lines', \%added_lines);
2084 # cache of errors found, internal
2085 main::set_access('errors', \%errors);
2088 # storage of '@missing' defaults lines
2089 main::set_access('missings', \%missings);
2094 my $self = bless \do{ my $anonymous_scalar }, $class;
2095 my $addr = do { no overloading; pack 'J', $self; };
2098 $handler{$addr} = \&main::process_generic_property_file;
2099 $non_skip{$addr} = 0;
2101 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2102 $handle{$addr} = undef;
2103 $added_lines{$addr} = [ ];
2104 $each_line_handler{$addr} = [ ];
2105 $errors{$addr} = { };
2106 $missings{$addr} = [ ];
2108 # Two positional parameters.
2109 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2110 $file{$addr} = main::internal_file_to_platform(shift);
2111 $first_released{$addr} = shift;
2113 # The rest of the arguments are key => value pairs
2114 # %constructor_fields has been set up earlier to list all possible
2115 # ones. Either set or push, depending on how the default has been set
2118 foreach my $key (keys %args) {
2119 my $argument = $args{$key};
2121 # Note that the fields are the lower case of the constructor keys
2122 my $hash = $constructor_fields{lc $key};
2123 if (! defined $hash) {
2124 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2127 if (ref $hash->{$addr} eq 'ARRAY') {
2128 if (ref $argument eq 'ARRAY') {
2129 foreach my $argument (@{$argument}) {
2130 next if ! defined $argument;
2131 push @{$hash->{$addr}}, $argument;
2135 push @{$hash->{$addr}}, $argument if defined $argument;
2139 $hash->{$addr} = $argument;
2144 # If the file has a property for it, it means that the property is not
2145 # listed in the file's entries. So add a handler to the list of line
2146 # handlers to insert the property name into the lines, to provide a
2147 # uniform interface to the final processing subroutine.
2148 # the final code doesn't have to worry about that.
2149 if ($property{$addr}) {
2150 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2153 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2154 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2157 # If skipping, set to optional, and add to list of ignored files,
2158 # including its reason
2160 $optional{$addr} = 1;
2161 $skipped_files{$file{$addr}} = $skip{$addr}
2170 qw("") => "_operator_stringify",
2171 "." => \&main::_operator_dot,
2174 sub _operator_stringify {
2177 return __PACKAGE__ . " object for " . $self->file;
2180 # flag to make sure extracted files are processed early
2181 my $seen_non_extracted_non_age = 0;
2184 # Process the input object $self. This opens and closes the file and
2185 # calls all the handlers for it. Currently, this can only be called
2186 # once per file, as it destroy's the EOF handler
2189 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2191 my $addr = do { no overloading; pack 'J', $self; };
2193 my $file = $file{$addr};
2195 # Don't process if not expecting this file (because released later
2196 # than this Unicode version), and isn't there. This means if someone
2197 # copies it into an earlier version's directory, we will go ahead and
2199 return if $first_released{$addr} gt $v_version && ! -e $file;
2201 # If in debugging mode and this file doesn't have the non-skip
2202 # flag set, and isn't one of the critical files, skip it.
2204 && $first_released{$addr} ne v0
2205 && ! $non_skip{$addr})
2207 print "Skipping $file in debugging\n" if $verbosity;
2211 # File could be optional
2212 if ($optional{$addr}) {
2213 return unless -e $file;
2214 my $result = eval $optional{$addr};
2215 if (! defined $result) {
2216 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2221 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2227 if (! defined $file || ! -e $file) {
2229 # If the file doesn't exist, see if have internal data for it
2230 # (based on first_released being 0).
2231 if ($first_released{$addr} eq v0) {
2232 $handle{$addr} = 'pretend_is_open';
2235 if (! $optional{$addr} # File could be optional
2236 && $v_version ge $first_released{$addr})
2238 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2245 # Here, the file exists. Some platforms may change the case of
2247 if ($seen_non_extracted_non_age) {
2248 if ($file =~ /$EXTRACTED/i) {
2249 Carp::my_carp_bug(join_lines(<<END
2250 $file should be processed just after the 'Prop...Alias' files, and before
2251 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2252 have subtle problems
2257 elsif ($EXTRACTED_DIR
2258 && $first_released{$addr} ne v0
2259 && $file !~ /$EXTRACTED/i
2260 && lc($file) ne 'dage.txt')
2262 # We don't set this (by the 'if' above) if we have no
2263 # extracted directory, so if running on an early version,
2264 # this test won't work. Not worth worrying about.
2265 $seen_non_extracted_non_age = 1;
2268 # And mark the file as having being processed, and warn if it
2269 # isn't a file we are expecting. As we process the files,
2270 # they are deleted from the hash, so any that remain at the
2271 # end of the program are files that we didn't process.
2272 my $fkey = File::Spec->rel2abs($file);
2273 my $expecting = delete $potential_files{lc($fkey)};
2275 Carp::my_carp("Was not expecting '$file'.") if
2277 && ! defined $handle{$addr};
2279 # Having deleted from expected files, we can quit if not to do
2280 # anything. Don't print progress unless really want verbosity
2282 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2286 # Open the file, converting the slashes used in this program
2287 # into the proper form for the OS
2289 if (not open $file_handle, "<", $file) {
2290 Carp::my_carp("Can't open $file. Skipping: $!");
2293 $handle{$addr} = $file_handle; # Cache the open file handle
2296 if ($verbosity >= $PROGRESS) {
2297 if ($progress_message{$addr}) {
2298 print "$progress_message{$addr}\n";
2301 # If using a virtual file, say so.
2302 print "Processing ", (-e $file)
2304 : "substitute $file",
2310 # Call any special handler for before the file.
2311 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2313 # Then the main handler
2314 &{$handler{$addr}}($self);
2316 # Then any special post-file handler.
2317 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2319 # If any errors have been accumulated, output the counts (as the first
2320 # error message in each class was output when it was encountered).
2321 if ($errors{$addr}) {
2324 foreach my $error (keys %{$errors{$addr}}) {
2325 $total += $errors{$addr}->{$error};
2326 delete $errors{$addr}->{$error};
2331 = "A total of $total lines had errors in $file. ";
2333 $message .= ($types == 1)
2334 ? '(Only the first one was displayed.)'
2335 : '(Only the first of each type was displayed.)';
2336 Carp::my_carp($message);
2340 if (@{$missings{$addr}}) {
2341 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2344 # If a real file handle, close it.
2345 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2347 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2348 # the file, as opposed to undef
2353 # Sets $_ to be the next logical input line, if any. Returns non-zero
2354 # if such a line exists. 'logical' means that any lines that have
2355 # been added via insert_lines() will be returned in $_ before the file
2359 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2361 my $addr = do { no overloading; pack 'J', $self; };
2363 # Here the file is open (or if the handle is not a ref, is an open
2364 # 'virtual' file). Get the next line; any inserted lines get priority
2365 # over the file itself.
2369 while (1) { # Loop until find non-comment, non-empty line
2370 #local $to_trace = 1 if main::DEBUG;
2371 my $inserted_ref = shift @{$added_lines{$addr}};
2372 if (defined $inserted_ref) {
2373 ($adjusted, $_) = @{$inserted_ref};
2374 trace $adjusted, $_ if main::DEBUG && $to_trace;
2375 return 1 if $adjusted;
2378 last if ! ref $handle{$addr}; # Don't read unless is real file
2379 last if ! defined ($_ = readline $handle{$addr});
2382 trace $_ if main::DEBUG && $to_trace;
2384 # See if this line is the comment line that defines what property
2385 # value that code points that are not listed in the file should
2386 # have. The format or existence of these lines is not guaranteed
2387 # by Unicode since they are comments, but the documentation says
2388 # that this was added for machine-readability, so probably won't
2389 # change. This works starting in Unicode Version 5.0. They look
2392 # @missing: 0000..10FFFF; Not_Reordered
2393 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2394 # @missing: 0000..10FFFF; ; NaN
2396 # Save the line for a later get_missings() call.
2397 if (/$missing_defaults_prefix/) {
2398 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2399 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2401 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2402 my @defaults = split /\s* ; \s*/x, $_;
2404 # The first field is the @missing, which ends in a
2405 # semi-colon, so can safely shift.
2408 # Some of these lines may have empty field placeholders
2409 # which get in the way. An example is:
2410 # @missing: 0000..10FFFF; ; NaN
2411 # Remove them. Process starting from the top so the
2412 # splice doesn't affect things still to be looked at.
2413 for (my $i = @defaults - 1; $i >= 0; $i--) {
2414 next if $defaults[$i] ne "";
2415 splice @defaults, $i, 1;
2418 # What's left should be just the property (maybe) and the
2419 # default. Having only one element means it doesn't have
2423 if (@defaults >= 1) {
2424 if (@defaults == 1) {
2425 $default = $defaults[0];
2428 $property = $defaults[0];
2429 $default = $defaults[1];
2435 || ($default =~ /^</
2436 && $default !~ /^<code *point>$/i
2437 && $default !~ /^<none>$/i
2438 && $default !~ /^<script>$/i))
2440 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2444 # If the property is missing from the line, it should
2445 # be the one for the whole file
2446 $property = $property{$addr} if ! defined $property;
2448 # Change <none> to the null string, which is what it
2449 # really means. If the default is the code point
2450 # itself, set it to <code point>, which is what
2451 # Unicode uses (but sometimes they've forgotten the
2453 if ($default =~ /^<none>$/i) {
2456 elsif ($default =~ /^<code *point>$/i) {
2457 $default = $CODE_POINT;
2459 elsif ($default =~ /^<script>$/i) {
2461 # Special case this one. Currently is from
2462 # ScriptExtensions.txt, and means for all unlisted
2463 # code points, use their Script property values.
2464 # For the code points not listed in that file, the
2465 # default value is 'Unknown'.
2466 $default = "Unknown";
2469 # Store them as a sub-arrays with both components.
2470 push @{$missings{$addr}}, [ $default, $property ];
2474 # There is nothing for the caller to process on this comment
2479 # Remove comments and trailing space, and skip this line if the
2485 # Call any handlers for this line, and skip further processing of
2486 # the line if the handler sets the line to null.
2487 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2492 # Here the line is ok. return success.
2494 } # End of looping through lines.
2496 # If there is an EOF handler, call it (only once) and if it generates
2497 # more lines to process go back in the loop to handle them.
2498 if ($eof_handler{$addr}) {
2499 &{$eof_handler{$addr}}($self);
2500 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2501 goto LINE if $added_lines{$addr};
2504 # Return failure -- no more lines.
2509 # Not currently used, not fully tested.
2511 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2512 # # record. Not callable from an each_line_handler(), nor does it call
2513 # # an each_line_handler() on the line.
2516 # my $addr = do { no overloading; pack 'J', $self; };
2518 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2519 # my ($adjusted, $line) = @{$inserted_ref};
2520 # next if $adjusted;
2522 # # Remove comments and trailing space, and return a non-empty
2525 # $line =~ s/\s+$//;
2526 # return $line if $line ne "";
2529 # return if ! ref $handle{$addr}; # Don't read unless is real file
2530 # while (1) { # Loop until find non-comment, non-empty line
2531 # local $to_trace = 1 if main::DEBUG;
2532 # trace $_ if main::DEBUG && $to_trace;
2533 # return if ! defined (my $line = readline $handle{$addr});
2535 # push @{$added_lines{$addr}}, [ 0, $line ];
2538 # $line =~ s/\s+$//;
2539 # return $line if $line ne "";
2547 # Lines can be inserted so that it looks like they were in the input
2548 # file at the place it was when this routine is called. See also
2549 # insert_adjusted_lines(). Lines inserted via this routine go through
2550 # any each_line_handler()
2554 # Each inserted line is an array, with the first element being 0 to
2555 # indicate that this line hasn't been adjusted, and needs to be
2558 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2562 sub insert_adjusted_lines {
2563 # Lines can be inserted so that it looks like they were in the input
2564 # file at the place it was when this routine is called. See also
2565 # insert_lines(). Lines inserted via this routine are already fully
2566 # adjusted, ready to be processed; each_line_handler()s handlers will
2567 # not be called. This means this is not a completely general
2568 # facility, as only the last each_line_handler on the stack should
2569 # call this. It could be made more general, by passing to each of the
2570 # line_handlers their position on the stack, which they would pass on
2571 # to this routine, and that would replace the boolean first element in
2572 # the anonymous array pushed here, so that the next_line routine could
2573 # use that to call only those handlers whose index is after it on the
2574 # stack. But this is overkill for what is needed now.
2577 trace $_[0] if main::DEBUG && $to_trace;
2579 # Each inserted line is an array, with the first element being 1 to
2580 # indicate that this line has been adjusted
2582 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2587 # Returns the stored up @missings lines' values, and clears the list.
2588 # The values are in an array, consisting of the default in the first
2589 # element, and the property in the 2nd. However, since these lines
2590 # can be stacked up, the return is an array of all these arrays.
2593 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2595 my $addr = do { no overloading; pack 'J', $self; };
2597 # If not accepting a list return, just return the first one.
2598 return shift @{$missings{$addr}} unless wantarray;
2600 my @return = @{$missings{$addr}};
2601 undef @{$missings{$addr}};
2605 sub _insert_property_into_line {
2606 # Add a property field to $_, if this file requires it.
2609 my $addr = do { no overloading; pack 'J', $self; };
2610 my $property = $property{$addr};
2611 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2613 $_ =~ s/(;|$)/; $property$1/;
2618 # Output consistent error messages, using either a generic one, or the
2619 # one given by the optional parameter. To avoid gazillions of the
2620 # same message in case the syntax of a file is way off, this routine
2621 # only outputs the first instance of each message, incrementing a
2622 # count so the totals can be output at the end of the file.
2625 my $message = shift;
2626 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2628 my $addr = do { no overloading; pack 'J', $self; };
2630 $message = 'Unexpected line' unless $message;
2632 # No trailing punctuation so as to fit with our addenda.
2633 $message =~ s/[.:;,]$//;
2635 # If haven't seen this exact message before, output it now. Otherwise
2636 # increment the count of how many times it has occurred
2637 unless ($errors{$addr}->{$message}) {
2638 Carp::my_carp("$message in '$_' in "
2640 . " at line $.. Skipping this line;");
2641 $errors{$addr}->{$message} = 1;
2644 $errors{$addr}->{$message}++;
2647 # Clear the line to prevent any further (meaningful) processing of it.
2654 package Multi_Default;
2656 # Certain properties in early versions of Unicode had more than one possible
2657 # default for code points missing from the files. In these cases, one
2658 # default applies to everything left over after all the others are applied,
2659 # and for each of the others, there is a description of which class of code
2660 # points applies to it. This object helps implement this by storing the
2661 # defaults, and for all but that final default, an eval string that generates
2662 # the class that it applies to.
2667 main::setup_package();
2670 # The defaults structure for the classes
2671 main::set_access('class_defaults', \%class_defaults);
2674 # The default that applies to everything left over.
2675 main::set_access('other_default', \%other_default, 'r');
2679 # The constructor is called with default => eval pairs, terminated by
2680 # the left-over default. e.g.
2681 # Multi_Default->new(
2682 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2684 # 'R' => 'some other expression that evaluates to code points',
2692 my $self = bless \do{my $anonymous_scalar}, $class;
2693 my $addr = do { no overloading; pack 'J', $self; };
2696 my $default = shift;
2698 $class_defaults{$addr}->{$default} = $eval;
2701 $other_default{$addr} = shift;
2706 sub get_next_defaults {
2707 # Iterates and returns the next class of defaults.
2709 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2711 my $addr = do { no overloading; pack 'J', $self; };
2713 return each %{$class_defaults{$addr}};
2719 # An alias is one of the names that a table goes by. This class defines them
2720 # including some attributes. Everything is currently setup in the
2726 main::setup_package();
2729 main::set_access('name', \%name, 'r');
2732 # Should this name match loosely or not.
2733 main::set_access('loose_match', \%loose_match, 'r');
2735 my %make_re_pod_entry;
2736 # Some aliases should not get their own entries in the re section of the
2737 # pod, because they are covered by a wild-card, and some we want to
2738 # discourage use of. Binary
2739 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2742 # Is this documented to be accessible via Unicode::UCD
2743 main::set_access('ucd', \%ucd, 'r', 's');
2746 # Aliases have a status, like deprecated, or even suppressed (which means
2747 # they don't appear in documentation). Enum
2748 main::set_access('status', \%status, 'r');
2751 # Similarly, some aliases should not be considered as usable ones for
2752 # external use, such as file names, or we don't want documentation to
2753 # recommend them. Boolean
2754 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2759 my $self = bless \do { my $anonymous_scalar }, $class;
2760 my $addr = do { no overloading; pack 'J', $self; };
2762 $name{$addr} = shift;
2763 $loose_match{$addr} = shift;
2764 $make_re_pod_entry{$addr} = shift;
2765 $ok_as_filename{$addr} = shift;
2766 $status{$addr} = shift;
2767 $ucd{$addr} = shift;
2769 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2771 # Null names are never ok externally
2772 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2780 # A range is the basic unit for storing code points, and is described in the
2781 # comments at the beginning of the program. Each range has a starting code
2782 # point; an ending code point (not less than the starting one); a value
2783 # that applies to every code point in between the two end-points, inclusive;
2784 # and an enum type that applies to the value. The type is for the user's
2785 # convenience, and has no meaning here, except that a non-zero type is
2786 # considered to not obey the normal Unicode rules for having standard forms.
2788 # The same structure is used for both map and match tables, even though in the
2789 # latter, the value (and hence type) is irrelevant and could be used as a
2790 # comment. In map tables, the value is what all the code points in the range
2791 # map to. Type 0 values have the standardized version of the value stored as
2792 # well, so as to not have to recalculate it a lot.
2794 sub trace { return main::trace(@_); }
2798 main::setup_package();
2801 main::set_access('start', \%start, 'r', 's');
2804 main::set_access('end', \%end, 'r', 's');
2807 main::set_access('value', \%value, 'r');
2810 main::set_access('type', \%type, 'r');
2813 # The value in internal standard form. Defined only if the type is 0.
2814 main::set_access('standard_form', \%standard_form);
2816 # Note that if these fields change, the dump() method should as well
2819 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2822 my $self = bless \do { my $anonymous_scalar }, $class;
2823 my $addr = do { no overloading; pack 'J', $self; };
2825 $start{$addr} = shift;
2826 $end{$addr} = shift;
2830 my $value = delete $args{'Value'}; # Can be 0
2831 $value = "" unless defined $value;
2832 $value{$addr} = $value;
2834 $type{$addr} = delete $args{'Type'} || 0;
2836 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2838 if (! $type{$addr}) {
2839 $standard_form{$addr} = main::standardize($value);
2847 qw("") => "_operator_stringify",
2848 "." => \&main::_operator_dot,
2851 sub _operator_stringify {
2853 my $addr = do { no overloading; pack 'J', $self; };
2855 # Output it like '0041..0065 (value)'
2856 my $return = sprintf("%04X", $start{$addr})
2858 . sprintf("%04X", $end{$addr});
2859 my $value = $value{$addr};
2860 my $type = $type{$addr};
2862 $return .= "$value";
2863 $return .= ", Type=$type" if $type != 0;
2870 # The standard form is the value itself if the standard form is
2871 # undefined (that is if the value is special)
2874 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2876 my $addr = do { no overloading; pack 'J', $self; };
2878 return $standard_form{$addr} if defined $standard_form{$addr};
2879 return $value{$addr};
2883 # Human, not machine readable. For machine readable, comment out this
2884 # entire routine and let the standard one take effect.
2887 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2889 my $addr = do { no overloading; pack 'J', $self; };
2891 my $return = $indent
2892 . sprintf("%04X", $start{$addr})
2894 . sprintf("%04X", $end{$addr})
2895 . " '$value{$addr}';";
2896 if (! defined $standard_form{$addr}) {
2897 $return .= "(type=$type{$addr})";
2899 elsif ($standard_form{$addr} ne $value{$addr}) {
2900 $return .= "(standard '$standard_form{$addr}')";
2906 package _Range_List_Base;
2908 # Base class for range lists. A range list is simply an ordered list of
2909 # ranges, so that the ranges with the lowest starting numbers are first in it.
2911 # When a new range is added that is adjacent to an existing range that has the
2912 # same value and type, it merges with it to form a larger range.
2914 # Ranges generally do not overlap, except that there can be multiple entries
2915 # of single code point ranges. This is because of NameAliases.txt.
2917 # In this program, there is a standard value such that if two different
2918 # values, have the same standard value, they are considered equivalent. This
2919 # value was chosen so that it gives correct results on Unicode data
2921 # There are a number of methods to manipulate range lists, and some operators
2922 # are overloaded to handle them.
2924 sub trace { return main::trace(@_); }
2930 main::setup_package();
2933 # The list of ranges
2934 main::set_access('ranges', \%ranges, 'readable_array');
2937 # The highest code point in the list. This was originally a method, but
2938 # actual measurements said it was used a lot.
2939 main::set_access('max', \%max, 'r');
2941 my %each_range_iterator;
2942 # Iterator position for each_range()
2943 main::set_access('each_range_iterator', \%each_range_iterator);
2946 # Name of parent this is attached to, if any. Solely for better error
2948 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2950 my %_search_ranges_cache;
2951 # A cache of the previous result from _search_ranges(), for better
2953 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2959 # Optional initialization data for the range list.
2960 my $initialize = delete $args{'Initialize'};
2964 # Use _union() to initialize. _union() returns an object of this
2965 # class, which means that it will call this constructor recursively.
2966 # But it won't have this $initialize parameter so that it won't
2967 # infinitely loop on this.
2968 return _union($class, $initialize, %args) if defined $initialize;
2970 $self = bless \do { my $anonymous_scalar }, $class;
2971 my $addr = do { no overloading; pack 'J', $self; };
2973 # Optional parent object, only for debug info.
2974 $owner_name_of{$addr} = delete $args{'Owner'};
2975 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2977 # Stringify, in case it is an object.
2978 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2980 # This is used only for error messages, and so a colon is added
2981 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2983 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2985 # Max is initialized to a negative value that isn't adjacent to 0,
2989 $_search_ranges_cache{$addr} = 0;
2990 $ranges{$addr} = [];
2997 qw("") => "_operator_stringify",
2998 "." => \&main::_operator_dot,
3001 sub _operator_stringify {
3003 my $addr = do { no overloading; pack 'J', $self; };
3005 return "Range_List attached to '$owner_name_of{$addr}'"
3006 if $owner_name_of{$addr};
3007 return "anonymous Range_List " . \$self;
3011 # Returns the union of the input code points. It can be called as
3012 # either a constructor or a method. If called as a method, the result
3013 # will be a new() instance of the calling object, containing the union
3014 # of that object with the other parameter's code points; if called as
3015 # a constructor, the first parameter gives the class the new object
3016 # should be, and the second parameter gives the code points to go into
3018 # In either case, there are two parameters looked at by this routine;
3019 # any additional parameters are passed to the new() constructor.
3021 # The code points can come in the form of some object that contains
3022 # ranges, and has a conventionally named method to access them; or
3023 # they can be an array of individual code points (as integers); or
3024 # just a single code point.
3026 # If they are ranges, this routine doesn't make any effort to preserve
3027 # the range values of one input over the other. Therefore this base
3028 # class should not allow _union to be called from other than
3029 # initialization code, so as to prevent two tables from being added
3030 # together where the range values matter. The general form of this
3031 # routine therefore belongs in a derived class, but it was moved here
3032 # to avoid duplication of code. The failure to overload this in this
3033 # class keeps it safe.
3037 my @args; # Arguments to pass to the constructor
3041 # If a method call, will start the union with the object itself, and
3042 # the class of the new object will be the same as self.
3049 # Add the other required parameter.
3051 # Rest of parameters are passed on to the constructor
3053 # Accumulate all records from both lists.
3055 for my $arg (@args) {
3056 #local $to_trace = 0 if main::DEBUG;
3057 trace "argument = $arg" if main::DEBUG && $to_trace;
3058 if (! defined $arg) {
3060 if (defined $self) {
3062 $message .= $owner_name_of{pack 'J', $self};
3064 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
3067 $arg = [ $arg ] if ! ref $arg;
3068 my $type = ref $arg;
3069 if ($type eq 'ARRAY') {
3070 foreach my $element (@$arg) {
3071 push @records, Range->new($element, $element);
3074 elsif ($arg->isa('Range')) {
3075 push @records, $arg;
3077 elsif ($arg->can('ranges')) {
3078 push @records, $arg->ranges;
3082 if (defined $self) {
3084 $message .= $owner_name_of{pack 'J', $self};
3086 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3091 # Sort with the range containing the lowest ordinal first, but if
3092 # two ranges start at the same code point, sort with the bigger range
3093 # of the two first, because it takes fewer cycles.
3094 @records = sort { ($a->start <=> $b->start)
3096 # if b is shorter than a, b->end will be
3097 # less than a->end, and we want to select
3098 # a, so want to return -1
3099 ($b->end <=> $a->end)
3102 my $new = $class->new(@_);
3104 # Fold in records so long as they add new information.
3105 for my $set (@records) {
3106 my $start = $set->start;
3107 my $end = $set->end;
3108 my $value = $set->value;
3109 if ($start > $new->max) {
3110 $new->_add_delete('+', $start, $end, $value);
3112 elsif ($end > $new->max) {
3113 $new->_add_delete('+', $new->max +1, $end, $value);
3120 sub range_count { # Return the number of ranges in the range list
3122 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3125 return scalar @{$ranges{pack 'J', $self}};
3129 # Returns the minimum code point currently in the range list, or if
3130 # the range list is empty, 2 beyond the max possible. This is a
3131 # method because used so rarely, that not worth saving between calls,
3132 # and having to worry about changing it as ranges are added and
3136 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3138 my $addr = do { no overloading; pack 'J', $self; };
3140 # If the range list is empty, return a large value that isn't adjacent
3141 # to any that could be in the range list, for simpler tests
3142 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3143 return $ranges{$addr}->[0]->start;
3147 # Boolean: Is argument in the range list? If so returns $i such that:
3148 # range[$i]->end < $codepoint <= range[$i+1]->end
3149 # which is one beyond what you want; this is so that the 0th range
3150 # doesn't return false
3152 my $codepoint = shift;
3153 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3155 my $i = $self->_search_ranges($codepoint);
3156 return 0 unless defined $i;
3158 # The search returns $i, such that
3159 # range[$i-1]->end < $codepoint <= range[$i]->end
3160 # So is in the table if and only iff it is at least the start position
3163 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3167 sub containing_range {
3168 # Returns the range object that contains the code point, undef if none
3171 my $codepoint = shift;
3172 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3174 my $i = $self->contains($codepoint);
3177 # contains() returns 1 beyond where we should look
3179 return $ranges{pack 'J', $self}->[$i-1];
3183 # Returns the value associated with the code point, undef if none
3186 my $codepoint = shift;
3187 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3189 my $range = $self->containing_range($codepoint);
3190 return unless defined $range;
3192 return $range->value;
3196 # Returns the type of the range containing the code point, undef if
3197 # the code point is not in the table
3200 my $codepoint = shift;
3201 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3203 my $range = $self->containing_range($codepoint);
3204 return unless defined $range;
3206 return $range->type;
3209 sub _search_ranges {
3210 # Find the range in the list which contains a code point, or where it
3211 # should go if were to add it. That is, it returns $i, such that:
3212 # range[$i-1]->end < $codepoint <= range[$i]->end
3213 # Returns undef if no such $i is possible (e.g. at end of table), or
3214 # if there is an error.
3217 my $code_point = shift;
3218 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3220 my $addr = do { no overloading; pack 'J', $self; };
3222 return if $code_point > $max{$addr};
3223 my $r = $ranges{$addr}; # The current list of ranges
3224 my $range_list_size = scalar @$r;
3227 use integer; # want integer division
3229 # Use the cached result as the starting guess for this one, because,
3230 # an experiment on 5.1 showed that 90% of the time the cache was the
3231 # same as the result on the next call (and 7% it was one less).
3232 $i = $_search_ranges_cache{$addr};
3233 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3234 # from an intervening deletion
3235 #local $to_trace = 1 if main::DEBUG;
3236 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3237 return $i if $code_point <= $r->[$i]->end
3238 && ($i == 0 || $r->[$i-1]->end < $code_point);
3240 # Here the cache doesn't yield the correct $i. Try adding 1.
3241 if ($i < $range_list_size - 1
3242 && $r->[$i]->end < $code_point &&
3243 $code_point <= $r->[$i+1]->end)
3246 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3247 $_search_ranges_cache{$addr} = $i;
3251 # Here, adding 1 also didn't work. We do a binary search to
3252 # find the correct position, starting with current $i
3254 my $upper = $range_list_size - 1;
3256 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3258 if ($code_point <= $r->[$i]->end) {
3260 # Here we have met the upper constraint. We can quit if we
3261 # also meet the lower one.
3262 last if $i == 0 || $r->[$i-1]->end < $code_point;
3264 $upper = $i; # Still too high.
3269 # Here, $r[$i]->end < $code_point, so look higher up.
3273 # Split search domain in half to try again.
3274 my $temp = ($upper + $lower) / 2;
3276 # No point in continuing unless $i changes for next time
3280 # We can't reach the highest element because of the averaging.
3281 # So if one below the upper edge, force it there and try one
3283 if ($i == $range_list_size - 2) {
3285 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3286 $i = $range_list_size - 1;
3288 # Change $lower as well so if fails next time through,
3289 # taking the average will yield the same $i, and we will
3290 # quit with the error message just below.
3294 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3298 } # End of while loop
3300 if (main::DEBUG && $to_trace) {
3301 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3302 trace "i= [ $i ]", $r->[$i];
3303 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3306 # Here we have found the offset. Cache it as a starting point for the
3308 $_search_ranges_cache{$addr} = $i;
3313 # Add, replace or delete ranges to or from a list. The $type
3314 # parameter gives which:
3315 # '+' => insert or replace a range, returning a list of any changed
3317 # '-' => delete a range, returning a list of any deleted ranges.
3319 # The next three parameters give respectively the start, end, and
3320 # value associated with the range. 'value' should be null unless the
3323 # The range list is kept sorted so that the range with the lowest
3324 # starting position is first in the list, and generally, adjacent
3325 # ranges with the same values are merged into a single larger one (see
3326 # exceptions below).
3328 # There are more parameters; all are key => value pairs:
3329 # Type gives the type of the value. It is only valid for '+'.
3330 # All ranges have types; if this parameter is omitted, 0 is
3331 # assumed. Ranges with type 0 are assumed to obey the
3332 # Unicode rules for casing, etc; ranges with other types are
3333 # not. Otherwise, the type is arbitrary, for the caller's
3334 # convenience, and looked at only by this routine to keep
3335 # adjacent ranges of different types from being merged into
3336 # a single larger range, and when Replace =>
3337 # $IF_NOT_EQUIVALENT is specified (see just below).
3338 # Replace determines what to do if the range list already contains
3339 # ranges which coincide with all or portions of the input
3340 # range. It is only valid for '+':
3341 # => $NO means that the new value is not to replace
3342 # any existing ones, but any empty gaps of the
3343 # range list coinciding with the input range
3344 # will be filled in with the new value.
3345 # => $UNCONDITIONALLY means to replace the existing values with
3346 # this one unconditionally. However, if the
3347 # new and old values are identical, the
3348 # replacement is skipped to save cycles
3349 # => $IF_NOT_EQUIVALENT means to replace the existing values
3350 # with this one if they are not equivalent.
3351 # Ranges are equivalent if their types are the
3352 # same, and they are the same string; or if
3353 # both are type 0 ranges, if their Unicode
3354 # standard forms are identical. In this last
3355 # case, the routine chooses the more "modern"
3356 # one to use. This is because some of the
3357 # older files are formatted with values that
3358 # are, for example, ALL CAPs, whereas the
3359 # derived files have a more modern style,
3360 # which looks better. By looking for this
3361 # style when the pre-existing and replacement
3362 # standard forms are the same, we can move to
3364 # => $MULTIPLE_BEFORE means that if this range duplicates an
3365 # existing one, but has a different value,
3366 # don't replace the existing one, but insert
3367 # this, one so that the same range can occur
3368 # multiple times. They are stored LIFO, so
3369 # that the final one inserted is the first one
3370 # returned in an ordered search of the table.
3371 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3372 # FIFO, so that this one is inserted after all
3373 # others that currently exist.
3374 # => anything else is the same as => $IF_NOT_EQUIVALENT
3376 # "same value" means identical for non-type-0 ranges, and it means
3377 # having the same standard forms for type-0 ranges.
3379 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3382 my $operation = shift; # '+' for add/replace; '-' for delete;
3389 $value = "" if not defined $value; # warning: $value can be "0"
3391 my $replace = delete $args{'Replace'};
3392 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3394 my $type = delete $args{'Type'};
3395 $type = 0 unless defined $type;
3397 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3399 my $addr = do { no overloading; pack 'J', $self; };
3401 if ($operation ne '+' && $operation ne '-') {
3402 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3405 unless (defined $start && defined $end) {
3406 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3409 unless ($end >= $start) {
3410 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3413 #local $to_trace = 1 if main::DEBUG;
3415 if ($operation eq '-') {
3416 if ($replace != $IF_NOT_EQUIVALENT) {
3417 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3418 $replace = $IF_NOT_EQUIVALENT;
3421 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3425 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3430 my $r = $ranges{$addr}; # The current list of ranges
3431 my $range_list_size = scalar @$r; # And its size
3432 my $max = $max{$addr}; # The current high code point in
3433 # the list of ranges
3435 # Do a special case requiring fewer machine cycles when the new range
3436 # starts after the current highest point. The Unicode input data is
3437 # structured so this is common.
3438 if ($start > $max) {
3440 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3441 return if $operation eq '-'; # Deleting a non-existing range is a
3444 # If the new range doesn't logically extend the current final one
3445 # in the range list, create a new range at the end of the range
3446 # list. (max cleverly is initialized to a negative number not
3447 # adjacent to 0 if the range list is empty, so even adding a range
3448 # to an empty range list starting at 0 will have this 'if'
3450 if ($start > $max + 1 # non-adjacent means can't extend.
3451 || @{$r}[-1]->value ne $value # values differ, can't extend.
3452 || @{$r}[-1]->type != $type # types differ, can't extend.
3454 push @$r, Range->new($start, $end,
3460 # Here, the new range starts just after the current highest in
3461 # the range list, and they have the same type and value.
3462 # Extend the current range to incorporate the new one.
3463 @{$r}[-1]->set_end($end);
3466 # This becomes the new maximum.
3471 #local $to_trace = 0 if main::DEBUG;
3473 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3475 # Here, the input range isn't after the whole rest of the range list.
3476 # Most likely 'splice' will be needed. The rest of the routine finds
3477 # the needed splice parameters, and if necessary, does the splice.
3478 # First, find the offset parameter needed by the splice function for
3479 # the input range. Note that the input range may span multiple
3480 # existing ones, but we'll worry about that later. For now, just find
3481 # the beginning. If the input range is to be inserted starting in a
3482 # position not currently in the range list, it must (obviously) come
3483 # just after the range below it, and just before the range above it.
3484 # Slightly less obviously, it will occupy the position currently
3485 # occupied by the range that is to come after it. More formally, we
3486 # are looking for the position, $i, in the array of ranges, such that:
3488 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3490 # (The ordered relationships within existing ranges are also shown in
3491 # the equation above). However, if the start of the input range is
3492 # within an existing range, the splice offset should point to that
3493 # existing range's position in the list; that is $i satisfies a
3494 # somewhat different equation, namely:
3496 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3498 # More briefly, $start can come before or after r[$i]->start, and at
3499 # this point, we don't know which it will be. However, these
3500 # two equations share these constraints:
3502 # r[$i-1]->end < $start <= r[$i]->end
3504 # And that is good enough to find $i.
3506 my $i = $self->_search_ranges($start);
3508 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3512 # The search function returns $i such that:
3514 # r[$i-1]->end < $start <= r[$i]->end
3516 # That means that $i points to the first range in the range list
3517 # that could possibly be affected by this operation. We still don't
3518 # know if the start of the input range is within r[$i], or if it
3519 # points to empty space between r[$i-1] and r[$i].
3520 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3522 # Special case the insertion of data that is not to replace any
3524 if ($replace == $NO) { # If $NO, has to be operation '+'
3525 #local $to_trace = 1 if main::DEBUG;
3526 trace "Doesn't replace" if main::DEBUG && $to_trace;
3528 # Here, the new range is to take effect only on those code points
3529 # that aren't already in an existing range. This can be done by
3530 # looking through the existing range list and finding the gaps in
3531 # the ranges that this new range affects, and then calling this
3532 # function recursively on each of those gaps, leaving untouched
3533 # anything already in the list. Gather up a list of the changed
3534 # gaps first so that changes to the internal state as new ranges
3535 # are added won't be a problem.
3538 # First, if the starting point of the input range is outside an
3539 # existing one, there is a gap from there to the beginning of the
3540 # existing range -- add a span to fill the part that this new
3542 if ($start < $r->[$i]->start) {
3543 push @gap_list, Range->new($start,
3545 $r->[$i]->start - 1),
3547 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3550 # Then look through the range list for other gaps until we reach
3551 # the highest range affected by the input one.
3553 for ($j = $i+1; $j < $range_list_size; $j++) {
3554 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3555 last if $end < $r->[$j]->start;
3557 # If there is a gap between when this range starts and the
3558 # previous one ends, add a span to fill it. Note that just
3559 # because there are two ranges doesn't mean there is a
3560 # non-zero gap between them. It could be that they have
3561 # different values or types
3562 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3564 Range->new($r->[$j-1]->end + 1,
3565 $r->[$j]->start - 1,
3567 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3571 # Here, we have either found an existing range in the range list,
3572 # beyond the area affected by the input one, or we fell off the
3573 # end of the loop because the input range affects the whole rest
3574 # of the range list. In either case, $j is 1 higher than the
3575 # highest affected range. If $j == $i, it means that there are no
3576 # affected ranges, that the entire insertion is in the gap between
3577 # r[$i-1], and r[$i], which we already have taken care of before
3579 # On the other hand, if there are affected ranges, it might be
3580 # that there is a gap that needs filling after the final such
3581 # range to the end of the input range
3582 if ($r->[$j-1]->end < $end) {
3583 push @gap_list, Range->new(main::max($start,
3584 $r->[$j-1]->end + 1),
3587 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3590 # Call recursively to fill in all the gaps.
3591 foreach my $gap (@gap_list) {
3592 $self->_add_delete($operation,
3602 # Here, we have taken care of the case where $replace is $NO.
3603 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3604 # If inserting a multiple record, this is where it goes, before the
3605 # first (if any) existing one if inserting LIFO. (If this is to go
3606 # afterwards, FIFO, we below move the pointer to there.) These imply
3607 # an insertion, and no change to any existing ranges. Note that $i
3608 # can be -1 if this new range doesn't actually duplicate any existing,
3609 # and comes at the beginning of the list.
3610 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3612 if ($start != $end) {
3613 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3617 # If the new code point is within a current range ...
3618 if ($end >= $r->[$i]->start) {
3620 # Don't add an exact duplicate, as it isn't really a multiple
3621 my $existing_value = $r->[$i]->value;
3622 my $existing_type = $r->[$i]->type;
3623 return if $value eq $existing_value && $type eq $existing_type;
3625 # If the multiple value is part of an existing range, we want
3626 # to split up that range, so that only the single code point
3627 # is affected. To do this, we first call ourselves
3628 # recursively to delete that code point from the table, having
3629 # preserved its current data above. Then we call ourselves
3630 # recursively again to add the new multiple, which we know by
3631 # the test just above is different than the current code
3632 # point's value, so it will become a range containing a single
3633 # code point: just itself. Finally, we add back in the
3634 # pre-existing code point, which will again be a single code
3635 # point range. Because 'i' likely will have changed as a
3636 # result of these operations, we can't just continue on, but
3637 # do this operation recursively as well. If we are inserting
3638 # LIFO, the pre-existing code point needs to go after the new
3639 # one, so use MULTIPLE_AFTER; and vice versa.
3640 if ($r->[$i]->start != $r->[$i]->end) {
3641 $self->_add_delete('-', $start, $end, "");
3642 $self->_add_delete('+', $start, $end, $value, Type => $type);
3643 return $self->_add_delete('+',
3646 Type => $existing_type,
3647 Replace => ($replace == $MULTIPLE_BEFORE)
3649 : $MULTIPLE_BEFORE);
3653 # If to place this new record after, move to beyond all existing
3655 if ($replace == $MULTIPLE_AFTER) {
3656 while ($i < @$r && $r->[$i]->start == $start) {
3661 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3662 my @return = splice @$r,
3669 if (main::DEBUG && $to_trace) {
3670 trace "After splice:";
3671 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3672 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3673 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3674 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3675 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3676 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3681 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
3682 # leaves delete, insert, and replace either unconditionally or if not
3683 # equivalent. $i still points to the first potential affected range.
3684 # Now find the highest range affected, which will determine the length
3685 # parameter to splice. (The input range can span multiple existing
3686 # ones.) If this isn't a deletion, while we are looking through the
3687 # range list, see also if this is a replacement rather than a clean
3688 # insertion; that is if it will change the values of at least one
3689 # existing range. Start off assuming it is an insert, until find it
3691 my $clean_insert = $operation eq '+';
3692 my $j; # This will point to the highest affected range
3694 # For non-zero types, the standard form is the value itself;
3695 my $standard_form = ($type) ? $value : main::standardize($value);
3697 for ($j = $i; $j < $range_list_size; $j++) {
3698 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3700 # If find a range that it doesn't overlap into, we can stop
3702 last if $end < $r->[$j]->start;
3704 # Here, overlaps the range at $j. If the values don't match,
3705 # and so far we think this is a clean insertion, it becomes a
3706 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3707 if ($clean_insert) {
3708 if ($r->[$j]->standard_form ne $standard_form) {
3710 if ($replace == $CROAK) {
3711 main::croak("The range to add "
3712 . sprintf("%04X", $start)
3714 . sprintf("%04X", $end)
3715 . " with value '$value' overlaps an existing range $r->[$j]");
3720 # Here, the two values are essentially the same. If the
3721 # two are actually identical, replacing wouldn't change
3722 # anything so skip it.
3723 my $pre_existing = $r->[$j]->value;
3724 if ($pre_existing ne $value) {
3726 # Here the new and old standardized values are the
3727 # same, but the non-standardized values aren't. If
3728 # replacing unconditionally, then replace
3729 if( $replace == $UNCONDITIONALLY) {
3734 # Here, are replacing conditionally. Decide to
3735 # replace or not based on which appears to look
3736 # the "nicest". If one is mixed case and the
3737 # other isn't, choose the mixed case one.
3738 my $new_mixed = $value =~ /[A-Z]/
3739 && $value =~ /[a-z]/;
3740 my $old_mixed = $pre_existing =~ /[A-Z]/
3741 && $pre_existing =~ /[a-z]/;
3743 if ($old_mixed != $new_mixed) {
3744 $clean_insert = 0 if $new_mixed;
3745 if (main::DEBUG && $to_trace) {
3746 if ($clean_insert) {
3747 trace "Retaining $pre_existing over $value";
3750 trace "Replacing $pre_existing with $value";
3756 # Here casing wasn't different between the two.
3757 # If one has hyphens or underscores and the
3758 # other doesn't, choose the one with the
3760 my $new_punct = $value =~ /[-_]/;
3761 my $old_punct = $pre_existing =~ /[-_]/;
3763 if ($old_punct != $new_punct) {
3764 $clean_insert = 0 if $new_punct;
3765 if (main::DEBUG && $to_trace) {
3766 if ($clean_insert) {
3767 trace "Retaining $pre_existing over $value";
3770 trace "Replacing $pre_existing with $value";
3773 } # else existing one is just as "good";
3774 # retain it to save cycles.
3780 } # End of loop looking for highest affected range.
3782 # Here, $j points to one beyond the highest range that this insertion
3783 # affects (hence to beyond the range list if that range is the final
3784 # one in the range list).
3786 # The splice length is all the affected ranges. Get it before
3787 # subtracting, for efficiency, so we don't have to later add 1.
3788 my $length = $j - $i;
3790 $j--; # $j now points to the highest affected range.
3791 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3793 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3794 # $j points to the highest affected range. But it can be < $i or even
3795 # -1. These happen only if the insertion is entirely in the gap
3796 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3797 # above exited first time through with $end < $r->[$i]->start. (And
3798 # then we subtracted one from j) This implies also that $start <
3799 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3800 # $start, so the entire input range is in the gap.
3803 # Here the entire input range is in the gap before $i.
3805 if (main::DEBUG && $to_trace) {
3807 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3810 trace "Entire range is before $r->[$i]";
3813 return if $operation ne '+'; # Deletion of a non-existent range is
3818 # Here part of the input range is not in the gap before $i. Thus,
3819 # there is at least one affected one, and $j points to the highest
3822 # At this point, here is the situation:
3823 # This is not an insertion of a multiple, nor of tentative ($NO)
3825 # $i points to the first element in the current range list that
3826 # may be affected by this operation. In fact, we know
3827 # that the range at $i is affected because we are in
3828 # the else branch of this 'if'
3829 # $j points to the highest affected range.
3831 # r[$i-1]->end < $start <= r[$i]->end
3833 # r[$i-1]->end < $start <= $end <= r[$j]->end
3836 # $clean_insert is a boolean which is set true if and only if
3837 # this is a "clean insertion", i.e., not a change nor a
3838 # deletion (multiple was handled above).
3840 # We now have enough information to decide if this call is a no-op
3841 # or not. It is a no-op if this is an insertion of already
3844 if (main::DEBUG && $to_trace && $clean_insert
3846 && $start >= $r->[$i]->start)
3850 return if $clean_insert
3851 && $i == $j # more than one affected range => not no-op
3853 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3854 # Further, $start and/or $end is >= r[$i]->start
3855 # The test below hence guarantees that
3856 # r[$i]->start < $start <= $end <= r[$i]->end
3857 # This means the input range is contained entirely in
3858 # the one at $i, so is a no-op
3859 && $start >= $r->[$i]->start;
3862 # Here, we know that some action will have to be taken. We have
3863 # calculated the offset and length (though adjustments may be needed)
3864 # for the splice. Now start constructing the replacement list.
3866 my $splice_start = $i;
3871 # See if should extend any adjacent ranges.
3872 if ($operation eq '-') { # Don't extend deletions
3873 $extends_below = $extends_above = 0;
3875 else { # Here, should extend any adjacent ranges. See if there are
3877 $extends_below = ($i > 0
3878 # can't extend unless adjacent
3879 && $r->[$i-1]->end == $start -1
3880 # can't extend unless are same standard value
3881 && $r->[$i-1]->standard_form eq $standard_form
3882 # can't extend unless share type
3883 && $r->[$i-1]->type == $type);
3884 $extends_above = ($j+1 < $range_list_size
3885 && $r->[$j+1]->start == $end +1
3886 && $r->[$j+1]->standard_form eq $standard_form
3887 && $r->[$j+1]->type == $type);
3889 if ($extends_below && $extends_above) { # Adds to both
3890 $splice_start--; # start replace at element below
3891 $length += 2; # will replace on both sides
3892 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3894 # The result will fill in any gap, replacing both sides, and
3895 # create one large range.
3896 @replacement = Range->new($r->[$i-1]->start,
3903 # Here we know that the result won't just be the conglomeration of
3904 # a new range with both its adjacent neighbors. But it could
3905 # extend one of them.
3907 if ($extends_below) {
3909 # Here the new element adds to the one below, but not to the
3910 # one above. If inserting, and only to that one range, can
3911 # just change its ending to include the new one.
3912 if ($length == 0 && $clean_insert) {
3913 $r->[$i-1]->set_end($end);
3914 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3918 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3919 $splice_start--; # start replace at element below
3920 $length++; # will replace the element below
3921 $start = $r->[$i-1]->start;
3924 elsif ($extends_above) {
3926 # Here the new element adds to the one above, but not below.
3927 # Mirror the code above
3928 if ($length == 0 && $clean_insert) {
3929 $r->[$j+1]->set_start($start);
3930 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3934 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3935 $length++; # will replace the element above
3936 $end = $r->[$j+1]->end;
3940 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3942 # Finally, here we know there will have to be a splice.
3943 # If the change or delete affects only the highest portion of the
3944 # first affected range, the range will have to be split. The
3945 # splice will remove the whole range, but will replace it by a new
3946 # range containing just the unaffected part. So, in this case,
3947 # add to the replacement list just this unaffected portion.
3948 if (! $extends_below
3949 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3952 Range->new($r->[$i]->start,
3954 Value => $r->[$i]->value,
3955 Type => $r->[$i]->type);
3958 # In the case of an insert or change, but not a delete, we have to
3959 # put in the new stuff; this comes next.
3960 if ($operation eq '+') {
3961 push @replacement, Range->new($start,
3967 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3968 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3970 # And finally, if we're changing or deleting only a portion of the
3971 # highest affected range, it must be split, as the lowest one was.
3972 if (! $extends_above
3973 && $j >= 0 # Remember that j can be -1 if before first
3975 && $end >= $r->[$j]->start
3976 && $end < $r->[$j]->end)
3979 Range->new($end + 1,
3981 Value => $r->[$j]->value,
3982 Type => $r->[$j]->type);
3986 # And do the splice, as calculated above
3987 if (main::DEBUG && $to_trace) {
3988 trace "replacing $length element(s) at $i with ";
3989 foreach my $replacement (@replacement) {
3990 trace " $replacement";
3992 trace "Before splice:";
3993 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3994 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3995 trace "i =[", $i, "]", $r->[$i];
3996 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3997 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4000 my @return = splice @$r, $splice_start, $length, @replacement;
4002 if (main::DEBUG && $to_trace) {
4003 trace "After splice:";
4004 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4005 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4006 trace "i =[", $i, "]", $r->[$i];
4007 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4008 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4009 trace "removed ", @return if @return;
4012 # An actual deletion could have changed the maximum in the list.
4013 # There was no deletion if the splice didn't return something, but
4014 # otherwise recalculate it. This is done too rarely to worry about
4016 if ($operation eq '-' && @return) {
4017 $max{$addr} = $r->[-1]->end;
4022 sub reset_each_range { # reset the iterator for each_range();
4024 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4027 undef $each_range_iterator{pack 'J', $self};
4032 # Iterate over each range in a range list. Results are undefined if
4033 # the range list is changed during the iteration.
4036 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4038 my $addr = do { no overloading; pack 'J', $self; };
4040 return if $self->is_empty;
4042 $each_range_iterator{$addr} = -1
4043 if ! defined $each_range_iterator{$addr};
4044 $each_range_iterator{$addr}++;
4045 return $ranges{$addr}->[$each_range_iterator{$addr}]
4046 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4047 undef $each_range_iterator{$addr};
4051 sub count { # Returns count of code points in range list
4053 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4055 my $addr = do { no overloading; pack 'J', $self; };
4058 foreach my $range (@{$ranges{$addr}}) {
4059 $count += $range->end - $range->start + 1;
4064 sub delete_range { # Delete a range
4069 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4071 return $self->_add_delete('-', $start, $end, "");
4074 sub is_empty { # Returns boolean as to if a range list is empty
4076 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4079 return scalar @{$ranges{pack 'J', $self}} == 0;
4083 # Quickly returns a scalar suitable for separating tables into
4084 # buckets, i.e. it is a hash function of the contents of a table, so
4085 # there are relatively few conflicts.
4088 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4090 my $addr = do { no overloading; pack 'J', $self; };
4092 # These are quickly computable. Return looks like 'min..max;count'
4093 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4095 } # End closure for _Range_List_Base
4098 use base '_Range_List_Base';
4100 # A Range_List is a range list for match tables; i.e. the range values are
4101 # not significant. Thus a number of operations can be safely added to it,
4102 # such as inversion, intersection. Note that union is also an unsafe
4103 # operation when range values are cared about, and that method is in the base
4104 # class, not here. But things are set up so that that method is callable only
4105 # during initialization. Only in this derived class, is there an operation
4106 # that combines two tables. A Range_Map can thus be used to initialize a
4107 # Range_List, and its mappings will be in the list, but are not significant to
4110 sub trace { return main::trace(@_); }
4116 '+' => sub { my $self = shift;
4119 return $self->_union($other)
4121 '&' => sub { my $self = shift;
4124 return $self->_intersect($other, 0);
4131 # Returns a new Range_List that gives all code points not in $self.
4135 my $new = Range_List->new;
4137 # Go through each range in the table, finding the gaps between them
4138 my $max = -1; # Set so no gap before range beginning at 0
4139 for my $range ($self->ranges) {
4140 my $start = $range->start;
4141 my $end = $range->end;
4143 # If there is a gap before this range, the inverse will contain
4145 if ($start > $max + 1) {
4146 $new->add_range($max + 1, $start - 1);
4151 # And finally, add the gap from the end of the table to the max
4152 # possible code point
4153 if ($max < $MAX_UNICODE_CODEPOINT) {
4154 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4160 # Returns a new Range_List with the argument deleted from it. The
4161 # argument can be a single code point, a range, or something that has
4162 # a range, with the _range_list() method on it returning them
4166 my $reversed = shift;
4167 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4170 Carp::my_carp_bug("Can't cope with a "
4172 . " being the second parameter in a '-'. Subtraction ignored.");
4176 my $new = Range_List->new(Initialize => $self);
4178 if (! ref $other) { # Single code point
4179 $new->delete_range($other, $other);
4181 elsif ($other->isa('Range')) {
4182 $new->delete_range($other->start, $other->end);
4184 elsif ($other->can('_range_list')) {
4185 foreach my $range ($other->_range_list->ranges) {
4186 $new->delete_range($range->start, $range->end);
4190 Carp::my_carp_bug("Can't cope with a "
4192 . " argument to '-'. Subtraction ignored."
4201 # Returns either a boolean giving whether the two inputs' range lists
4202 # intersect (overlap), or a new Range_List containing the intersection
4203 # of the two lists. The optional final parameter being true indicates
4204 # to do the check instead of the intersection.
4206 my $a_object = shift;
4207 my $b_object = shift;
4208 my $check_if_overlapping = shift;
4209 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4210 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4212 if (! defined $b_object) {
4214 $message .= $a_object->_owner_name_of if defined $a_object;
4215 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4219 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4220 # Thus the intersection could be much more simply be written:
4221 # return ~(~$a_object + ~$b_object);
4222 # But, this is slower, and when taking the inverse of a large
4223 # range_size_1 table, back when such tables were always stored that
4224 # way, it became prohibitively slow, hence the code was changed to the
4227 if ($b_object->isa('Range')) {
4228 $b_object = Range_List->new(Initialize => $b_object,
4229 Owner => $a_object->_owner_name_of);
4231 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4233 my @a_ranges = $a_object->ranges;
4234 my @b_ranges = $b_object->ranges;
4236 #local $to_trace = 1 if main::DEBUG;
4237 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4239 # Start with the first range in each list
4241 my $range_a = $a_ranges[$a_i];
4243 my $range_b = $b_ranges[$b_i];
4245 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4246 if ! $check_if_overlapping;
4248 # If either list is empty, there is no intersection and no overlap
4249 if (! defined $range_a || ! defined $range_b) {
4250 return $check_if_overlapping ? 0 : $new;
4252 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4254 # Otherwise, must calculate the intersection/overlap. Start with the
4255 # very first code point in each list
4256 my $a = $range_a->start;
4257 my $b = $range_b->start;
4259 # Loop through all the ranges of each list; in each iteration, $a and
4260 # $b are the current code points in their respective lists
4263 # If $a and $b are the same code point, ...
4266 # it means the lists overlap. If just checking for overlap
4267 # know the answer now,
4268 return 1 if $check_if_overlapping;
4270 # The intersection includes this code point plus anything else
4271 # common to both current ranges.
4273 my $end = main::min($range_a->end, $range_b->end);
4274 if (! $check_if_overlapping) {
4275 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4276 $new->add_range($start, $end);
4279 # Skip ahead to the end of the current intersect
4282 # If the current intersect ends at the end of either range (as
4283 # it must for at least one of them), the next possible one
4284 # will be the beginning code point in it's list's next range.
4285 if ($a == $range_a->end) {
4286 $range_a = $a_ranges[++$a_i];
4287 last unless defined $range_a;
4288 $a = $range_a->start;
4290 if ($b == $range_b->end) {
4291 $range_b = $b_ranges[++$b_i];
4292 last unless defined $range_b;
4293 $b = $range_b->start;
4296 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4300 # Not equal, but if the range containing $a encompasses $b,
4301 # change $a to be the middle of the range where it does equal
4302 # $b, so the next iteration will get the intersection
4303 if ($range_a->end >= $b) {
4308 # Here, the current range containing $a is entirely below
4309 # $b. Go try to find a range that could contain $b.
4310 $a_i = $a_object->_search_ranges($b);
4312 # If no range found, quit.
4313 last unless defined $a_i;
4315 # The search returns $a_i, such that
4316 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4317 # Set $a to the beginning of this new range, and repeat.
4318 $range_a = $a_ranges[$a_i];
4319 $a = $range_a->start;
4322 else { # Here, $b < $a.
4324 # Mirror image code to the leg just above
4325 if ($range_b->end >= $a) {
4329 $b_i = $b_object->_search_ranges($a);
4330 last unless defined $b_i;
4331 $range_b = $b_ranges[$b_i];
4332 $b = $range_b->start;
4335 } # End of looping through ranges.
4337 # Intersection fully computed, or now know that there is no overlap
4338 return $check_if_overlapping ? 0 : $new;
4342 # Returns boolean giving whether the two arguments overlap somewhere
4346 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4348 return $self->_intersect($other, 1);
4352 # Add a range to the list.
4357 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4359 return $self->_add_delete('+', $start, $end, "");
4362 sub matches_identically_to {
4363 # Return a boolean as to whether or not two Range_Lists match identical
4364 # sets of code points.
4368 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4370 # These are ordered in increasing real time to figure out (at least
4371 # until a patch changes that and doesn't change this)
4372 return 0 if $self->max != $other->max;
4373 return 0 if $self->min != $other->min;
4374 return 0 if $self->range_count != $other->range_count;
4375 return 0 if $self->count != $other->count;
4377 # Here they could be identical because all the tests above passed.
4378 # The loop below is somewhat simpler since we know they have the same
4379 # number of elements. Compare range by range, until reach the end or
4380 # find something that differs.
4381 my @a_ranges = $self->ranges;
4382 my @b_ranges = $other->ranges;
4383 for my $i (0 .. @a_ranges - 1) {
4384 my $a = $a_ranges[$i];
4385 my $b = $b_ranges[$i];
4386 trace "self $a; other $b" if main::DEBUG && $to_trace;
4387 return 0 if ! defined $b
4388 || $a->start != $b->start
4389 || $a->end != $b->end;
4394 sub is_code_point_usable {
4395 # This used only for making the test script. See if the input
4396 # proposed trial code point is one that Perl will handle. If second
4397 # parameter is 0, it won't select some code points for various
4398 # reasons, noted below.
4401 my $try_hard = shift;
4402 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4404 return 0 if $code < 0; # Never use a negative
4406 # shun null. I'm (khw) not sure why this was done, but NULL would be
4407 # the character very frequently used.
4408 return $try_hard if $code == 0x0000;
4410 # shun non-character code points.
4411 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4412 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4414 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
4415 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4420 sub get_valid_code_point {
4421 # Return a code point that's part of the range list. Returns nothing
4422 # if the table is empty or we can't find a suitable code point. This
4423 # used only for making the test script.
4426 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4428 my $addr = do { no overloading; pack 'J', $self; };
4430 # On first pass, don't choose less desirable code points; if no good
4431 # one is found, repeat, allowing a less desirable one to be selected.
4432 for my $try_hard (0, 1) {
4434 # Look through all the ranges for a usable code point.
4435 for my $set ($self->ranges) {
4437 # Try the edge cases first, starting with the end point of the
4439 my $end = $set->end;
4440 return $end if is_code_point_usable($end, $try_hard);
4442 # End point didn't, work. Start at the beginning and try
4443 # every one until find one that does work.
4444 for my $trial ($set->start .. $end - 1) {
4445 return $trial if is_code_point_usable($trial, $try_hard);
4449 return (); # If none found, give up.
4452 sub get_invalid_code_point {
4453 # Return a code point that's not part of the table. Returns nothing
4454 # if the table covers all code points or a suitable code point can't
4455 # be found. This used only for making the test script.
4458 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4460 # Just find a valid code point of the inverse, if any.
4461 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4463 } # end closure for Range_List
4466 use base '_Range_List_Base';
4468 # A Range_Map is a range list in which the range values (called maps) are
4469 # significant, and hence shouldn't be manipulated by our other code, which
4470 # could be ambiguous or lose things. For example, in taking the union of two
4471 # lists, which share code points, but which have differing values, which one
4472 # has precedence in the union?
4473 # It turns out that these operations aren't really necessary for map tables,
4474 # and so this class was created to make sure they aren't accidentally
4480 # Add a range containing a mapping value to the list
4483 # Rest of parameters passed on
4485 return $self->_add_delete('+', @_);
4489 # Adds entry to a range list which can duplicate an existing entry
4492 my $code_point = shift;
4495 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4496 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4498 return $self->add_map($code_point, $code_point,
4499 $value, Replace => $replace);
4501 } # End of closure for package Range_Map
4503 package _Base_Table;
4505 # A table is the basic data structure that gets written out into a file for
4506 # use by the Perl core. This is the abstract base class implementing the
4507 # common elements from the derived ones. A list of the methods to be
4508 # furnished by an implementing class is just after the constructor.
4510 sub standardize { return main::standardize($_[0]); }
4511 sub trace { return main::trace(@_); }
4515 main::setup_package();
4518 # Object containing the ranges of the table.
4519 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4522 # The full table name.
4523 main::set_access('full_name', \%full_name, 'r');
4526 # The table name, almost always shorter
4527 main::set_access('name', \%name, 'r');
4530 # The shortest of all the aliases for this table, with underscores removed
4531 main::set_access('short_name', \%short_name);
4533 my %nominal_short_name_length;
4534 # The length of short_name before removing underscores
4535 main::set_access('nominal_short_name_length',
4536 \%nominal_short_name_length);
4539 # The complete name, including property.
4540 main::set_access('complete_name', \%complete_name, 'r');
4543 # Parent property this table is attached to.
4544 main::set_access('property', \%property, 'r');
4547 # Ordered list of alias objects of the table's name. The first ones in
4548 # the list are output first in comments
4549 main::set_access('aliases', \%aliases, 'readable_array');
4552 # A comment associated with the table for human readers of the files
4553 main::set_access('comment', \%comment, 's');
4556 # A comment giving a short description of the table's meaning for human
4557 # readers of the files.
4558 main::set_access('description', \%description, 'readable_array');
4561 # A comment giving a short note about the table for human readers of the
4563 main::set_access('note', \%note, 'readable_array');
4566 # Enum; there are a number of possibilities for what happens to this
4567 # table: it could be normal, or suppressed, or not for external use. See
4568 # values at definition for $SUPPRESSED.
4569 main::set_access('fate', \%fate, 'r');
4571 my %find_table_from_alias;
4572 # The parent property passes this pointer to a hash which this class adds
4573 # all its aliases to, so that the parent can quickly take an alias and
4575 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4578 # After this table is made equivalent to another one; we shouldn't go
4579 # changing the contents because that could mean it's no longer equivalent
4580 main::set_access('locked', \%locked, 'r');
4583 # This gives the final path to the file containing the table. Each
4584 # directory in the path is an element in the array
4585 main::set_access('file_path', \%file_path, 'readable_array');
4588 # What is the table's status, normal, $OBSOLETE, etc. Enum
4589 main::set_access('status', \%status, 'r');
4592 # A comment about its being obsolete, or whatever non normal status it has
4593 main::set_access('status_info', \%status_info, 'r');
4595 my %caseless_equivalent;
4596 # The table this is equivalent to under /i matching, if any.
4597 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4600 # Is the table to be output with each range only a single code point?
4601 # This is done to avoid breaking existing code that may have come to rely
4602 # on this behavior in previous versions of this program.)
4603 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4606 # A boolean set iff this table is a Perl extension to the Unicode
4608 main::set_access('perl_extension', \%perl_extension, 'r');
4610 my %output_range_counts;
4611 # A boolean set iff this table is to have comments written in the
4612 # output file that contain the number of code points in the range.
4613 # The constructor can override the global flag of the same name.
4614 main::set_access('output_range_counts', \%output_range_counts, 'r');
4617 # The format of the entries of the table. This is calculated from the
4618 # data in the table (or passed in the constructor). This is an enum e.g.,
4619 # $STRING_FORMAT. It is marked protected as it should not be generally
4620 # used to override calculations.
4621 main::set_access('format', \%format, 'r', 'p_s');
4624 # All arguments are key => value pairs, which you can see below, most
4625 # of which match fields documented above. Otherwise: Re_Pod_Entry,
4626 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4627 # documented in the Alias package
4629 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4633 my $self = bless \do { my $anonymous_scalar }, $class;
4634 my $addr = do { no overloading; pack 'J', $self; };
4638 $name{$addr} = delete $args{'Name'};
4639 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4640 $full_name{$addr} = delete $args{'Full_Name'};
4641 my $complete_name = $complete_name{$addr}
4642 = delete $args{'Complete_Name'};
4643 $format{$addr} = delete $args{'Format'};
4644 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4645 $property{$addr} = delete $args{'_Property'};
4646 $range_list{$addr} = delete $args{'_Range_List'};
4647 $status{$addr} = delete $args{'Status'} || $NORMAL;
4648 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4649 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4650 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4651 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4652 my $ucd = delete $args{'UCD'};
4654 my $description = delete $args{'Description'};
4655 my $ok_as_filename = delete $args{'OK_as_Filename'};
4656 my $loose_match = delete $args{'Fuzzy'};
4657 my $note = delete $args{'Note'};
4658 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4659 my $perl_extension = delete $args{'Perl_Extension'};
4661 # Shouldn't have any left over
4662 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4664 # Can't use || above because conceivably the name could be 0, and
4665 # can't use // operator in case this program gets used in Perl 5.8
4666 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4667 $output_range_counts{$addr} = $output_range_counts if
4668 ! defined $output_range_counts{$addr};
4670 $aliases{$addr} = [ ];
4671 $comment{$addr} = [ ];
4672 $description{$addr} = [ ];
4674 $file_path{$addr} = [ ];
4675 $locked{$addr} = "";
4677 push @{$description{$addr}}, $description if $description;
4678 push @{$note{$addr}}, $note if $note;
4680 if ($fate{$addr} == $PLACEHOLDER) {
4682 # A placeholder table doesn't get documented, is a perl extension,
4683 # and quite likely will be empty
4684 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4685 $perl_extension = 1 if ! defined $perl_extension;
4686 $ucd = 0 if ! defined $ucd;
4687 push @tables_that_may_be_empty, $complete_name{$addr};
4688 $self->add_comment(<<END);
4689 This is a placeholder because it is not in Version $string_version of Unicode,
4690 but is needed by the Perl core to work gracefully. Because it is not in this
4691 version of Unicode, it will not be listed in $pod_file.pod
4694 elsif (exists $why_suppressed{$complete_name}
4695 # Don't suppress if overridden
4696 && ! grep { $_ eq $complete_name{$addr} }
4697 @output_mapped_properties)
4699 $fate{$addr} = $SUPPRESSED;
4701 elsif ($fate{$addr} == $SUPPRESSED
4702 && ! exists $why_suppressed{$property{$addr}->complete_name})
4704 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4705 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4708 # If hasn't set its status already, see if it is on one of the
4709 # lists of properties or tables that have particular statuses; if
4710 # not, is normal. The lists are prioritized so the most serious
4711 # ones are checked first
4712 if (! $status{$addr}) {
4713 if (exists $why_deprecated{$complete_name}) {
4714 $status{$addr} = $DEPRECATED;
4716 elsif (exists $why_stabilized{$complete_name}) {
4717 $status{$addr} = $STABILIZED;
4719 elsif (exists $why_obsolete{$complete_name}) {
4720 $status{$addr} = $OBSOLETE;
4723 # Existence above doesn't necessarily mean there is a message
4724 # associated with it. Use the most serious message.
4725 if ($status{$addr}) {
4726 if ($why_deprecated{$complete_name}) {
4728 = $why_deprecated{$complete_name};
4730 elsif ($why_stabilized{$complete_name}) {
4732 = $why_stabilized{$complete_name};
4734 elsif ($why_obsolete{$complete_name}) {
4736 = $why_obsolete{$complete_name};
4741 $perl_extension{$addr} = $perl_extension || 0;
4743 # Don't list a property by default that is internal only
4744 if ($fate{$addr} > $MAP_PROXIED) {
4745 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4746 $ucd = 0 if ! defined $ucd;
4749 $ucd = 1 if ! defined $ucd;
4752 # By convention what typically gets printed only or first is what's
4753 # first in the list, so put the full name there for good output
4754 # clarity. Other routines rely on the full name being first on the
4756 $self->add_alias($full_name{$addr},
4757 OK_as_Filename => $ok_as_filename,
4758 Fuzzy => $loose_match,
4759 Re_Pod_Entry => $make_re_pod_entry,
4760 Status => $status{$addr},
4764 # Then comes the other name, if meaningfully different.
4765 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4766 $self->add_alias($name{$addr},
4767 OK_as_Filename => $ok_as_filename,
4768 Fuzzy => $loose_match,
4769 Re_Pod_Entry => $make_re_pod_entry,
4770 Status => $status{$addr},
4778 # Here are the methods that are required to be defined by any derived
4781 handle_special_range
4785 # write() knows how to write out normal ranges, but it calls
4786 # handle_special_range() when it encounters a non-normal one.
4787 # append_to_body() is called by it after it has handled all
4788 # ranges to add anything after the main portion of the table.
4789 # And finally, pre_body() is called after all this to build up
4790 # anything that should appear before the main portion of the
4791 # table. Doing it this way allows things in the middle to
4792 # affect what should appear before the main portion of the
4797 Carp::my_carp_bug( __LINE__
4798 . ": Must create method '$sub()' for "
4806 "." => \&main::_operator_dot,
4807 '!=' => \&main::_operator_not_equal,
4808 '==' => \&main::_operator_equal,
4812 # Returns the array of ranges associated with this table.
4815 return $range_list{pack 'J', shift}->ranges;
4819 # Add a synonym for this table.
4821 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4824 my $name = shift; # The name to add.
4825 my $pointer = shift; # What the alias hash should point to. For
4826 # map tables, this is the parent property;
4827 # for match tables, it is the table itself.
4830 my $loose_match = delete $args{'Fuzzy'};
4832 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4833 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
4835 my $ok_as_filename = delete $args{'OK_as_Filename'};
4836 $ok_as_filename = 1 unless defined $ok_as_filename;
4838 my $status = delete $args{'Status'};
4839 $status = $NORMAL unless defined $status;
4841 my $ucd = delete $args{'UCD'} // 1;
4843 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4845 # Capitalize the first letter of the alias unless it is one of the CJK
4846 # ones which specifically begins with a lower 'k'. Do this because
4847 # Unicode has varied whether they capitalize first letters or not, and
4848 # have later changed their minds and capitalized them, but not the
4849 # other way around. So do it always and avoid changes from release to
4851 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4853 my $addr = do { no overloading; pack 'J', $self; };
4855 # Figure out if should be loosely matched if not already specified.
4856 if (! defined $loose_match) {
4858 # Is a loose_match if isn't null, and doesn't begin with an
4859 # underscore and isn't just a number
4861 && substr($name, 0, 1) ne '_'
4862 && $name !~ qr{^[0-9_.+-/]+$})
4871 # If this alias has already been defined, do nothing.
4872 return if defined $find_table_from_alias{$addr}->{$name};
4874 # That includes if it is standardly equivalent to an existing alias,
4875 # in which case, add this name to the list, so won't have to search
4877 my $standard_name = main::standardize($name);
4878 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4879 $find_table_from_alias{$addr}->{$name}
4880 = $find_table_from_alias{$addr}->{$standard_name};
4884 # Set the index hash for this alias for future quick reference.
4885 $find_table_from_alias{$addr}->{$name} = $pointer;
4886 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4887 local $to_trace = 0 if main::DEBUG;
4888 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4889 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4892 # Put the new alias at the end of the list of aliases unless the final
4893 # element begins with an underscore (meaning it is for internal perl
4894 # use) or is all numeric, in which case, put the new one before that
4895 # one. This floats any all-numeric or underscore-beginning aliases to
4896 # the end. This is done so that they are listed last in output lists,
4897 # to encourage the user to use a better name (either more descriptive
4898 # or not an internal-only one) instead. This ordering is relied on
4899 # implicitly elsewhere in this program, like in short_name()
4900 my $list = $aliases{$addr};
4901 my $insert_position = (@$list == 0
4902 || (substr($list->[-1]->name, 0, 1) ne '_'
4903 && $list->[-1]->name =~ /\D/))
4909 Alias->new($name, $loose_match, $make_re_pod_entry,
4910 $ok_as_filename, $status, $ucd);
4912 # This name may be shorter than any existing ones, so clear the cache
4913 # of the shortest, so will have to be recalculated.
4915 undef $short_name{pack 'J', $self};
4920 # Returns a name suitable for use as the base part of a file name.
4921 # That is, shorter wins. It can return undef if there is no suitable
4922 # name. The name has all non-essential underscores removed.
4924 # The optional second parameter is a reference to a scalar in which
4925 # this routine will store the length the returned name had before the
4926 # underscores were removed, or undef if the return is undef.
4928 # The shortest name can change if new aliases are added. So using
4929 # this should be deferred until after all these are added. The code
4930 # that does that should clear this one's cache.
4931 # Any name with alphabetics is preferred over an all numeric one, even
4935 my $nominal_length_ptr = shift;
4936 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4938 my $addr = do { no overloading; pack 'J', $self; };
4940 # For efficiency, don't recalculate, but this means that adding new
4941 # aliases could change what the shortest is, so the code that does
4942 # that needs to undef this.
4943 if (defined $short_name{$addr}) {
4944 if ($nominal_length_ptr) {
4945 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4947 return $short_name{$addr};
4950 # Look at each alias
4951 foreach my $alias ($self->aliases()) {
4953 # Don't use an alias that isn't ok to use for an external name.
4954 next if ! $alias->ok_as_filename;
4956 my $name = main::Standardize($alias->name);
4957 trace $self, $name if main::DEBUG && $to_trace;
4959 # Take the first one, or a shorter one that isn't numeric. This
4960 # relies on numeric aliases always being last in the array
4961 # returned by aliases(). Any alpha one will have precedence.
4962 if (! defined $short_name{$addr}
4964 && length($name) < length($short_name{$addr})))
4966 # Remove interior underscores.
4967 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4969 $nominal_short_name_length{$addr} = length $name;
4973 # If the short name isn't a nice one, perhaps an equivalent table has
4975 if (! defined $short_name{$addr}
4976 || $short_name{$addr} eq ""
4977 || $short_name{$addr} eq "_")
4980 foreach my $follower ($self->children) { # All equivalents
4981 my $follower_name = $follower->short_name;
4982 next unless defined $follower_name;
4984 # Anything (except undefined) is better than underscore or
4986 if (! defined $return || $return eq "_") {
4987 $return = $follower_name;
4991 # If the new follower name isn't "_" and is shorter than the
4992 # current best one, prefer the new one.
4993 next if $follower_name eq "_";
4994 next if length $follower_name > length $return;
4995 $return = $follower_name;
4997 $short_name{$addr} = $return if defined $return;
5000 # If no suitable external name return undef
5001 if (! defined $short_name{$addr}) {
5002 $$nominal_length_ptr = undef if $nominal_length_ptr;
5006 # Don't allow a null short name.
5007 if ($short_name{$addr} eq "") {
5008 $short_name{$addr} = '_';
5009 $nominal_short_name_length{$addr} = 1;
5012 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5014 if ($nominal_length_ptr) {
5015 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5017 return $short_name{$addr};
5021 # Returns the external name that this table should be known by. This
5022 # is usually the short_name, but not if the short_name is undefined,
5023 # in which case the external_name is arbitrarily set to the
5027 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5029 my $short = $self->short_name;
5030 return $short if defined $short;
5035 sub add_description { # Adds the parameter as a short description.
5038 my $description = shift;
5040 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5043 push @{$description{pack 'J', $self}}, $description;
5048 sub add_note { # Adds the parameter as a short note.
5053 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5056 push @{$note{pack 'J', $self}}, $note;
5061 sub add_comment { # Adds the parameter as a comment.
5063 return unless $debugging_build;
5066 my $comment = shift;
5067 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5072 push @{$comment{pack 'J', $self}}, $comment;
5078 # Return the current comment for this table. If called in list
5079 # context, returns the array of comments. In scalar, returns a string
5080 # of each element joined together with a period ending each.
5083 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5085 my $addr = do { no overloading; pack 'J', $self; };
5086 my @list = @{$comment{$addr}};
5087 return @list if wantarray;
5089 foreach my $sentence (@list) {
5090 $return .= '. ' if $return;
5091 $return .= $sentence;
5094 $return .= '.' if $return;
5099 # Initialize the table with the argument which is any valid
5100 # initialization for range lists.
5103 my $addr = do { no overloading; pack 'J', $self; };
5104 my $initialization = shift;
5105 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5107 # Replace the current range list with a new one of the same exact
5109 my $class = ref $range_list{$addr};
5110 $range_list{$addr} = $class->new(Owner => $self,
5111 Initialize => $initialization);
5117 # The header that is output for the table in the file it is written
5121 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5124 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5130 # Write a representation of the table to its file. It calls several
5131 # functions furnished by sub-classes of this abstract base class to
5132 # handle non-normal ranges, to add stuff before the table, and at its
5136 my $tab_stops = shift; # The number of tab stops over to put any
5138 my $suppress_value = shift; # Optional, if the value associated with
5139 # a range equals this one, don't write
5141 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5143 my $addr = do { no overloading; pack 'J', $self; };
5145 # Start with the header
5146 my @HEADER = $self->header;
5149 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5152 # Things discovered processing the main body of the document may
5153 # affect what gets output before it, therefore pre_body() isn't called
5154 # until after all other processing of the table is done.
5156 # The main body looks like a 'here' document. If annotating, get rid
5157 # of the comments before passing to the caller, as some callers, such
5158 # as charnames.pm, can't cope with them. (Outputting range counts
5159 # also introduces comments, but these don't show up in the tables that
5160 # can't cope with comments, and there aren't that many of them that
5161 # it's worth the extra real time to get rid of them).
5164 # Use the line below in Perls that don't have /r
5165 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5166 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5168 push @OUT, "return <<'END';\n";
5171 if ($range_list{$addr}->is_empty) {
5173 # This is a kludge for empty tables to silence a warning in
5174 # utf8.c, which can't really deal with empty tables, but it can
5175 # deal with a table that matches nothing, as the inverse of 'Any'
5177 push @OUT, "!utf8::Any\n";
5179 elsif ($self->name eq 'N'
5181 # To save disk space and table cache space, avoid putting out
5182 # binary N tables, but instead create a file which just inverts
5183 # the Y table. Since the file will still exist and occupy a
5184 # certain number of blocks, might as well output the whole
5185 # thing if it all will fit in one block. The number of
5186 # ranges below is an approximate number for that.
5187 && ($self->property->type == $BINARY
5188 || $self->property->type == $FORCED_BINARY)
5189 # && $self->property->tables == 2 Can't do this because the
5190 # non-binary properties, like NFDQC aren't specifiable
5192 && $range_list{$addr}->ranges > 15
5193 && ! $annotate) # Under --annotate, want to see everything
5195 push @OUT, "!utf8::" . $self->property->name . "\n";
5198 my $range_size_1 = $range_size_1{$addr};
5199 my $format; # Used only in $annotate option
5200 my $include_name; # Used only in $annotate option
5204 # if annotating each code point, must print 1 per line.
5205 # The variable could point to a subroutine, and we don't want
5206 # to lose that fact, so only set if not set already
5207 $range_size_1 = 1 if ! $range_size_1;
5209 $format = $self->format;
5211 # The name of the character is output only for tables that
5212 # don't already include the name in the output.
5213 my $property = $self->property;
5215 ! ($property == $perl_charname
5216 || $property == main::property_ref('Unicode_1_Name')
5217 || $property == main::property_ref('Name')
5218 || $property == main::property_ref('Name_Alias')
5222 # Output each range as part of the here document.
5224 for my $set ($range_list{$addr}->ranges) {
5225 if ($set->type != 0) {
5226 $self->handle_special_range($set);
5229 my $start = $set->start;
5230 my $end = $set->end;
5231 my $value = $set->value;
5233 # Don't output ranges whose value is the one to suppress
5234 next RANGE if defined $suppress_value
5235 && $value eq $suppress_value;
5237 # If there is a range and doesn't need a single point range
5239 if ($start != $end && ! $range_size_1) {
5240 push @OUT, sprintf "%04X\t%04X", $start, $end;
5241 $OUT[-1] .= "\t$value" if $value ne "";
5243 # Add a comment with the size of the range, if requested.
5244 # Expand Tabs to make sure they all start in the same
5245 # column, and then unexpand to use mostly tabs.
5246 if (! $output_range_counts{$addr}) {
5250 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5251 my $count = main::clarify_number($end - $start + 1);
5254 my $width = $tab_stops * 8 - 1;
5255 $OUT[-1] = sprintf("%-*s # [%s]\n",
5259 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5264 # Here to output a single code point per line
5266 # If not to annotate, use the simple formats
5269 # Use any passed in subroutine to output.
5270 if (ref $range_size_1 eq 'CODE') {
5271 for my $i ($start .. $end) {
5272 push @OUT, &{$range_size_1}($i, $value);
5277 # Here, caller is ok with default output.
5278 for (my $i = $start; $i <= $end; $i++) {
5279 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5285 # Here, wants annotation.
5286 for (my $i = $start; $i <= $end; $i++) {
5288 # Get character information if don't have it already
5289 main::populate_char_info($i)
5290 if ! defined $viacode[$i];
5291 my $type = $annotate_char_type[$i];
5293 # Figure out if should output the next code points as part
5294 # of a range or not. If this is not in an annotation
5295 # range, then won't output as a range, so returns $i.
5296 # Otherwise use the end of the annotation range, but no
5297 # further than the maximum possible end point of the loop.
5298 my $range_end = main::min($annotate_ranges->value_of($i)
5302 # Use a range if it is a range, and either is one of the
5303 # special annotation ranges, or the range is at most 3
5304 # long. This last case causes the algorithmically named
5305 # code points to be output individually in spans of at
5306 # most 3, as they are the ones whose $type is > 0.
5307 if ($range_end != $i
5308 && ( $type < 0 || $range_end - $i > 2))
5310 # Here is to output a range. We don't allow a
5311 # caller-specified output format--just use the
5313 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5316 my $range_name = $viacode[$i];
5318 # For the code points which end in their hex value, we
5319 # eliminate that from the output annotation, and
5320 # capitalize only the first letter of each word.
5321 if ($type == $CP_IN_NAME) {
5322 my $hex = sprintf "%04X", $i;
5323 $range_name =~ s/-$hex$//;
5324 my @words = split " ", $range_name;
5325 for my $word (@words) {
5326 $word = ucfirst(lc($word)) if $word ne 'CJK';
5328 $range_name = join " ", @words;
5330 elsif ($type == $HANGUL_SYLLABLE) {
5331 $range_name = "Hangul Syllable";
5334 $OUT[-1] .= " $range_name" if $range_name;
5336 # Include the number of code points in the range
5337 my $count = main::clarify_number($range_end - $i + 1);
5338 $OUT[-1] .= " [$count]\n";
5340 # Skip to the end of the range
5343 else { # Not in a range.
5346 # When outputting the names of each character, use
5347 # the character itself if printable
5348 $comment .= "'" . chr($i) . "' " if $printable[$i];
5350 # To make it more readable, use a minimum indentation
5353 # Determine the annotation
5354 if ($format eq $DECOMP_STRING_FORMAT) {
5356 # This is very specialized, with the type of
5357 # decomposition beginning the line enclosed in
5358 # <...>, and the code points that the code point
5359 # decomposes to separated by blanks. Create two
5360 # strings, one of the printable characters, and
5361 # one of their official names.
5362 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5366 foreach my $to (split " ", $map) {
5367 $to = CORE::hex $to;
5368 $to_name .= " + " if $to_name;
5369 $to_chr .= chr($to);
5370 main::populate_char_info($to)
5371 if ! defined $viacode[$to];
5372 $to_name .= $viacode[$to];
5376 "=> '$to_chr'; $viacode[$i] => $to_name";
5377 $comment_indent = 25; # Determined by experiment
5381 # Assume that any table that has hex format is a
5382 # mapping of one code point to another.
5383 if ($format eq $HEX_FORMAT) {
5384 my $decimal_value = CORE::hex $value;
5385 main::populate_char_info($decimal_value)
5386 if ! defined $viacode[$decimal_value];
5388 . chr($decimal_value)
5389 . "'; " if $printable[$decimal_value];
5391 $comment .= $viacode[$i] if $include_name
5393 if ($format eq $HEX_FORMAT) {
5394 my $decimal_value = CORE::hex $value;
5395 $comment .= " => $viacode[$decimal_value]"
5396 if $viacode[$decimal_value];
5399 # If including the name, no need to indent, as the
5400 # name will already be way across the line.
5401 $comment_indent = ($include_name) ? 0 : 60;
5404 # Use any passed in routine to output the base part of
5406 if (ref $range_size_1 eq 'CODE') {
5407 my $base_part = &{$range_size_1}($i, $value);
5409 push @OUT, $base_part;
5412 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5415 # And add the annotation.
5416 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5418 $comment if $comment;
5422 } # End of loop through all the table's ranges
5425 # Add anything that goes after the main body, but within the here
5427 my $append_to_body = $self->append_to_body;
5428 push @OUT, $append_to_body if $append_to_body;
5430 # And finish the here document.
5433 # Done with the main portion of the body. Can now figure out what
5434 # should appear before it in the file.
5435 my $pre_body = $self->pre_body;
5436 push @HEADER, $pre_body, "\n" if $pre_body;
5438 # All these files should have a .pl suffix added to them.
5439 my @file_with_pl = @{$file_path{$addr}};
5440 $file_with_pl[-1] .= '.pl';
5442 main::write(\@file_with_pl,
5443 $annotate, # utf8 iff annotating
5449 sub set_status { # Set the table's status
5451 my $status = shift; # The status enum value
5452 my $info = shift; # Any message associated with it.
5453 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5455 my $addr = do { no overloading; pack 'J', $self; };
5457 $status{$addr} = $status;
5458 $status_info{$addr} = $info;
5462 sub set_fate { # Set the fate of a table
5466 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5468 my $addr = do { no overloading; pack 'J', $self; };
5470 return if $fate{$addr} == $fate; # If no-op
5472 # Can only change the ordinary fate, except if going to $MAP_PROXIED
5473 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5475 $fate{$addr} = $fate;
5477 # Don't document anything to do with a non-normal fated table
5478 if ($fate != $ORDINARY) {
5479 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5480 foreach my $alias ($self->aliases) {
5481 $alias->set_ucd($put_in_pod);
5483 # MAP_PROXIED doesn't affect the match tables
5484 next if $fate == $MAP_PROXIED;
5485 $alias->set_make_re_pod_entry($put_in_pod);
5489 # Save the reason for suppression for output
5490 if ($fate == $SUPPRESSED && defined $reason) {
5491 $why_suppressed{$complete_name{$addr}} = $reason;
5498 # Don't allow changes to the table from now on. This stores a stack
5499 # trace of where it was called, so that later attempts to modify it
5500 # can immediately show where it got locked.
5503 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5505 my $addr = do { no overloading; pack 'J', $self; };
5507 $locked{$addr} = "";
5509 my $line = (caller(0))[2];
5512 # Accumulate the stack trace
5514 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5516 last unless defined $caller;
5518 $locked{$addr} .= " called from $caller() at line $line\n";
5519 $line = $caller_line;
5521 $locked{$addr} .= " called from main at line $line\n";
5526 sub carp_if_locked {
5527 # Return whether a table is locked or not, and, by the way, complain
5531 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5533 my $addr = do { no overloading; pack 'J', $self; };
5535 return 0 if ! $locked{$addr};
5536 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5540 sub set_file_path { # Set the final directory path for this table
5542 # Rest of parameters passed on
5545 @{$file_path{pack 'J', $self}} = @_;
5549 # Accessors for the range list stored in this table. First for
5558 matches_identically_to
5571 return $self->_range_list->$sub(@_);
5575 # Then for ones that should fail if locked
5585 return if $self->carp_if_locked;
5587 return $self->_range_list->$sub(@_);
5594 use base '_Base_Table';
5596 # A Map Table is a table that contains the mappings from code points to
5597 # values. There are two weird cases:
5598 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5599 # are written in the table's file at the end of the table nonetheless. It
5600 # requires specially constructed code to handle these; utf8.c can not read
5601 # these in, so they should not go in $map_directory. As of this writing,
5602 # the only case that these happen is for named sequences used in
5603 # charnames.pm. But this code doesn't enforce any syntax on these, so
5604 # something else could come along that uses it.
5605 # 2) Specials are anything that doesn't fit syntactically into the body of the
5606 # table. The ranges for these have a map type of non-zero. The code below
5607 # knows about and handles each possible type. In most cases, these are
5608 # written as part of the header.
5610 # A map table deliberately can't be manipulated at will unlike match tables.
5611 # This is because of the ambiguities having to do with what to do with
5612 # overlapping code points. And there just isn't a need for those things;
5613 # what one wants to do is just query, add, replace, or delete mappings, plus
5614 # write the final result.
5615 # However, there is a method to get the list of possible ranges that aren't in
5616 # this table to use for defaulting missing code point mappings. And,
5617 # map_add_or_replace_non_nulls() does allow one to add another table to this
5618 # one, but it is clearly very specialized, and defined that the other's
5619 # non-null values replace this one's if there is any overlap.
5621 sub trace { return main::trace(@_); }
5625 main::setup_package();
5628 # Many input files omit some entries; this gives what the mapping for the
5629 # missing entries should be
5630 main::set_access('default_map', \%default_map, 'r');
5632 my %anomalous_entries;
5633 # Things that go in the body of the table which don't fit the normal
5634 # scheme of things, like having a range. Not much can be done with these
5635 # once there except to output them. This was created to handle named
5637 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5638 main::set_access('anomalous_entries', # Append singular, read plural
5639 \%anomalous_entries,
5643 # Enum as to whether or not to write out this map table:
5645 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5646 # it should not be removed nor its format changed. This
5647 # is done for those files that have traditionally been
5649 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5651 main::set_access('to_output_map', \%to_output_map, 's');
5660 # Optional initialization data for the table.
5661 my $initialize = delete $args{'Initialize'};
5663 my $default_map = delete $args{'Default_Map'};
5664 my $property = delete $args{'_Property'};
5665 my $full_name = delete $args{'Full_Name'};
5667 # Rest of parameters passed on
5669 my $range_list = Range_Map->new(Owner => $property);
5671 my $self = $class->SUPER::new(
5673 Complete_Name => $full_name,
5674 Full_Name => $full_name,
5675 _Property => $property,
5676 _Range_List => $range_list,
5679 my $addr = do { no overloading; pack 'J', $self; };
5681 $anomalous_entries{$addr} = [];
5682 $default_map{$addr} = $default_map;
5684 $self->initialize($initialize) if defined $initialize;
5691 qw("") => "_operator_stringify",
5694 sub _operator_stringify {
5697 my $name = $self->property->full_name;
5698 $name = '""' if $name eq "";
5699 return "Map table for Property '$name'";
5703 # Add a synonym for this table (which means the property itself)
5706 # Rest of parameters passed on.
5708 $self->SUPER::add_alias($name, $self->property, @_);
5713 # Add a range of code points to the list of specially-handled code
5714 # points. $MULTI_CP is assumed if the type of special is not passed
5723 my $type = delete $args{'Type'} || 0;
5724 # Rest of parameters passed on
5726 # Can't change the table if locked.
5727 return if $self->carp_if_locked;
5729 my $addr = do { no overloading; pack 'J', $self; };
5731 $self->_range_list->add_map($lower, $upper,
5738 sub append_to_body {
5739 # Adds to the written HERE document of the table's body any anomalous
5740 # entries in the table..
5743 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5745 my $addr = do { no overloading; pack 'J', $self; };
5747 return "" unless @{$anomalous_entries{$addr}};
5748 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5751 sub map_add_or_replace_non_nulls {
5752 # This adds the mappings in the table $other to $self. Non-null
5753 # mappings from $other override those in $self. It essentially merges
5754 # the two tables, with the second having priority except for null
5759 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5761 return if $self->carp_if_locked;
5763 if (! $other->isa(__PACKAGE__)) {
5764 Carp::my_carp_bug("$other should be a "
5772 my $addr = do { no overloading; pack 'J', $self; };
5773 my $other_addr = do { no overloading; pack 'J', $other; };
5775 local $to_trace = 0 if main::DEBUG;
5777 my $self_range_list = $self->_range_list;
5778 my $other_range_list = $other->_range_list;
5779 foreach my $range ($other_range_list->ranges) {
5780 my $value = $range->value;
5781 next if $value eq "";
5782 $self_range_list->_add_delete('+',
5786 Type => $range->type,
5787 Replace => $UNCONDITIONALLY);
5793 sub set_default_map {
5794 # Define what code points that are missing from the input files should
5799 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5801 my $addr = do { no overloading; pack 'J', $self; };
5803 # Convert the input to the standard equivalent, if any (won't have any
5804 # for $STRING properties)
5805 my $standard = $self->_find_table_from_alias->{$map};
5806 $map = $standard->name if defined $standard;
5808 # Warn if there already is a non-equivalent default map for this
5809 # property. Note that a default map can be a ref, which means that
5810 # what it actually means is delayed until later in the program, and it
5811 # IS permissible to override it here without a message.
5812 my $default_map = $default_map{$addr};
5813 if (defined $default_map
5814 && ! ref($default_map)
5815 && $default_map ne $map
5816 && main::Standardize($map) ne $default_map)
5818 my $property = $self->property;
5819 my $map_table = $property->table($map);
5820 my $default_table = $property->table($default_map);
5821 if (defined $map_table
5822 && defined $default_table
5823 && $map_table != $default_table)
5825 Carp::my_carp("Changing the default mapping for "
5827 . " from $default_map to $map'");
5831 $default_map{$addr} = $map;
5833 # Don't also create any missing table for this map at this point,
5834 # because if we did, it could get done before the main table add is
5835 # done for PropValueAliases.txt; instead the caller will have to make
5836 # sure it exists, if desired.
5841 # Returns boolean: should we write this map table?
5844 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5846 my $addr = do { no overloading; pack 'J', $self; };
5848 # If overridden, use that
5849 return $to_output_map{$addr} if defined $to_output_map{$addr};
5851 my $full_name = $self->full_name;
5852 return $global_to_output_map{$full_name}
5853 if defined $global_to_output_map{$full_name};
5855 # If table says to output, do so; if says to suppress it, do so.
5856 my $fate = $self->fate;
5857 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
5858 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
5859 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
5861 my $type = $self->property->type;
5863 # Don't want to output binary map tables even for debugging.
5864 return 0 if $type == $BINARY;
5866 # But do want to output string ones.
5867 return $EXTERNAL_MAP if $type == $STRING;
5869 # Otherwise is an $ENUM, do output it, for Perl's purposes
5870 return $INTERNAL_MAP;
5874 # Returns a Range_List that is gaps of the current table. That is,
5878 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5880 my $current = Range_List->new(Initialize => $self->_range_list,
5881 Owner => $self->property);
5887 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5889 my $return = $self->SUPER::header();
5891 if ($self->to_output_map == $INTERNAL_MAP) {
5892 $return .= $INTERNAL_ONLY_HEADER;
5895 my $property_name = $self->property->full_name;
5898 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
5900 # This file is for internal use by core Perl only. It is retained for
5901 # backwards compatibility with applications that may have come to rely on it,
5902 # but its format and even its name or existence are subject to change without
5903 # notice in a future Perl version. Don't use it directly. Instead, its
5904 # contents are now retrievable through a stable API in the Unicode::UCD
5905 # module: Unicode::UCD::prop_invmap('$property_name').
5911 sub set_final_comment {
5912 # Just before output, create the comment that heads the file
5913 # containing this table.
5915 return unless $debugging_build;
5918 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5920 # No sense generating a comment if aren't going to write it out.
5921 return if ! $self->to_output_map;
5923 my $addr = do { no overloading; pack 'J', $self; };
5925 my $property = $self->property;
5927 # Get all the possible names for this property. Don't use any that
5928 # aren't ok for use in a file name, etc. This is perhaps causing that
5929 # flag to do double duty, and may have to be changed in the future to
5930 # have our own flag for just this purpose; but it works now to exclude
5931 # Perl generated synonyms from the lists for properties, where the
5932 # name is always the proper Unicode one.
5933 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
5935 my $count = $self->count;
5936 my $default_map = $default_map{$addr};
5938 # The ranges that map to the default aren't output, so subtract that
5939 # to get those actually output. A property with matching tables
5940 # already has the information calculated.
5941 if ($property->type != $STRING) {
5942 $count -= $property->table($default_map)->count;
5944 elsif (defined $default_map) {
5946 # But for $STRING properties, must calculate now. Subtract the
5947 # count from each range that maps to the default.
5948 foreach my $range ($self->_range_list->ranges) {
5949 if ($range->value eq $default_map) {
5950 $count -= $range->end +1 - $range->start;
5956 # Get a string version of $count with underscores in large numbers,
5958 my $string_count = main::clarify_number($count);
5960 my $code_points = ($count == 1)
5961 ? 'single code point'
5962 : "$string_count code points";
5967 if (@property_aliases <= 1) {
5968 $mapping = 'mapping';
5969 $these_mappings = 'this mapping';
5973 $mapping = 'synonymous mappings';
5974 $these_mappings = 'these mappings';
5978 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5979 $cp = "any code point in Unicode Version $string_version";
5983 if ($default_map eq "") {
5984 $map_to = 'the null string';
5986 elsif ($default_map eq $CODE_POINT) {
5990 $map_to = "'$default_map'";
5993 $cp = "the single code point";
5996 $cp = "one of the $code_points";
5998 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6003 my $status = $self->status;
6005 my $warn = uc $status_past_participles{$status};
6008 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6009 All property or property=value combinations contained in this file are $warn.
6010 See $unicode_reference_url for what this means.
6014 $comment .= "This file returns the $mapping:\n";
6016 for my $i (0 .. @property_aliases - 1) {
6017 $comment .= sprintf("%-8s%s\n",
6019 $property_aliases[$i]->name . '(cp)'
6022 my $full_name = $self->property->full_name;
6023 $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6025 # And append any commentary already set from the actual property.
6026 $comment .= "\n\n" . $self->comment if $self->comment;
6027 if ($self->description) {
6028 $comment .= "\n\n" . join " ", $self->description;
6031 $comment .= "\n\n" . join " ", $self->note;
6035 if (! $self->perl_extension) {
6038 For information about what this property really means, see:
6039 $unicode_reference_url
6043 if ($count) { # Format differs for empty table
6044 $comment.= "\nThe format of the ";
6045 if ($self->range_size_1) {
6047 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6048 is in hex; MAPPING is what CODE_POINT maps to.
6053 # There are tables which end up only having one element per
6054 # range, but it is not worth keeping track of for making just
6055 # this comment a little better.
6057 non-comment portions of the main body of lines of this file is:
6058 START\\tSTOP\\tMAPPING where START is the starting code point of the
6059 range, in hex; STOP is the ending point, or if omitted, the range has just one
6060 code point; MAPPING is what each code point between START and STOP maps to.
6062 if ($self->output_range_counts) {
6064 Numbers in comments in [brackets] indicate how many code points are in the
6065 range (omitted when the range is a single code point or if the mapping is to
6071 $self->set_comment(main::join_lines($comment));
6075 my %swash_keys; # Makes sure don't duplicate swash names.
6077 # The remaining variables are temporaries used while writing each table,
6078 # to output special ranges.
6079 my @multi_code_point_maps; # Map is to more than one code point.
6081 sub handle_special_range {
6082 # Called in the middle of write when it finds a range it doesn't know
6087 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6089 my $addr = do { no overloading; pack 'J', $self; };
6091 my $type = $range->type;
6093 my $low = $range->start;
6094 my $high = $range->end;
6095 my $map = $range->value;
6097 # No need to output the range if it maps to the default.
6098 return if $map eq $default_map{$addr};
6100 my $property = $self->property;
6102 # Switch based on the map type...
6103 if ($type == $HANGUL_SYLLABLE) {
6105 # These are entirely algorithmically determinable based on
6106 # some constants furnished by Unicode; for now, just set a
6107 # flag to indicate that have them. After everything is figured
6108 # out, we will output the code that does the algorithm. (Don't
6109 # output them if not needed because we are suppressing this
6111 $has_hangul_syllables = 1 if $property->to_output_map;
6113 elsif ($type == $CP_IN_NAME) {
6115 # Code points whose name ends in their code point are also
6116 # algorithmically determinable, but need information about the map
6117 # to do so. Both the map and its inverse are stored in data
6118 # structures output in the file. They are stored in the mean time
6119 # in global lists The lists will be written out later into Name.pm,
6120 # which is created only if needed. In order to prevent duplicates
6121 # in the list, only add to them for one property, should multiple
6123 if ($needing_code_points_ending_in_code_point == 0) {
6124 $needing_code_points_ending_in_code_point = $property;
6126 if ($property == $needing_code_points_ending_in_code_point) {
6127 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6128 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6130 my $squeezed = $map =~ s/[-\s]+//gr;
6131 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6133 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6136 push @code_points_ending_in_code_point, { low => $low,
6142 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6144 # Multi-code point maps and null string maps have an entry
6145 # for each code point in the range. They use the same
6147 for my $code_point ($low .. $high) {
6149 # The pack() below can't cope with surrogates. XXX This may
6151 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6152 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
6156 # Generate the hash entries for these in the form that
6157 # utf8.c understands.
6161 foreach my $to (split " ", $map) {
6162 if ($to !~ /^$code_point_re$/) {
6163 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6166 $tostr .= sprintf "\\x{%s}", $to;
6167 $to = CORE::hex $to;
6169 $to_name .= " + " if $to_name;
6170 $to_chr .= chr($to);
6171 main::populate_char_info($to)
6172 if ! defined $viacode[$to];
6173 $to_name .= $viacode[$to];
6177 # I (khw) have never waded through this line to
6178 # understand it well enough to comment it.
6179 my $utf8 = sprintf(qq["%s" => "$tostr",],
6180 join("", map { sprintf "\\x%02X", $_ }
6181 unpack("U0C*", pack("U", $code_point))));
6183 # Add a comment so that a human reader can more easily
6184 # see what's going on.
6185 push @multi_code_point_maps,
6186 sprintf("%-45s # U+%04X", $utf8, $code_point);
6188 $multi_code_point_maps[-1] .= " => $map";
6191 main::populate_char_info($code_point)
6192 if ! defined $viacode[$code_point];
6193 $multi_code_point_maps[-1] .= " '"
6195 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6200 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6207 # Returns the string that should be output in the file before the main
6208 # body of this table. It isn't called until the main body is
6209 # calculated, saving a pass. The string includes some hash entries
6210 # identifying the format of the body, and what the single value should
6211 # be for all ranges missing from it. It also includes any code points
6212 # which have map_types that don't go in the main table.
6215 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6217 my $addr = do { no overloading; pack 'J', $self; };
6219 my $name = $self->property->swash_name;
6221 # Currently there is nothing in the pre_body unless a swash is being
6223 return unless defined $name;
6225 if (defined $swash_keys{$name}) {
6226 Carp::my_carp(join_lines(<<END
6227 Already created a swash name '$name' for $swash_keys{$name}. This means that
6228 the same name desired for $self shouldn't be used. Bad News. This must be
6229 fixed before production use, but proceeding anyway
6233 $swash_keys{$name} = "$self";
6237 # Here we assume we were called after have gone through the whole
6238 # file. If we actually generated anything for each map type, add its
6239 # respective header and trailer
6240 my $specials_name = "";
6241 if (@multi_code_point_maps) {
6242 $specials_name = "utf8::ToSpec$name";
6245 # Some code points require special handling because their mappings are each to
6246 # multiple code points. These do not appear in the main body, but are defined
6247 # in the hash below.
6249 # Each key is the string of N bytes that together make up the UTF-8 encoding
6250 # for the code point. (i.e. the same as looking at the code point's UTF-8
6251 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
6252 \%$specials_name = (
6254 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6257 my $format = $self->format;
6260 # The name this swash is to be known by, with the format of the mappings in
6261 # the main body of the table, and what all code points missing from this file
6263 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6265 if ($specials_name) {
6267 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6270 my $default_map = $default_map{$addr};
6271 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6273 if ($default_map eq $CODE_POINT) {
6274 $return .= ' # code point maps to itself';
6276 elsif ($default_map eq "") {
6277 $return .= ' # code point maps to the null string';
6281 $return .= $pre_body;
6287 # Write the table to the file.
6290 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6292 my $addr = do { no overloading; pack 'J', $self; };
6294 # Clear the temporaries
6295 undef @multi_code_point_maps;
6297 # Calculate the format of the table if not already done.
6298 my $format = $self->format;
6299 my $type = $self->property->type;
6300 my $default_map = $self->default_map;
6301 if (! defined $format) {
6302 if ($type == $BINARY) {
6304 # Don't bother checking the values, because we elsewhere
6305 # verify that a binary table has only 2 values.
6306 $format = $BINARY_FORMAT;
6309 my @ranges = $self->_range_list->ranges;
6311 # default an empty table based on its type and default map
6314 # But it turns out that the only one we can say is a
6315 # non-string (besides binary, handled above) is when the
6316 # table is a string and the default map is to a code point
6317 if ($type == $STRING && $default_map eq $CODE_POINT) {
6318 $format = $HEX_FORMAT;
6321 $format = $STRING_FORMAT;
6326 # Start with the most restrictive format, and as we find
6327 # something that doesn't fit with that, change to the next
6328 # most restrictive, and so on.
6329 $format = $DECIMAL_FORMAT;
6330 foreach my $range (@ranges) {
6331 next if $range->type != 0; # Non-normal ranges don't
6332 # affect the main body
6333 my $map = $range->value;
6334 if ($map ne $default_map) {
6335 last if $format eq $STRING_FORMAT; # already at
6338 $format = $INTEGER_FORMAT
6339 if $format eq $DECIMAL_FORMAT
6340 && $map !~ / ^ [0-9] $ /x;
6341 $format = $FLOAT_FORMAT
6342 if $format eq $INTEGER_FORMAT
6343 && $map !~ / ^ -? [0-9]+ $ /x;
6344 $format = $RATIONAL_FORMAT
6345 if $format eq $FLOAT_FORMAT
6346 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6347 $format = $HEX_FORMAT
6348 if $format eq $RATIONAL_FORMAT
6349 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6350 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6351 && $map =~ /[^0-9A-F]/;
6356 } # end of calculating format
6358 if ($default_map eq $CODE_POINT
6359 && $format ne $HEX_FORMAT
6360 && ! defined $self->format) # manual settings are always
6363 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6366 $self->_set_format($format);
6368 # Core Perl has a different definition of mapping ranges than we do,
6369 # that is applicable mainly to mapping code points, so for tables
6370 # where it is possible that core Perl could be used to read it,
6371 # make it range size 1 to prevent possible confusion
6372 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6374 return $self->SUPER::write(
6375 ($self->property == $block)
6376 ? 7 # block file needs more tab stops
6378 $default_map); # don't write defaulteds
6381 # Accessors for the underlying list that should fail if locked.
6391 return if $self->carp_if_locked;
6392 return $self->_range_list->$sub(@_);
6395 } # End closure for Map_Table
6397 package Match_Table;
6398 use base '_Base_Table';
6400 # A Match table is one which is a list of all the code points that have
6401 # the same property and property value, for use in \p{property=value}
6402 # constructs in regular expressions. It adds very little data to the base
6403 # structure, but many methods, as these lists can be combined in many ways to
6405 # There are only a few concepts added:
6406 # 1) Equivalents and Relatedness.
6407 # Two tables can match the identical code points, but have different names.
6408 # This always happens when there is a perl single form extension
6409 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6410 # tables are set to be related, with the Perl extension being a child, and
6411 # the Unicode property being the parent.
6413 # It may be that two tables match the identical code points and we don't
6414 # know if they are related or not. This happens most frequently when the
6415 # Block and Script properties have the exact range. But note that a
6416 # revision to Unicode could add new code points to the script, which would
6417 # now have to be in a different block (as the block was filled, or there
6418 # would have been 'Unknown' script code points in it and they wouldn't have
6419 # been identical). So we can't rely on any two properties from Unicode
6420 # always matching the same code points from release to release, and thus
6421 # these tables are considered coincidentally equivalent--not related. When
6422 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
6423 # 'leader', and the others are 'equivalents'. This concept is useful
6424 # to minimize the number of tables written out. Only one file is used for
6425 # any identical set of code points, with entries in Heavy.pl mapping all
6426 # the involved tables to it.
6428 # Related tables will always be identical; we set them up to be so. Thus
6429 # if the Unicode one is deprecated, the Perl one will be too. Not so for
6430 # unrelated tables. Relatedness makes generating the documentation easier.
6433 # Like equivalents, two tables may be the inverses of each other, the
6434 # intersection between them is null, and the union is every Unicode code
6435 # point. The two tables that occupy a binary property are necessarily like
6436 # this. By specifying one table as the complement of another, we can avoid
6437 # storing it on disk (using the other table and performing a fast
6438 # transform), and some memory and calculations.
6440 # 3) Conflicting. It may be that there will eventually be name clashes, with
6441 # the same name meaning different things. For a while, there actually were
6442 # conflicts, but they have so far been resolved by changing Perl's or
6443 # Unicode's definitions to match the other, but when this code was written,
6444 # it wasn't clear that that was what was going to happen. (Unicode changed
6445 # because of protests during their beta period.) Name clashes are warned
6446 # about during compilation, and the documentation. The generated tables
6447 # are sane, free of name clashes, because the code suppresses the Perl
6448 # version. But manual intervention to decide what the actual behavior
6449 # should be may be required should this happen. The introductory comments
6450 # have more to say about this.
6452 sub standardize { return main::standardize($_[0]); }
6453 sub trace { return main::trace(@_); }
6458 main::setup_package();
6461 # The leader table of this one; initially $self.
6462 main::set_access('leader', \%leader, 'r');
6465 # An array of any tables that have this one as their leader
6466 main::set_access('equivalents', \%equivalents, 'readable_array');
6469 # The parent table to this one, initially $self. This allows us to
6470 # distinguish between equivalent tables that are related (for which this
6471 # is set to), and those which may not be, but share the same output file
6472 # because they match the exact same set of code points in the current
6474 main::set_access('parent', \%parent, 'r');
6477 # An array of any tables that have this one as their parent
6478 main::set_access('children', \%children, 'readable_array');
6481 # Array of any tables that would have the same name as this one with
6482 # a different meaning. This is used for the generated documentation.
6483 main::set_access('conflicting', \%conflicting, 'readable_array');
6486 # Set in the constructor for tables that are expected to match all code
6488 main::set_access('matches_all', \%matches_all, 'r');
6491 # Points to the complement that this table is expressed in terms of; 0 if
6493 main::set_access('complement', \%complement, 'r');
6500 # The property for which this table is a listing of property values.
6501 my $property = delete $args{'_Property'};
6503 my $name = delete $args{'Name'};
6504 my $full_name = delete $args{'Full_Name'};
6505 $full_name = $name if ! defined $full_name;
6508 my $initialize = delete $args{'Initialize'};
6509 my $matches_all = delete $args{'Matches_All'} || 0;
6510 my $format = delete $args{'Format'};
6511 # Rest of parameters passed on.
6513 my $range_list = Range_List->new(Initialize => $initialize,
6514 Owner => $property);
6516 my $complete = $full_name;
6517 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6518 # but this helps debug if it
6520 # The complete name for a match table includes it's property in a
6521 # compound form 'property=table', except if the property is the
6522 # pseudo-property, perl, in which case it is just the single form,
6523 # 'table' (If you change the '=' must also change the ':' in lots of
6524 # places in this program that assume an equal sign)
6525 $complete = $property->full_name . "=$complete" if $property != $perl;
6527 my $self = $class->SUPER::new(%args,
6529 Complete_Name => $complete,
6530 Full_Name => $full_name,
6531 _Property => $property,
6532 _Range_List => $range_list,
6533 Format => $EMPTY_FORMAT,
6535 my $addr = do { no overloading; pack 'J', $self; };
6537 $conflicting{$addr} = [ ];
6538 $equivalents{$addr} = [ ];
6539 $children{$addr} = [ ];
6540 $matches_all{$addr} = $matches_all;
6541 $leader{$addr} = $self;
6542 $parent{$addr} = $self;
6543 $complement{$addr} = 0;
6545 if (defined $format && $format ne $EMPTY_FORMAT) {
6546 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6552 # See this program's beginning comment block about overloading these.
6555 qw("") => "_operator_stringify",
6559 return if $self->carp_if_locked;
6567 return $self->_range_list + $other;
6573 return $self->_range_list & $other;
6579 return if $self->carp_if_locked;
6581 my $addr = do { no overloading; pack 'J', $self; };
6585 # Change the range list of this table to be the
6587 $self->_set_range_list($self->_range_list
6590 else { # $other is just a simple value
6591 $self->add_range($other, $other);
6595 '-' => sub { my $self = shift;
6597 my $reversed = shift;
6600 Carp::my_carp_bug("Can't cope with a "
6602 . " being the first parameter in a '-'. Subtraction ignored.");
6606 return $self->_range_list - $other;
6608 '~' => sub { my $self = shift;
6609 return ~ $self->_range_list;
6613 sub _operator_stringify {
6616 my $name = $self->complete_name;
6617 return "Table '$name'";
6621 # Returns the range list associated with this table, which will be the
6622 # complement's if it has one.
6626 if (($complement = $self->complement) != 0) {
6627 return ~ $complement->_range_list;
6630 return $self->SUPER::_range_list;
6635 # Add a synonym for this table. See the comments in the base class
6639 # Rest of parameters passed on.
6641 $self->SUPER::add_alias($name, $self, @_);
6645 sub add_conflicting {
6646 # Add the name of some other object to the list of ones that name
6647 # clash with this match table.
6650 my $conflicting_name = shift; # The name of the conflicting object
6651 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6652 my $conflicting_object = shift; # Optional, the conflicting object
6653 # itself. This is used to
6654 # disambiguate the text if the input
6655 # name is identical to any of the
6656 # aliases $self is known by.
6657 # Sometimes the conflicting object is
6658 # merely hypothetical, so this has to
6659 # be an optional parameter.
6660 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6662 my $addr = do { no overloading; pack 'J', $self; };
6664 # Check if the conflicting name is exactly the same as any existing
6665 # alias in this table (as long as there is a real object there to
6666 # disambiguate with).
6667 if (defined $conflicting_object) {
6668 foreach my $alias ($self->aliases) {
6669 if ($alias->name eq $conflicting_name) {
6671 # Here, there is an exact match. This results in
6672 # ambiguous comments, so disambiguate by changing the
6673 # conflicting name to its object's complete equivalent.
6674 $conflicting_name = $conflicting_object->complete_name;
6680 # Convert to the \p{...} final name
6681 $conflicting_name = "\\$p" . "{$conflicting_name}";
6684 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6686 push @{$conflicting{$addr}}, $conflicting_name;
6691 sub is_set_equivalent_to {
6692 # Return boolean of whether or not the other object is a table of this
6693 # type and has been marked equivalent to this one.
6697 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6699 return 0 if ! defined $other; # Can happen for incomplete early
6701 unless ($other->isa(__PACKAGE__)) {
6702 my $ref_other = ref $other;
6703 my $ref_self = ref $self;
6704 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6708 # Two tables are equivalent if they have the same leader.
6710 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6714 sub set_equivalent_to {
6715 # Set $self equivalent to the parameter table.
6716 # The required Related => 'x' parameter is a boolean indicating
6717 # whether these tables are related or not. If related, $other becomes
6718 # the 'parent' of $self; if unrelated it becomes the 'leader'
6720 # Related tables share all characteristics except names; equivalents
6721 # not quite so many.
6722 # If they are related, one must be a perl extension. This is because
6723 # we can't guarantee that Unicode won't change one or the other in a
6724 # later release even if they are identical now.
6730 my $related = delete $args{'Related'};
6732 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6734 return if ! defined $other; # Keep on going; happens in some early
6737 if (! defined $related) {
6738 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6742 # If already are equivalent, no need to re-do it; if subroutine
6743 # returns null, it found an error, also do nothing
6744 my $are_equivalent = $self->is_set_equivalent_to($other);
6745 return if ! defined $are_equivalent || $are_equivalent;
6747 my $addr = do { no overloading; pack 'J', $self; };
6748 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6751 if ($current_leader->perl_extension) {
6752 if ($other->perl_extension) {
6753 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6756 } elsif ($self->property != $other->property # Depending on
6762 && ! $other->perl_extension)
6764 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6769 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6770 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6774 my $leader = do { no overloading; pack 'J', $current_leader; };
6775 my $other_addr = do { no overloading; pack 'J', $other; };
6777 # Any tables that are equivalent to or children of this table must now
6778 # instead be equivalent to or (children) to the new leader (parent),
6779 # still equivalent. The equivalency includes their matches_all info,
6780 # and for related tables, their fate and status.
6781 # All related tables are of necessity equivalent, but the converse
6782 # isn't necessarily true
6783 my $status = $other->status;
6784 my $status_info = $other->status_info;
6785 my $fate = $other->fate;
6786 my $matches_all = $matches_all{other_addr};
6787 my $caseless_equivalent = $other->caseless_equivalent;
6788 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6789 next if $table == $other;
6790 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6792 my $table_addr = do { no overloading; pack 'J', $table; };
6793 $leader{$table_addr} = $other;
6794 $matches_all{$table_addr} = $matches_all;
6795 $self->_set_range_list($other->_range_list);
6796 push @{$equivalents{$other_addr}}, $table;
6798 $parent{$table_addr} = $other;
6799 push @{$children{$other_addr}}, $table;
6800 $table->set_status($status, $status_info);
6802 # This reason currently doesn't get exposed outside; otherwise
6803 # would have to look up the parent's reason and use it instead.
6804 $table->set_fate($fate, "Parent's fate");
6806 $self->set_caseless_equivalent($caseless_equivalent);
6810 # Now that we've declared these to be equivalent, any changes to one
6811 # of the tables would invalidate that equivalency.
6817 sub set_complement {
6818 # Set $self to be the complement of the parameter table. $self is
6819 # locked, as what it contains should all come from the other table.
6825 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6827 if ($other->complement != 0) {
6828 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
6831 my $addr = do { no overloading; pack 'J', $self; };
6832 $complement{$addr} = $other;
6837 sub add_range { # Add a range to the list for this table.
6839 # Rest of parameters passed on
6841 return if $self->carp_if_locked;
6842 return $self->_range_list->add_range(@_);
6847 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6849 # All match tables are to be used only by the Perl core.
6850 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
6853 sub pre_body { # Does nothing for match tables.
6857 sub append_to_body { # Does nothing for match tables.
6865 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6867 $self->SUPER::set_fate($fate, $reason);
6869 # All children share this fate
6870 foreach my $child ($self->children) {
6871 $child->set_fate($fate, $reason);
6878 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6880 return $self->SUPER::write(2); # 2 tab stops
6883 sub set_final_comment {
6884 # This creates a comment for the file that is to hold the match table
6885 # $self. It is somewhat convoluted to make the English read nicely,
6886 # but, heh, it's just a comment.
6887 # This should be called only with the leader match table of all the
6888 # ones that share the same file. It lists all such tables, ordered so
6889 # that related ones are together.
6891 return unless $debugging_build;
6893 my $leader = shift; # Should only be called on the leader table of
6894 # an equivalent group
6895 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6897 my $addr = do { no overloading; pack 'J', $leader; };
6899 if ($leader{$addr} != $leader) {
6900 Carp::my_carp_bug(<<END
6901 set_final_comment() must be called on a leader table, which $leader is not.
6902 It is equivalent to $leader{$addr}. No comment created
6908 # Get the number of code points matched by each of the tables in this
6909 # file, and add underscores for clarity.
6910 my $count = $leader->count;
6911 my $string_count = main::clarify_number($count);
6913 my $loose_count = 0; # how many aliases loosely matched
6914 my $compound_name = ""; # ? Are any names compound?, and if so, an
6916 my $properties_with_compound_names = 0; # count of these
6919 my %flags; # The status flags used in the file
6920 my $total_entries = 0; # number of entries written in the comment
6921 my $matches_comment = ""; # The portion of the comment about the
6923 my @global_comments; # List of all the tables' comments that are
6924 # there before this routine was called.
6926 # Get list of all the parent tables that are equivalent to this one
6927 # (including itself).
6928 my @parents = grep { $parent{main::objaddr $_} == $_ }
6929 main::uniques($leader, @{$equivalents{$addr}});
6930 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6933 for my $parent (@parents) {
6935 my $property = $parent->property;
6937 # Special case 'N' tables in properties with two match tables when
6938 # the other is a 'Y' one. These are likely to be binary tables,
6939 # but not necessarily. In either case, \P{} will match the
6940 # complement of \p{}, and so if something is a synonym of \p, the
6941 # complement of that something will be the synonym of \P. This
6942 # would be true of any property with just two match tables, not
6943 # just those whose values are Y and N; but that would require a
6944 # little extra work, and there are none such so far in Unicode.
6945 my $perl_p = 'p'; # which is it? \p{} or \P{}
6946 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6948 if (scalar $property->tables == 2
6949 && $parent == $property->table('N')
6950 && defined (my $yes = $property->table('Y')))
6952 my $yes_addr = do { no overloading; pack 'J', $yes; };
6954 = grep { $_->property == $perl }
6957 $parent{$yes_addr}->children);
6959 # But these synonyms are \P{} ,not \p{}
6963 my @description; # Will hold the table description
6964 my @note; # Will hold the table notes.
6965 my @conflicting; # Will hold the table conflicts.
6967 # Look at the parent, any yes synonyms, and all the children
6968 my $parent_addr = do { no overloading; pack 'J', $parent; };
6969 for my $table ($parent,
6971 @{$children{$parent_addr}})
6973 my $table_addr = do { no overloading; pack 'J', $table; };
6974 my $table_property = $table->property;
6976 # Tables are separated by a blank line to create a grouping.
6977 $matches_comment .= "\n" if $matches_comment;
6979 # The table is named based on the property and value
6980 # combination it is for, like script=greek. But there may be
6981 # a number of synonyms for each side, like 'sc' for 'script',
6982 # and 'grek' for 'greek'. Any combination of these is a valid
6983 # name for this table. In this case, there are three more,
6984 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6985 # listing all possible combinations in the comment, we make
6986 # sure that each synonym occurs at least once, and add
6987 # commentary that the other combinations are possible.
6988 # Because regular expressions don't recognize things like
6989 # \p{jsn=}, only look at non-null right-hand-sides
6990 my @property_aliases = $table_property->aliases;
6991 my @table_aliases = grep { $_->name ne "" } $table->aliases;
6993 # The alias lists above are already ordered in the order we
6994 # want to output them. To ensure that each synonym is listed,
6995 # we must use the max of the two numbers. But if there are no
6996 # legal synonyms (nothing in @table_aliases), then we don't
6998 my $listed_combos = (@table_aliases)
6999 ? main::max(scalar @table_aliases,
7000 scalar @property_aliases)
7002 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7005 my $property_had_compound_name = 0;
7007 for my $i (0 .. $listed_combos - 1) {
7010 # The current alias for the property is the next one on
7011 # the list, or if beyond the end, start over. Similarly
7012 # for the table (\p{prop=table})
7013 my $property_alias = $property_aliases
7014 [$i % @property_aliases]->name;
7015 my $table_alias_object = $table_aliases
7016 [$i % @table_aliases];
7017 my $table_alias = $table_alias_object->name;
7018 my $loose_match = $table_alias_object->loose_match;
7020 if ($table_alias !~ /\D/) { # Clarify large numbers.
7021 $table_alias = main::clarify_number($table_alias)
7024 # Add a comment for this alias combination
7025 my $current_match_comment;
7026 if ($table_property == $perl) {
7027 $current_match_comment = "\\$perl_p"
7031 $current_match_comment
7032 = "\\p{$property_alias=$table_alias}";
7033 $property_had_compound_name = 1;
7036 # Flag any abnormal status for this table.
7037 my $flag = $property->status
7039 || $table_alias_object->status;
7040 $flags{$flag} = $status_past_participles{$flag} if $flag;
7044 # Pretty up the comment. Note the \b; it says don't make
7045 # this line a continuation.
7046 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7049 $current_match_comment);
7050 } # End of generating the entries for this table.
7052 # Save these for output after this group of related tables.
7053 push @description, $table->description;
7054 push @note, $table->note;
7055 push @conflicting, $table->conflicting;
7057 # And this for output after all the tables.
7058 push @global_comments, $table->comment;
7060 # Compute an alternate compound name using the final property
7061 # synonym and the first table synonym with a colon instead of
7062 # the equal sign used elsewhere.
7063 if ($property_had_compound_name) {
7064 $properties_with_compound_names ++;
7065 if (! $compound_name || @property_aliases > 1) {
7066 $compound_name = $property_aliases[-1]->name
7068 . $table_aliases[0]->name;
7071 } # End of looping through all children of this table
7073 # Here have assembled in $matches_comment all the related tables
7074 # to the current parent (preceded by the same info for all the
7075 # previous parents). Put out information that applies to all of
7076 # the current family.
7079 # But output the conflicting information now, as it applies to
7081 my $conflicting = join ", ", @conflicting;
7083 $matches_comment .= <<END;
7085 Note that contrary to what you might expect, the above is NOT the same as
7087 $matches_comment .= "any of: " if @conflicting > 1;
7088 $matches_comment .= "$conflicting\n";
7092 $matches_comment .= "\n Meaning: "
7093 . join('; ', @description)
7097 $matches_comment .= "\n Note: "
7098 . join("\n ", @note)
7101 } # End of looping through all tables
7109 $code_points = 'single code point';
7113 $code_points = "$string_count code points";
7118 if ($total_entries == 1) {
7121 $any_of_these = 'this'
7124 $synonyms = " any of the following regular expression constructs";
7125 $entries = 'entries';
7126 $any_of_these = 'any of these'
7129 my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7130 if ($has_unrelated) {
7132 This file is for tables that are not necessarily related: To conserve
7133 resources, every table that matches the identical set of code points in this
7134 version of Unicode uses this file. Each one is listed in a separate group
7135 below. It could be that the tables will match the same set of code points in
7136 other Unicode releases, or it could be purely coincidence that they happen to
7137 be the same in Unicode $string_version, and hence may not in other versions.
7143 foreach my $flag (sort keys %flags) {
7145 '$flag' below means that this form is $flags{$flag}.
7146 Consult $pod_file.pod
7152 if ($total_entries == 0) {
7153 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7155 This file returns the $code_points in Unicode Version $string_version for
7156 $leader, but it is inaccessible through Perl regular expressions, as
7157 "\\p{prop=}" is not recognized.
7162 This file returns the $code_points in Unicode Version $string_version that
7166 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7167 including if adding or subtracting white space, underscore, and hyphen
7168 characters matters or doesn't matter, and other permissible syntactic
7169 variants. Upper/lower case distinctions never matter.
7173 if ($compound_name) {
7176 A colon can be substituted for the equals sign, and
7178 if ($properties_with_compound_names > 1) {
7180 within each group above,
7183 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7185 # Note the \b below, it says don't make that line a continuation.
7187 anything to the left of the equals (or colon) can be combined with anything to
7188 the right. Thus, for example,
7194 # And append any comment(s) from the actual tables. They are all
7195 # gathered here, so may not read all that well.
7196 if (@global_comments) {
7197 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7200 if ($count) { # The format differs if no code points, and needs no
7201 # explanation in that case
7204 The format of the lines of this file is:
7207 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7208 STOP is the ending point, or if omitted, the range has just one code point.
7210 if ($leader->output_range_counts) {
7212 Numbers in comments in [brackets] indicate how many code points are in the
7218 $leader->set_comment(main::join_lines($comment));
7222 # Accessors for the underlying list
7224 get_valid_code_point
7225 get_invalid_code_point
7233 return $self->_range_list->$sub(@_);
7236 } # End closure for Match_Table
7240 # The Property class represents a Unicode property, or the $perl
7241 # pseudo-property. It contains a map table initialized empty at construction
7242 # time, and for properties accessible through regular expressions, various
7243 # match tables, created through the add_match_table() method, and referenced
7244 # by the table('NAME') or tables() methods, the latter returning a list of all
7245 # of the match tables. Otherwise table operations implicitly are for the map
7248 # Most of the data in the property is actually about its map table, so it
7249 # mostly just uses that table's accessors for most methods. The two could
7250 # have been combined into one object, but for clarity because of their
7251 # differing semantics, they have been kept separate. It could be argued that
7252 # the 'file' and 'directory' fields should be kept with the map table.
7254 # Each property has a type. This can be set in the constructor, or in the
7255 # set_type accessor, but mostly it is figured out by the data. Every property
7256 # starts with unknown type, overridden by a parameter to the constructor, or
7257 # as match tables are added, or ranges added to the map table, the data is
7258 # inspected, and the type changed. After the table is mostly or entirely
7259 # filled, compute_type() should be called to finalize they analysis.
7261 # There are very few operations defined. One can safely remove a range from
7262 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7263 # table to this one, replacing any in the intersection of the two.
7265 sub standardize { return main::standardize($_[0]); }
7266 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7270 # This hash will contain as keys, all the aliases of all properties, and
7271 # as values, pointers to their respective property objects. This allows
7272 # quick look-up of a property from any of its names.
7273 my %alias_to_property_of;
7275 sub dump_alias_to_property_of {
7278 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7283 # This is a package subroutine, not called as a method.
7284 # If the single parameter is a literal '*' it returns a list of all
7285 # defined properties.
7286 # Otherwise, the single parameter is a name, and it returns a pointer
7287 # to the corresponding property object, or undef if none.
7289 # Properties can have several different names. The 'standard' form of
7290 # each of them is stored in %alias_to_property_of as they are defined.
7291 # But it's possible that this subroutine will be called with some
7292 # variant, so if the initial lookup fails, it is repeated with the
7293 # standardized form of the input name. If found, besides returning the
7294 # result, the input name is added to the list so future calls won't
7295 # have to do the conversion again.
7299 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7301 if (! defined $name) {
7302 Carp::my_carp_bug("Undefined input property. No action taken.");
7306 return main::uniques(values %alias_to_property_of) if $name eq '*';
7308 # Return cached result if have it.
7309 my $result = $alias_to_property_of{$name};
7310 return $result if defined $result;
7312 # Convert the input to standard form.
7313 my $standard_name = standardize($name);
7315 $result = $alias_to_property_of{$standard_name};
7316 return unless defined $result; # Don't cache undefs
7318 # Cache the result before returning it.
7319 $alias_to_property_of{$name} = $result;
7324 main::setup_package();
7327 # A pointer to the map table object for this property
7328 main::set_access('map', \%map);
7331 # The property's full name. This is a duplicate of the copy kept in the
7332 # map table, but is needed because stringify needs it during
7333 # construction of the map table, and then would have a chicken before egg
7335 main::set_access('full_name', \%full_name, 'r');
7338 # This hash will contain as keys, all the aliases of any match tables
7339 # attached to this property, and as values, the pointers to their
7340 # respective tables. This allows quick look-up of a table from any of its
7342 main::set_access('table_ref', \%table_ref);
7345 # The type of the property, $ENUM, $BINARY, etc
7346 main::set_access('type', \%type, 'r');
7349 # The filename where the map table will go (if actually written).
7350 # Normally defaulted, but can be overridden.
7351 main::set_access('file', \%file, 'r', 's');
7354 # The directory where the map table will go (if actually written).
7355 # Normally defaulted, but can be overridden.
7356 main::set_access('directory', \%directory, 's');
7358 my %pseudo_map_type;
7359 # This is used to affect the calculation of the map types for all the
7360 # ranges in the table. It should be set to one of the values that signify
7361 # to alter the calculation.
7362 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7364 my %has_only_code_point_maps;
7365 # A boolean used to help in computing the type of data in the map table.
7366 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7369 # A list of the first few distinct mappings this property has. This is
7370 # used to disambiguate between binary and enum property types, so don't
7371 # have to keep more than three.
7372 main::set_access('unique_maps', \%unique_maps);
7374 my %pre_declared_maps;
7375 # A boolean that gives whether the input data should declare all the
7376 # tables used, or not. If the former, unknown ones raise a warning.
7377 main::set_access('pre_declared_maps',
7378 \%pre_declared_maps, 'r', 's');
7381 # The only required parameter is the positionally first, name. All
7382 # other parameters are key => value pairs. See the documentation just
7383 # above for the meanings of the ones not passed directly on to the map
7384 # table constructor.
7387 my $name = shift || "";
7389 my $self = property_ref($name);
7390 if (defined $self) {
7391 my $options_string = join ", ", @_;
7392 $options_string = ". Ignoring options $options_string" if $options_string;
7393 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7399 $self = bless \do { my $anonymous_scalar }, $class;
7400 my $addr = do { no overloading; pack 'J', $self; };
7402 $directory{$addr} = delete $args{'Directory'};
7403 $file{$addr} = delete $args{'File'};
7404 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7405 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7406 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7407 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7408 # Starting in this release, property
7409 # values should be defined for all
7410 # properties, except those overriding this
7411 // $v_version ge v5.1.0;
7413 # Rest of parameters passed on.
7415 $has_only_code_point_maps{$addr} = 1;
7416 $table_ref{$addr} = { };
7417 $unique_maps{$addr} = { };
7419 $map{$addr} = Map_Table->new($name,
7420 Full_Name => $full_name{$addr},
7421 _Alias_Hash => \%alias_to_property_of,
7427 # See this program's beginning comment block about overloading the copy
7428 # constructor. Few operations are defined on properties, but a couple are
7429 # useful. It is safe to take the inverse of a property, and to remove a
7430 # single code point from it.
7433 qw("") => "_operator_stringify",
7434 "." => \&main::_operator_dot,
7435 '==' => \&main::_operator_equal,
7436 '!=' => \&main::_operator_not_equal,
7437 '=' => sub { return shift },
7438 '-=' => "_minus_and_equal",
7441 sub _operator_stringify {
7442 return "Property '" . shift->full_name . "'";
7445 sub _minus_and_equal {
7446 # Remove a single code point from the map table of a property.
7450 my $reversed = shift;
7451 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7454 Carp::my_carp_bug("Can't cope with a "
7456 . " argument to '-='. Subtraction ignored.");
7459 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
7460 Carp::my_carp_bug("Can't cope with a "
7462 . " being the first parameter in a '-='. Subtraction ignored.");
7467 $map{pack 'J', $self}->delete_range($other, $other);
7472 sub add_match_table {
7473 # Add a new match table for this property, with name given by the
7474 # parameter. It returns a pointer to the table.
7480 my $addr = do { no overloading; pack 'J', $self; };
7482 my $table = $table_ref{$addr}{$name};
7483 my $standard_name = main::standardize($name);
7485 || (defined ($table = $table_ref{$addr}{$standard_name})))
7487 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7488 $table_ref{$addr}{$name} = $table;
7493 # See if this is a perl extension, if not passed in.
7494 my $perl_extension = delete $args{'Perl_Extension'};
7496 = $self->perl_extension if ! defined $perl_extension;
7498 $table = Match_Table->new(
7500 Perl_Extension => $perl_extension,
7501 _Alias_Hash => $table_ref{$addr},
7504 # gets property's fate and status by default
7505 Fate => $self->fate,
7506 Status => $self->status,
7507 _Status_Info => $self->status_info,
7509 return unless defined $table;
7512 # Save the names for quick look up
7513 $table_ref{$addr}{$standard_name} = $table;
7514 $table_ref{$addr}{$name} = $table;
7516 # Perhaps we can figure out the type of this property based on the
7517 # fact of adding this match table. First, string properties don't
7518 # have match tables; second, a binary property can't have 3 match
7520 if ($type{$addr} == $UNKNOWN) {
7521 $type{$addr} = $NON_STRING;
7523 elsif ($type{$addr} == $STRING) {
7524 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7525 $type{$addr} = $NON_STRING;
7527 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7528 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7529 && $type{$addr} == $BINARY)
7531 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7532 $type{$addr} = $ENUM;
7539 sub delete_match_table {
7540 # Delete the table referred to by $2 from the property $1.
7543 my $table_to_remove = shift;
7544 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7546 my $addr = do { no overloading; pack 'J', $self; };
7548 # Remove all names that refer to it.
7549 foreach my $key (keys %{$table_ref{$addr}}) {
7550 delete $table_ref{$addr}{$key}
7551 if $table_ref{$addr}{$key} == $table_to_remove;
7554 $table_to_remove->DESTROY;
7559 # Return a pointer to the match table (with name given by the
7560 # parameter) associated with this property; undef if none.
7564 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7566 my $addr = do { no overloading; pack 'J', $self; };
7568 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7570 # If quick look-up failed, try again using the standard form of the
7571 # input name. If that succeeds, cache the result before returning so
7572 # won't have to standardize this input name again.
7573 my $standard_name = main::standardize($name);
7574 return unless defined $table_ref{$addr}{$standard_name};
7576 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7577 return $table_ref{$addr}{$name};
7581 # Return a list of pointers to all the match tables attached to this
7585 return main::uniques(values %{$table_ref{pack 'J', shift}});
7589 # Returns the directory the map table for this property should be
7590 # output in. If a specific directory has been specified, that has
7591 # priority; 'undef' is returned if the type isn't defined;
7592 # or $map_directory for everything else.
7594 my $addr = do { no overloading; pack 'J', shift; };
7596 return $directory{$addr} if defined $directory{$addr};
7597 return undef if $type{$addr} == $UNKNOWN;
7598 return $map_directory;
7602 # Return the name that is used to both:
7603 # 1) Name the file that the map table is written to.
7604 # 2) The name of swash related stuff inside that file.
7605 # The reason for this is that the Perl core historically has used
7606 # certain names that aren't the same as the Unicode property names.
7607 # To continue using these, $file is hard-coded in this file for those,
7608 # but otherwise the standard name is used. This is different from the
7609 # external_name, so that the rest of the files, like in lib can use
7610 # the standard name always, without regard to historical precedent.
7613 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7615 my $addr = do { no overloading; pack 'J', $self; };
7617 # Swash names are used only on regular map tables; otherwise there
7618 # should be no access to the property map table from other parts of
7620 return if $map{$addr}->fate != $ORDINARY;
7622 return $file{$addr} if defined $file{$addr};
7623 return $map{$addr}->external_name;
7626 sub to_create_match_tables {
7627 # Returns a boolean as to whether or not match tables should be
7628 # created for this property.
7631 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7633 # The whole point of this pseudo property is match tables.
7634 return 1 if $self == $perl;
7636 my $addr = do { no overloading; pack 'J', $self; };
7638 # Don't generate tables of code points that match the property values
7639 # of a string property. Such a list would most likely have many
7640 # property values, each with just one or very few code points mapping
7642 return 0 if $type{$addr} == $STRING;
7644 # Don't generate anything for unimplemented properties.
7645 return 0 if grep { $self->complete_name eq $_ }
7646 @unimplemented_properties;
7651 sub property_add_or_replace_non_nulls {
7652 # This adds the mappings in the property $other to $self. Non-null
7653 # mappings from $other override those in $self. It essentially merges
7654 # the two properties, with the second having priority except for null
7659 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7661 if (! $other->isa(__PACKAGE__)) {
7662 Carp::my_carp_bug("$other should be a "
7671 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7675 # Certain tables are not generally written out to files, but
7676 # Unicode::UCD has the intelligence to know that the file for $self
7677 # can be used to reconstruct those tables. This routine just changes
7678 # things so that UCD pod entries for those suppressed tables are
7679 # generated, so the fact that a proxy is used is invisible to the
7684 foreach my $property_name (@_) {
7685 my $ref = property_ref($property_name);
7686 next if $ref->to_output_map;
7687 $ref->set_fate($MAP_PROXIED);
7692 # Set the type of the property. Mostly this is figured out by the
7693 # data in the table. But this is used to set it explicitly. The
7694 # reason it is not a standard accessor is that when setting a binary
7695 # property, we need to make sure that all the true/false aliases are
7696 # present, as they were omitted in early Unicode releases.
7700 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7704 && $type != $FORCED_BINARY
7705 && $type != $STRING)
7707 Carp::my_carp("Unrecognized type '$type'. Type not set");
7711 { no overloading; $type{pack 'J', $self} = $type; }
7712 return if $type != $BINARY && $type != $FORCED_BINARY;
7714 my $yes = $self->table('Y');
7715 $yes = $self->table('Yes') if ! defined $yes;
7716 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7719 # Add aliases in order wanted, duplicates will be ignored. We use a
7720 # binary property present in all releases for its ordered lists of
7721 # true/false aliases. Note, that could run into problems in
7722 # outputting things in that we don't distinguish between the name and
7723 # full name of these. Hopefully, if the table was already created
7724 # before this code is executed, it was done with these set properly.
7725 my $bm = property_ref("Bidi_Mirrored");
7726 foreach my $alias ($bm->table("Y")->aliases) {
7727 $yes->add_alias($alias->name);
7729 my $no = $self->table('N');
7730 $no = $self->table('No') if ! defined $no;
7731 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
7732 foreach my $alias ($bm->table("N")->aliases) {
7733 $no->add_alias($alias->name);
7740 # Add a map to the property's map table. This also keeps
7741 # track of the maps so that the property type can be determined from
7745 my $start = shift; # First code point in range
7746 my $end = shift; # Final code point in range
7747 my $map = shift; # What the range maps to.
7748 # Rest of parameters passed on.
7750 my $addr = do { no overloading; pack 'J', $self; };
7752 # If haven't the type of the property, gather information to figure it
7754 if ($type{$addr} == $UNKNOWN) {
7756 # If the map contains an interior blank or dash, or most other
7757 # nonword characters, it will be a string property. This
7758 # heuristic may actually miss some string properties. If so, they
7759 # may need to have explicit set_types called for them. This
7760 # happens in the Unihan properties.
7761 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7762 || $map =~ / [^\w.\/\ -] /x)
7764 $self->set_type($STRING);
7766 # $unique_maps is used for disambiguating between ENUM and
7767 # BINARY later; since we know the property is not going to be
7768 # one of those, no point in keeping the data around
7769 undef $unique_maps{$addr};
7773 # Not necessarily a string. The final decision has to be
7774 # deferred until all the data are in. We keep track of if all
7775 # the values are code points for that eventual decision.
7776 $has_only_code_point_maps{$addr} &=
7777 $map =~ / ^ $code_point_re $/x;
7779 # For the purposes of disambiguating between binary and other
7780 # enumerations at the end, we keep track of the first three
7781 # distinct property values. Once we get to three, we know
7782 # it's not going to be binary, so no need to track more.
7783 if (scalar keys %{$unique_maps{$addr}} < 3) {
7784 $unique_maps{$addr}{main::standardize($map)} = 1;
7789 # Add the mapping by calling our map table's method
7790 return $map{$addr}->add_map($start, $end, $map, @_);
7794 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7795 # should be called after the property is mostly filled with its maps.
7796 # We have been keeping track of what the property values have been,
7797 # and now have the necessary information to figure out the type.
7800 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7802 my $addr = do { no overloading; pack 'J', $self; };
7804 my $type = $type{$addr};
7806 # If already have figured these out, no need to do so again, but we do
7807 # a double check on ENUMS to make sure that a string property hasn't
7808 # improperly been classified as an ENUM, so continue on with those.
7809 return if $type == $STRING
7811 || $type == $FORCED_BINARY;
7813 # If every map is to a code point, is a string property.
7814 if ($type == $UNKNOWN
7815 && ($has_only_code_point_maps{$addr}
7816 || (defined $map{$addr}->default_map
7817 && $map{$addr}->default_map eq "")))
7819 $self->set_type($STRING);
7823 # Otherwise, it is to some sort of enumeration. (The case where
7824 # it is a Unicode miscellaneous property, and treated like a
7825 # string in this program is handled in add_map()). Distinguish
7826 # between binary and some other enumeration type. Of course, if
7827 # there are more than two values, it's not binary. But more
7828 # subtle is the test that the default mapping is defined means it
7829 # isn't binary. This in fact may change in the future if Unicode
7830 # changes the way its data is structured. But so far, no binary
7831 # properties ever have @missing lines for them, so the default map
7832 # isn't defined for them. The few properties that are two-valued
7833 # and aren't considered binary have the default map defined
7834 # starting in Unicode 5.0, when the @missing lines appeared; and
7835 # this program has special code to put in a default map for them
7836 # for earlier than 5.0 releases.
7838 || scalar keys %{$unique_maps{$addr}} > 2
7839 || defined $self->default_map)
7841 my $tables = $self->tables;
7842 my $count = $self->count;
7843 if ($verbosity && $count > 500 && $tables/$count > .1) {
7844 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7846 $self->set_type($ENUM);
7849 $self->set_type($BINARY);
7852 undef $unique_maps{$addr}; # Garbage collect
7859 my $reason = shift; # Ignored unless suppressing
7860 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7862 my $addr = do { no overloading; pack 'J', $self; };
7863 if ($fate == $SUPPRESSED) {
7864 $why_suppressed{$self->complete_name} = $reason;
7867 # Each table shares the property's fate, except that MAP_PROXIED
7868 # doesn't affect match tables
7869 $map{$addr}->set_fate($fate, $reason);
7870 if ($fate != $MAP_PROXIED) {
7871 foreach my $table ($map{$addr}, $self->tables) {
7872 $table->set_fate($fate, $reason);
7879 # Most of the accessors for a property actually apply to its map table.
7880 # Setup up accessor functions for those, referring to %map
7929 # 'property' above is for symmetry, so that one can take
7930 # the property of a property and get itself, and so don't
7931 # have to distinguish between properties and tables in
7939 return $map{pack 'J', $self}->$sub(@_);
7949 # Returns lines of the input joined together, so that they can be folded
7951 # This causes continuation lines to be joined together into one long line
7952 # for folding. A continuation line is any line that doesn't begin with a
7953 # space or "\b" (the latter is stripped from the output). This is so
7954 # lines can be be in a HERE document so as to fit nicely in the terminal
7955 # width, but be joined together in one long line, and then folded with
7956 # indents, '#' prefixes, etc, properly handled.
7957 # A blank separates the joined lines except if there is a break; an extra
7958 # blank is inserted after a period ending a line.
7960 # Initialize the return with the first line.
7961 my ($return, @lines) = split "\n", shift;
7963 # If the first line is null, it was an empty line, add the \n back in
7964 $return = "\n" if $return eq "";
7966 # Now join the remainder of the physical lines.
7967 for my $line (@lines) {
7969 # An empty line means wanted a blank line, so add two \n's to get that
7970 # effect, and go to the next line.
7971 if (length $line == 0) {
7976 # Look at the last character of what we have so far.
7977 my $previous_char = substr($return, -1, 1);
7979 # And at the next char to be output.
7980 my $next_char = substr($line, 0, 1);
7982 if ($previous_char ne "\n") {
7984 # Here didn't end wth a nl. If the next char a blank or \b, it
7985 # means that here there is a break anyway. So add a nl to the
7987 if ($next_char eq " " || $next_char eq "\b") {
7988 $previous_char = "\n";
7989 $return .= $previous_char;
7992 # Add an extra space after periods.
7993 $return .= " " if $previous_char eq '.';
7996 # Here $previous_char is still the latest character to be output. If
7997 # it isn't a nl, it means that the next line is to be a continuation
7998 # line, with a blank inserted between them.
7999 $return .= " " if $previous_char ne "\n";
8002 substr($line, 0, 1) = "" if $next_char eq "\b";
8004 # And append this next line.
8011 sub simple_fold($;$$$) {
8012 # Returns a string of the input (string or an array of strings) folded
8013 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8015 # This is tailored for the kind of text written by this program,
8016 # especially the pod file, which can have very long names with
8017 # underscores in the middle, or words like AbcDefgHij.... We allow
8018 # breaking in the middle of such constructs if the line won't fit
8019 # otherwise. The break in such cases will come either just after an
8020 # underscore, or just before one of the Capital letters.
8022 local $to_trace = 0 if main::DEBUG;
8025 my $prefix = shift; # Optional string to prepend to each output
8027 $prefix = "" unless defined $prefix;
8029 my $hanging_indent = shift; # Optional number of spaces to indent
8030 # continuation lines
8031 $hanging_indent = 0 unless $hanging_indent;
8033 my $right_margin = shift; # Optional number of spaces to narrow the
8035 $right_margin = 0 unless defined $right_margin;
8037 # Call carp with the 'nofold' option to avoid it from trying to call us
8039 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8041 # The space available doesn't include what's automatically prepended
8042 # to each line, or what's reserved on the right.
8043 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8044 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8046 if (DEBUG && $hanging_indent >= $max) {
8047 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8048 $hanging_indent = 0;
8051 # First, split into the current physical lines.
8053 if (ref $line) { # Better be an array, because not bothering to
8055 foreach my $line (@{$line}) {
8056 push @line, split /\n/, $line;
8060 @line = split /\n/, $line;
8063 #local $to_trace = 1 if main::DEBUG;
8064 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8066 # Look at each current physical line.
8067 for (my $i = 0; $i < @line; $i++) {
8068 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8069 #local $to_trace = 1 if main::DEBUG;
8070 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8072 # Remove prefix, because will be added back anyway, don't want
8074 $line[$i] =~ s/^$prefix//;
8076 # Remove trailing space
8077 $line[$i] =~ s/\s+\Z//;
8079 # If the line is too long, fold it.
8080 if (length $line[$i] > $max) {
8083 # Here needs to fold. Save the leading space in the line for
8085 $line[$i] =~ /^ ( \s* )/x;
8086 my $leading_space = $1;
8087 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8089 # If character at final permissible position is white space,
8090 # fold there, which will delete that white space
8091 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8092 $remainder = substr($line[$i], $max);
8093 $line[$i] = substr($line[$i], 0, $max - 1);
8097 # Otherwise fold at an acceptable break char closest to
8098 # the max length. Look at just the maximal initial
8099 # segment of the line
8100 my $segment = substr($line[$i], 0, $max - 1);
8102 /^ ( .{$hanging_indent} # Don't look before the
8104 \ * # Don't look in leading
8105 # blanks past the indent
8106 [^ ] .* # Find the right-most
8107 (?: # acceptable break:
8108 [ \s = ] # space or equal
8109 | - (?! [.0-9] ) # or non-unary minus.
8110 ) # $1 includes the character
8113 # Split into the initial part that fits, and remaining
8115 $remainder = substr($line[$i], length $1);
8117 trace $line[$i] if DEBUG && $to_trace;
8118 trace $remainder if DEBUG && $to_trace;
8121 # If didn't find a good breaking spot, see if there is a
8122 # not-so-good breaking spot. These are just after
8123 # underscores or where the case changes from lower to
8124 # upper. Use \a as a soft hyphen, but give up
8125 # and don't break the line if there is actually a \a
8126 # already in the input. We use an ascii character for the
8127 # soft-hyphen to avoid any attempt by miniperl to try to
8128 # access the files that this program is creating.
8129 elsif ($segment !~ /\a/
8130 && ($segment =~ s/_/_\a/g
8131 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8133 # Here were able to find at least one place to insert
8134 # our substitute soft hyphen. Find the right-most one
8135 # and replace it by a real hyphen.
8136 trace $segment if DEBUG && $to_trace;
8138 rindex($segment, "\a"),
8141 # Then remove the soft hyphen substitutes.
8142 $segment =~ s/\a//g;
8143 trace $segment if DEBUG && $to_trace;
8145 # And split into the initial part that fits, and
8146 # remainder of the line
8147 my $pos = rindex($segment, '-');
8148 $remainder = substr($line[$i], $pos);
8149 trace $remainder if DEBUG && $to_trace;
8150 $line[$i] = substr($segment, 0, $pos + 1);
8154 # Here we know if we can fold or not. If we can, $remainder
8155 # is what remains to be processed in the next iteration.
8156 if (defined $remainder) {
8157 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8159 # Insert the folded remainder of the line as a new element
8160 # of the array. (It may still be too long, but we will
8161 # deal with that next time through the loop.) Omit any
8162 # leading space in the remainder.
8163 $remainder =~ s/^\s+//;
8164 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8166 # But then indent by whichever is larger of:
8167 # 1) the leading space on the input line;
8168 # 2) the hanging indent.
8169 # This preserves indentation in the original line.
8170 my $lead = ($leading_space)
8171 ? length $leading_space
8173 $lead = max($lead, $hanging_indent);
8174 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8178 # Ready to output the line. Get rid of any trailing space
8179 # And prefix by the required $prefix passed in.
8180 $line[$i] =~ s/\s+$//;
8181 $line[$i] = "$prefix$line[$i]\n";
8182 } # End of looping through all the lines.
8184 return join "", @line;
8187 sub property_ref { # Returns a reference to a property object.
8188 return Property::property_ref(@_);
8191 sub force_unlink ($) {
8192 my $filename = shift;
8193 return unless file_exists($filename);
8194 return if CORE::unlink($filename);
8196 # We might need write permission
8197 chmod 0777, $filename;
8198 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8203 # Given a filename and references to arrays of lines, write the lines of
8204 # each array to the file
8205 # Filename can be given as an arrayref of directory names
8207 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8210 my $use_utf8 = shift;
8212 # Get into a single string if an array, and get rid of, in Unix terms, any
8214 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8215 $file = File::Spec->canonpath($file);
8217 # If has directories, make sure that they all exist
8218 (undef, my $directories, undef) = File::Spec->splitpath($file);
8219 File::Path::mkpath($directories) if $directories && ! -d $directories;
8221 push @files_actually_output, $file;
8223 force_unlink ($file);
8226 if (not open $OUT, ">", $file) {
8227 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8231 binmode $OUT, ":utf8" if $use_utf8;
8233 while (defined (my $lines_ref = shift)) {
8234 unless (@$lines_ref) {
8235 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8238 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8240 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8242 print "$file written.\n" if $verbosity >= $VERBOSE;
8248 sub Standardize($) {
8249 # This converts the input name string into a standardized equivalent to
8253 unless (defined $name) {
8254 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8258 # Remove any leading or trailing white space
8262 # Convert interior white space and hyphens into underscores.
8263 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8265 # Capitalize the letter following an underscore, and convert a sequence of
8266 # multiple underscores to a single one
8267 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8269 # And capitalize the first letter, but not for the special cjk ones.
8270 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8274 sub standardize ($) {
8275 # Returns a lower-cased standardized name, without underscores. This form
8276 # is chosen so that it can distinguish between any real versus superficial
8277 # Unicode name differences. It relies on the fact that Unicode doesn't
8278 # have interior underscores, white space, nor dashes in any
8279 # stricter-matched name. It should not be used on Unicode code point
8280 # names (the Name property), as they mostly, but not always follow these
8283 my $name = Standardize(shift);
8284 return if !defined $name;
8286 $name =~ s/ (?<= .) _ (?= . ) //xg;
8290 sub utf8_heavy_name ($$) {
8291 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8292 # perhaps this function should be placed somewhere, like Heavy.pl so that
8293 # utf8_heavy can use it directly without duplicating code that can get
8298 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8300 my $property = $table->property;
8301 $property = ($property == $perl)
8302 ? "" # 'perl' is never explicitly stated
8303 : standardize($property->name) . '=';
8304 if ($alias->loose_match) {
8305 return $property . standardize($alias->name);
8308 return lc ($property . $alias->name);
8316 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8319 $main::simple_dumper_nesting = 0;
8322 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8323 # the real thing as we have to run under miniperl.
8325 # It is designed so that on input it is at the beginning of a line,
8326 # and the final thing output in any call is a trailing ",\n".
8330 $indent = "" if ! $debugging_build || ! defined $indent;
8332 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8334 # nesting level is localized, so that as the call stack pops, it goes
8335 # back to the prior value.
8336 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8337 undef %already_output if $main::simple_dumper_nesting == 0;
8338 $main::simple_dumper_nesting++;
8339 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8341 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8343 # Determine the indent for recursive calls.
8344 my $next_indent = $indent . $indent_increment;
8349 # Dump of scalar: just output it in quotes if not a number. To do
8350 # so we must escape certain characters, and therefore need to
8351 # operate on a copy to avoid changing the original
8353 $copy = $UNDEF unless defined $copy;
8355 # Quote non-integers (integers also have optional leading '-')
8356 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8358 # Escape apostrophe and backslash
8359 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8362 $output = "$indent$copy,\n";
8366 # Keep track of cycles in the input, and refuse to infinitely loop
8367 my $addr = do { no overloading; pack 'J', $item; };
8368 if (defined $already_output{$addr}) {
8369 return "${indent}ALREADY OUTPUT: $item\n";
8371 $already_output{$addr} = $item;
8373 if (ref $item eq 'ARRAY') {
8376 if ($main::simple_dumper_nesting > 1) {
8378 $using_brackets = 1;
8381 $using_brackets = 0;
8384 # If the array is empty, put the closing bracket on the same
8385 # line. Otherwise, recursively add each array element
8391 for (my $i = 0; $i < @$item; $i++) {
8393 # Indent array elements one level
8394 $output .= &simple_dumper($item->[$i], $next_indent);
8395 next if ! $debugging_build;
8396 $output =~ s/\n$//; # Remove any trailing nl so
8397 $output .= " # [$i]\n"; # as to add a comment giving
8400 $output .= $indent; # Indent closing ']' to orig level
8402 $output .= ']' if $using_brackets;
8405 elsif (ref $item eq 'HASH') {
8410 # No surrounding braces at top level
8412 if ($main::simple_dumper_nesting > 1) {
8415 $body_indent = $next_indent;
8416 $next_indent .= $indent_increment;
8421 $body_indent = $indent;
8425 # Output hashes sorted alphabetically instead of apparently
8426 # random. Use caseless alphabetic sort
8427 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8429 if ($is_first_line) {
8433 $output .= "$body_indent";
8436 # The key must be a scalar, but this recursive call quotes
8438 $output .= &simple_dumper($key);
8440 # And change the trailing comma and nl to the hash fat
8441 # comma for clarity, and so the value can be on the same
8443 $output =~ s/,\n$/ => /;
8445 # Recursively call to get the value's dump.
8446 my $next = &simple_dumper($item->{$key}, $next_indent);
8448 # If the value is all on one line, remove its indent, so
8449 # will follow the => immediately. If it takes more than
8450 # one line, start it on a new line.
8451 if ($next !~ /\n.*\n/) {
8460 $output .= "$indent},\n" if $using_braces;
8462 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8463 $output = $indent . ref($item) . "\n";
8464 # XXX see if blessed
8466 elsif ($item->can('dump')) {
8468 # By convention in this program, objects furnish a 'dump'
8469 # method. Since not doing any output at this level, just pass
8470 # on the input indent
8471 $output = $item->dump($indent);
8474 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8481 sub dump_inside_out {
8482 # Dump inside-out hashes in an object's state by converting them to a
8483 # regular hash and then calling simple_dumper on that.
8486 my $fields_ref = shift;
8487 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8489 my $addr = do { no overloading; pack 'J', $object; };
8492 foreach my $key (keys %$fields_ref) {
8493 $hash{$key} = $fields_ref->{$key}{$addr};
8496 return simple_dumper(\%hash, @_);
8500 # Overloaded '.' method that is common to all packages. It uses the
8501 # package's stringify method.
8505 my $reversed = shift;
8506 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8508 $other = "" unless defined $other;
8510 foreach my $which (\$self, \$other) {
8511 next unless ref $$which;
8512 if ($$which->can('_operator_stringify')) {
8513 $$which = $$which->_operator_stringify;
8516 my $ref = ref $$which;
8517 my $addr = do { no overloading; pack 'J', $$which; };
8518 $$which = "$ref ($addr)";
8526 sub _operator_equal {
8527 # Generic overloaded '==' routine. To be equal, they must be the exact
8533 return 0 unless defined $other;
8534 return 0 unless ref $other;
8536 return $self == $other;
8539 sub _operator_not_equal {
8543 return ! _operator_equal($self, $other);
8546 sub process_PropertyAliases($) {
8547 # This reads in the PropertyAliases.txt file, which contains almost all
8548 # the character properties in Unicode and their equivalent aliases:
8549 # scf ; Simple_Case_Folding ; sfc
8551 # Field 0 is the preferred short name for the property.
8552 # Field 1 is the full name.
8553 # Any succeeding ones are other accepted names.
8556 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8558 # This whole file was non-existent in early releases, so use our own
8560 $file->insert_lines(get_old_property_aliases())
8561 if ! -e 'PropertyAliases.txt';
8563 # Add any cjk properties that may have been defined.
8564 $file->insert_lines(@cjk_properties);
8566 while ($file->next_line) {
8568 my @data = split /\s*;\s*/;
8570 my $full = $data[1];
8572 my $this = Property->new($data[0], Full_Name => $full);
8574 # Start looking for more aliases after these two.
8575 for my $i (2 .. @data - 1) {
8576 $this->add_alias($data[$i]);
8583 sub finish_property_setup {
8584 # Finishes setting up after PropertyAliases.
8587 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8589 # This entry was missing from this file in earlier Unicode versions
8590 if (-e 'Jamo.txt') {
8591 my $jsn = property_ref('JSN');
8592 if (! defined $jsn) {
8593 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8597 # This entry is still missing as of 6.0, perhaps because no short name for
8599 if (-e 'NameAliases.txt') {
8600 my $aliases = property_ref('Name_Alias');
8601 if (! defined $aliases) {
8602 $aliases = Property->new('Name_Alias');
8606 # These are used so much, that we set globals for them.
8607 $gc = property_ref('General_Category');
8608 $block = property_ref('Block');
8609 $script = property_ref('Script');
8611 # Perl adds this alias.
8612 $gc->add_alias('Category');
8614 # For backwards compatibility, these property files have particular names.
8615 property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
8617 property_ref('Lowercase_Mapping')->set_file('Lower');
8618 property_ref('Titlecase_Mapping')->set_file('Title');
8620 my $fold = property_ref('Case_Folding');
8621 $fold->set_file('Fold') if defined $fold;
8623 # Unicode::Normalize expects this file with this name and directory.
8624 my $ccc = property_ref('Canonical_Combining_Class');
8626 $ccc->set_file('CombiningClass');
8627 $ccc->set_directory(File::Spec->curdir());
8630 # utf8.c has a different meaning for non range-size-1 for map properties
8631 # that this program doesn't currently handle; and even if it were changed
8632 # to do so, some other code may be using them expecting range size 1.
8633 foreach my $property (qw {
8640 property_ref($property)->set_range_size_1(1);
8643 # These two properties aren't actually used in the core, but unfortunately
8644 # the names just above that are in the core interfere with these, so
8645 # choose different names. These aren't a problem unless the map tables
8646 # for these files get written out.
8647 my $lowercase = property_ref('Lowercase');
8648 $lowercase->set_file('IsLower') if defined $lowercase;
8649 my $uppercase = property_ref('Uppercase');
8650 $uppercase->set_file('IsUpper') if defined $uppercase;
8652 # Set up the hard-coded default mappings, but only on properties defined
8654 foreach my $property (keys %default_mapping) {
8655 my $property_object = property_ref($property);
8656 next if ! defined $property_object;
8657 my $default_map = $default_mapping{$property};
8658 $property_object->set_default_map($default_map);
8660 # A map of <code point> implies the property is string.
8661 if ($property_object->type == $UNKNOWN
8662 && $default_map eq $CODE_POINT)
8664 $property_object->set_type($STRING);
8668 # The following use the Multi_Default class to create objects for
8671 # Bidi class has a complicated default, but the derived file takes care of
8672 # the complications, leaving just 'L'.
8673 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8674 property_ref('Bidi_Class')->set_default_map('L');
8679 # The derived file was introduced in 3.1.1. The values below are
8680 # taken from table 3-8, TUS 3.0
8682 'my $default = Range_List->new;
8683 $default->add_range(0x0590, 0x05FF);
8684 $default->add_range(0xFB1D, 0xFB4F);'
8687 # The defaults apply only to unassigned characters
8688 $default_R .= '$gc->table("Unassigned") & $default;';
8690 if ($v_version lt v3.0.0) {
8691 $default = Multi_Default->new(R => $default_R, 'L');
8695 # AL apparently not introduced until 3.0: TUS 2.x references are
8696 # not on-line to check it out
8698 'my $default = Range_List->new;
8699 $default->add_range(0x0600, 0x07BF);
8700 $default->add_range(0xFB50, 0xFDFF);
8701 $default->add_range(0xFE70, 0xFEFF);'
8704 # Non-character code points introduced in this release; aren't AL
8705 if ($v_version ge 3.1.0) {
8706 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8708 $default_AL .= '$gc->table("Unassigned") & $default';
8709 $default = Multi_Default->new(AL => $default_AL,
8713 property_ref('Bidi_Class')->set_default_map($default);
8716 # Joining type has a complicated default, but the derived file takes care
8717 # of the complications, leaving just 'U' (or Non_Joining), except the file
8719 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8720 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8721 property_ref('Joining_Type')->set_default_map('Non_Joining');
8725 # Otherwise, there are not one, but two possibilities for the
8726 # missing defaults: T and U.
8727 # The missing defaults that evaluate to T are given by:
8728 # T = Mn + Cf - ZWNJ - ZWJ
8729 # where Mn and Cf are the general category values. In other words,
8730 # any non-spacing mark or any format control character, except
8731 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8732 # WIDTH JOINER (joining type C).
8733 my $default = Multi_Default->new(
8734 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8736 property_ref('Joining_Type')->set_default_map($default);
8740 # Line break has a complicated default in early releases. It is 'Unknown'
8741 # for non-assigned code points; 'AL' for assigned.
8742 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8743 my $lb = property_ref('Line_Break');
8744 if ($v_version gt 3.2.0) {
8745 $lb->set_default_map('Unknown');
8748 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8750 $lb->set_default_map($default);
8753 # If has the URS property, make sure that the standard aliases are in
8754 # it, since not in the input tables in some versions.
8755 my $urs = property_ref('Unicode_Radical_Stroke');
8757 $urs->add_alias('cjkRSUnicode');
8758 $urs->add_alias('kRSUnicode');
8764 sub get_old_property_aliases() {
8765 # Returns what would be in PropertyAliases.txt if it existed in very old
8766 # versions of Unicode. It was derived from the one in 3.2, and pared
8767 # down based on the data that was actually in the older releases.
8768 # An attempt was made to use the existence of files to mean inclusion or
8769 # not of various aliases, but if this was not sufficient, using version
8770 # numbers was resorted to.
8774 # These are to be used in all versions (though some are constructed by
8775 # this program if missing)
8776 push @return, split /\n/, <<'END';
8778 Bidi_M ; Bidi_Mirrored
8780 ccc ; Canonical_Combining_Class
8781 dm ; Decomposition_Mapping
8782 dt ; Decomposition_Type
8783 gc ; General_Category
8785 lc ; Lowercase_Mapping
8787 na1 ; Unicode_1_Name
8790 sfc ; Simple_Case_Folding
8791 slc ; Simple_Lowercase_Mapping
8792 stc ; Simple_Titlecase_Mapping
8793 suc ; Simple_Uppercase_Mapping
8794 tc ; Titlecase_Mapping
8795 uc ; Uppercase_Mapping
8798 if (-e 'Blocks.txt') {
8799 push @return, "blk ; Block\n";
8801 if (-e 'ArabicShaping.txt') {
8802 push @return, split /\n/, <<'END';
8807 if (-e 'PropList.txt') {
8809 # This first set is in the original old-style proplist.
8810 push @return, split /\n/, <<'END';
8812 Bidi_C ; Bidi_Control
8820 Join_C ; Join_Control
8822 QMark ; Quotation_Mark
8823 Term ; Terminal_Punctuation
8824 WSpace ; White_Space
8826 # The next sets were added later
8827 if ($v_version ge v3.0.0) {
8828 push @return, split /\n/, <<'END';
8833 if ($v_version ge v3.0.1) {
8834 push @return, split /\n/, <<'END';
8835 NChar ; Noncharacter_Code_Point
8838 # The next sets were added in the new-style
8839 if ($v_version ge v3.1.0) {
8840 push @return, split /\n/, <<'END';
8841 OAlpha ; Other_Alphabetic
8842 OLower ; Other_Lowercase
8844 OUpper ; Other_Uppercase
8847 if ($v_version ge v3.1.1) {
8848 push @return, "AHex ; ASCII_Hex_Digit\n";
8851 if (-e 'EastAsianWidth.txt') {
8852 push @return, "ea ; East_Asian_Width\n";
8854 if (-e 'CompositionExclusions.txt') {
8855 push @return, "CE ; Composition_Exclusion\n";
8857 if (-e 'LineBreak.txt') {
8858 push @return, "lb ; Line_Break\n";
8860 if (-e 'BidiMirroring.txt') {
8861 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8863 if (-e 'Scripts.txt') {
8864 push @return, "sc ; Script\n";
8866 if (-e 'DNormalizationProps.txt') {
8867 push @return, split /\n/, <<'END';
8868 Comp_Ex ; Full_Composition_Exclusion
8869 FC_NFKC ; FC_NFKC_Closure
8870 NFC_QC ; NFC_Quick_Check
8871 NFD_QC ; NFD_Quick_Check
8872 NFKC_QC ; NFKC_Quick_Check
8873 NFKD_QC ; NFKD_Quick_Check
8874 XO_NFC ; Expands_On_NFC
8875 XO_NFD ; Expands_On_NFD
8876 XO_NFKC ; Expands_On_NFKC
8877 XO_NFKD ; Expands_On_NFKD
8880 if (-e 'DCoreProperties.txt') {
8881 push @return, split /\n/, <<'END';
8886 # These can also appear in some versions of PropList.txt
8887 push @return, "Lower ; Lowercase\n"
8888 unless grep { $_ =~ /^Lower\b/} @return;
8889 push @return, "Upper ; Uppercase\n"
8890 unless grep { $_ =~ /^Upper\b/} @return;
8893 # This flag requires the DAge.txt file to be copied into the directory.
8894 if (DEBUG && $compare_versions) {
8895 push @return, 'age ; Age';
8901 sub process_PropValueAliases {
8902 # This file contains values that properties look like:
8903 # bc ; AL ; Arabic_Letter
8904 # blk; n/a ; Greek_And_Coptic ; Greek
8906 # Field 0 is the property.
8907 # Field 1 is the short name of a property value or 'n/a' if no
8908 # short name exists;
8909 # Field 2 is the full property value name;
8910 # Any other fields are more synonyms for the property value.
8911 # Purely numeric property values are omitted from the file; as are some
8912 # others, fewer and fewer in later releases
8914 # Entries for the ccc property have an extra field before the
8916 # ccc; 0; NR ; Not_Reordered
8917 # It is the numeric value that the names are synonyms for.
8919 # There are comment entries for values missing from this file:
8920 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8921 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8924 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8926 # This whole file was non-existent in early releases, so use our own
8927 # internal one if necessary.
8928 if (! -e 'PropValueAliases.txt') {
8929 $file->insert_lines(get_old_property_value_aliases());
8932 # Add any explicit cjk values
8933 $file->insert_lines(@cjk_property_values);
8935 # This line is used only for testing the code that checks for name
8936 # conflicts. There is a script Inherited, and when this line is executed
8937 # it causes there to be a name conflict with the 'Inherited' that this
8938 # program generates for this block property value
8939 #$file->insert_lines('blk; n/a; Herited');
8942 # Process each line of the file ...
8943 while ($file->next_line) {
8945 my ($property, @data) = split /\s*;\s*/;
8947 # The ccc property has an extra field at the beginning, which is the
8948 # numeric value. Move it to be after the other two, mnemonic, fields,
8949 # so that those will be used as the property value's names, and the
8950 # number will be an extra alias. (Rightmost splice removes field 1-2,
8951 # returning them in a slice; left splice inserts that before anything,
8952 # thus shifting the former field 0 to after them.)
8953 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8955 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
8956 # there is no short name, use the full one in element 1
8957 if ($data[0] eq "n/a") {
8958 $data[0] = $data[1];
8960 elsif ($data[0] ne $data[1]
8961 && standardize($data[0]) eq standardize($data[1])
8962 && $data[1] !~ /[[:upper:]]/)
8964 # Also, there is a bug in the file in which "n/a" is omitted, and
8965 # the two fields are identical except for case, and the full name
8966 # is all lower case. Copy the "short" name unto the full one to
8967 # give it some upper case.
8969 $data[1] = $data[0];
8972 # Earlier releases had the pseudo property 'qc' that should expand to
8973 # the ones that replace it below.
8974 if ($property eq 'qc') {
8975 if (lc $data[0] eq 'y') {
8976 $file->insert_lines('NFC_QC; Y ; Yes',
8982 elsif (lc $data[0] eq 'n') {
8983 $file->insert_lines('NFC_QC; N ; No',
8989 elsif (lc $data[0] eq 'm') {
8990 $file->insert_lines('NFC_QC; M ; Maybe',
8991 'NFKC_QC; M ; Maybe',
8995 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9000 # The first field is the short name, 2nd is the full one.
9001 my $property_object = property_ref($property);
9002 my $table = $property_object->add_match_table($data[0],
9003 Full_Name => $data[1]);
9005 # Start looking for more aliases after these two.
9006 for my $i (2 .. @data - 1) {
9007 $table->add_alias($data[$i]);
9009 } # End of looping through the file
9011 # As noted in the comments early in the program, it generates tables for
9012 # the default values for all releases, even those for which the concept
9013 # didn't exist at the time. Here we add those if missing.
9014 my $age = property_ref('age');
9015 if (defined $age && ! defined $age->table('Unassigned')) {
9016 $age->add_match_table('Unassigned');
9018 $block->add_match_table('No_Block') if -e 'Blocks.txt'
9019 && ! defined $block->table('No_Block');
9022 # Now set the default mappings of the properties from the file. This is
9023 # done after the loop because a number of properties have only @missings
9024 # entries in the file, and may not show up until the end.
9025 my @defaults = $file->get_missings;
9026 foreach my $default_ref (@defaults) {
9027 my $default = $default_ref->[0];
9028 my $property = property_ref($default_ref->[1]);
9029 $property->set_default_map($default);
9034 sub get_old_property_value_aliases () {
9035 # Returns what would be in PropValueAliases.txt if it existed in very old
9036 # versions of Unicode. It was derived from the one in 3.2, and pared
9037 # down. An attempt was made to use the existence of files to mean
9038 # inclusion or not of various aliases, but if this was not sufficient,
9039 # using version numbers was resorted to.
9041 my @return = split /\n/, <<'END';
9042 bc ; AN ; Arabic_Number
9043 bc ; B ; Paragraph_Separator
9044 bc ; CS ; Common_Separator
9045 bc ; EN ; European_Number
9046 bc ; ES ; European_Separator
9047 bc ; ET ; European_Terminator
9048 bc ; L ; Left_To_Right
9049 bc ; ON ; Other_Neutral
9050 bc ; R ; Right_To_Left
9051 bc ; WS ; White_Space
9053 # The standard combining classes are very much different in v1, so only use
9054 # ones that look right (not checked thoroughly)
9055 ccc; 0; NR ; Not_Reordered
9056 ccc; 1; OV ; Overlay
9058 ccc; 8; KV ; Kana_Voicing
9060 ccc; 202; ATBL ; Attached_Below_Left
9061 ccc; 216; ATAR ; Attached_Above_Right
9062 ccc; 218; BL ; Below_Left
9064 ccc; 222; BR ; Below_Right
9066 ccc; 228; AL ; Above_Left
9068 ccc; 232; AR ; Above_Right
9069 ccc; 234; DA ; Double_Above
9071 dt ; can ; canonical
9085 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9087 gc ; Cn ; Unassigned
9088 gc ; Co ; Private_Use
9089 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9090 gc ; LC ; Cased_Letter # Ll | Lt | Lu
9091 gc ; Ll ; Lowercase_Letter
9092 gc ; Lm ; Modifier_Letter
9093 gc ; Lo ; Other_Letter
9094 gc ; Lu ; Uppercase_Letter
9095 gc ; M ; Mark # Mc | Me | Mn
9096 gc ; Mc ; Spacing_Mark
9097 gc ; Mn ; Nonspacing_Mark
9098 gc ; N ; Number # Nd | Nl | No
9099 gc ; Nd ; Decimal_Number
9100 gc ; No ; Other_Number
9101 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9102 gc ; Pd ; Dash_Punctuation
9103 gc ; Pe ; Close_Punctuation
9104 gc ; Po ; Other_Punctuation
9105 gc ; Ps ; Open_Punctuation
9106 gc ; S ; Symbol # Sc | Sk | Sm | So
9107 gc ; Sc ; Currency_Symbol
9108 gc ; Sm ; Math_Symbol
9109 gc ; So ; Other_Symbol
9110 gc ; Z ; Separator # Zl | Zp | Zs
9111 gc ; Zl ; Line_Separator
9112 gc ; Zp ; Paragraph_Separator
9113 gc ; Zs ; Space_Separator
9121 if (-e 'ArabicShaping.txt') {
9122 push @return, split /\n/, <<'END';
9129 jg ; n/a ; NO_JOINING_GROUP
9137 jt ; C ; Join_Causing
9138 jt ; D ; Dual_Joining
9139 jt ; L ; Left_Joining
9140 jt ; R ; Right_Joining
9141 jt ; U ; Non_Joining
9142 jt ; T ; Transparent
9144 if ($v_version ge v3.0.0) {
9145 push @return, split /\n/, <<'END';
9149 jg ; n/a ; DALATH_RISH
9152 jg ; n/a ; FINAL_SEMKATH
9155 jg ; n/a ; HAMZA_ON_HEH_GOAL
9162 jg ; n/a ; KNOTTED_HEH
9169 jg ; n/a ; REVERSED_PE
9173 jg ; n/a ; SWASH_KAF
9175 jg ; n/a ; TEH_MARBUTA
9178 jg ; n/a ; YEH_BARREE
9179 jg ; n/a ; YEH_WITH_TAIL
9188 if (-e 'EastAsianWidth.txt') {
9189 push @return, split /\n/, <<'END';
9199 if (-e 'LineBreak.txt') {
9200 push @return, split /\n/, <<'END';
9202 lb ; AL ; Alphabetic
9203 lb ; B2 ; Break_Both
9204 lb ; BA ; Break_After
9205 lb ; BB ; Break_Before
9206 lb ; BK ; Mandatory_Break
9207 lb ; CB ; Contingent_Break
9208 lb ; CL ; Close_Punctuation
9209 lb ; CM ; Combining_Mark
9210 lb ; CR ; Carriage_Return
9211 lb ; EX ; Exclamation
9214 lb ; ID ; Ideographic
9215 lb ; IN ; Inseperable
9216 lb ; IS ; Infix_Numeric
9218 lb ; NS ; Nonstarter
9220 lb ; OP ; Open_Punctuation
9221 lb ; PO ; Postfix_Numeric
9222 lb ; PR ; Prefix_Numeric
9224 lb ; SA ; Complex_Context
9227 lb ; SY ; Break_Symbols
9233 if (-e 'DNormalizationProps.txt') {
9234 push @return, split /\n/, <<'END';
9241 if (-e 'Scripts.txt') {
9242 push @return, split /\n/, <<'END';
9244 sc ; Armn ; Armenian
9246 sc ; Bopo ; Bopomofo
9247 sc ; Cans ; Canadian_Aboriginal
9248 sc ; Cher ; Cherokee
9249 sc ; Cyrl ; Cyrillic
9250 sc ; Deva ; Devanagari
9252 sc ; Ethi ; Ethiopic
9253 sc ; Geor ; Georgian
9256 sc ; Gujr ; Gujarati
9257 sc ; Guru ; Gurmukhi
9261 sc ; Hira ; Hiragana
9262 sc ; Ital ; Old_Italic
9263 sc ; Kana ; Katakana
9268 sc ; Mlym ; Malayalam
9269 sc ; Mong ; Mongolian
9273 sc ; Qaai ; Inherited
9287 if ($v_version ge v2.0.0) {
9288 push @return, split /\n/, <<'END';
9292 dt ; vert ; vertical
9297 gc ; Lt ; Titlecase_Letter
9298 gc ; Me ; Enclosing_Mark
9299 gc ; Nl ; Letter_Number
9300 gc ; Pc ; Connector_Punctuation
9301 gc ; Sk ; Modifier_Symbol
9304 if ($v_version ge v2.1.2) {
9305 push @return, "bc ; S ; Segment_Separator\n";
9307 if ($v_version ge v2.1.5) {
9308 push @return, split /\n/, <<'END';
9309 gc ; Pf ; Final_Punctuation
9310 gc ; Pi ; Initial_Punctuation
9313 if ($v_version ge v2.1.8) {
9314 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9317 if ($v_version ge v3.0.0) {
9318 push @return, split /\n/, <<'END';
9319 bc ; AL ; Arabic_Letter
9320 bc ; BN ; Boundary_Neutral
9321 bc ; LRE ; Left_To_Right_Embedding
9322 bc ; LRO ; Left_To_Right_Override
9323 bc ; NSM ; Nonspacing_Mark
9324 bc ; PDF ; Pop_Directional_Format
9325 bc ; RLE ; Right_To_Left_Embedding
9326 bc ; RLO ; Right_To_Left_Override
9328 ccc; 233; DB ; Double_Below
9332 if ($v_version ge v3.1.0) {
9333 push @return, "ccc; 226; R ; Right\n";
9339 sub output_perl_charnames_line ($$) {
9341 # Output the entries in Perl_charnames specially, using 5 digits instead
9342 # of four. This makes the entries a constant length, and simplifies
9343 # charnames.pm which this table is for. Unicode can have 6 digit
9344 # ordinals, but they are all private use or noncharacters which do not
9345 # have names, so won't be in this table.
9347 return sprintf "%05X\t%s\n", $_[0], $_[1];
9351 # This is used to store the range list of all the code points usable when
9352 # the little used $compare_versions feature is enabled.
9353 my $compare_versions_range_list;
9355 # These are constants to the $property_info hash in this subroutine, to
9356 # avoid using a quoted-string which might have a typo.
9358 my $DEFAULT_MAP = 'default_map';
9359 my $DEFAULT_TABLE = 'default_table';
9360 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9361 my $MISSINGS = 'missings';
9363 sub process_generic_property_file {
9364 # This processes a file containing property mappings and puts them
9365 # into internal map tables. It should be used to handle any property
9366 # files that have mappings from a code point or range thereof to
9367 # something else. This means almost all the UCD .txt files.
9368 # each_line_handlers() should be set to adjust the lines of these
9369 # files, if necessary, to what this routine understands:
9374 # the fields are: "codepoint-range ; property; map"
9376 # meaning the codepoints in the range all have the value 'map' under
9378 # Beginning and trailing white space in each field are not significant.
9379 # Note there is not a trailing semi-colon in the above. A trailing
9380 # semi-colon means the map is a null-string. An omitted map, as
9381 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9382 # table syntax. (This could have been hidden from this routine by
9383 # doing it in the $file object, but that would require parsing of the
9384 # line there, so would have to parse it twice, or change the interface
9385 # to pass this an array. So not done.)
9387 # The map field may begin with a sequence of commands that apply to
9388 # this range. Each such command begins and ends with $CMD_DELIM.
9389 # These are used to indicate, for example, that the mapping for a
9390 # range has a non-default type.
9392 # This loops through the file, calling it's next_line() method, and
9393 # then taking the map and adding it to the property's table.
9394 # Complications arise because any number of properties can be in the
9395 # file, in any order, interspersed in any way. The first time a
9396 # property is seen, it gets information about that property and
9397 # caches it for quick retrieval later. It also normalizes the maps
9398 # so that only one of many synonyms is stored. The Unicode input
9399 # files do use some multiple synonyms.
9402 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9404 my %property_info; # To keep track of what properties
9405 # have already had entries in the
9406 # current file, and info about each,
9407 # so don't have to recompute.
9408 my $property_name; # property currently being worked on
9409 my $property_type; # and its type
9410 my $previous_property_name = ""; # name from last time through loop
9411 my $property_object; # pointer to the current property's
9413 my $property_addr; # the address of that object
9414 my $default_map; # the string that code points missing
9415 # from the file map to
9416 my $default_table; # For non-string properties, a
9417 # reference to the match table that
9418 # will contain the list of code
9419 # points that map to $default_map.
9421 # Get the next real non-comment line
9423 while ($file->next_line) {
9425 # Default replacement type; means that if parts of the range have
9426 # already been stored in our tables, the new map overrides them if
9427 # they differ more than cosmetically
9428 my $replace = $IF_NOT_EQUIVALENT;
9429 my $map_type; # Default type for the map of this range
9431 #local $to_trace = 1 if main::DEBUG;
9432 trace $_ if main::DEBUG && $to_trace;
9434 # Split the line into components
9435 my ($range, $property_name, $map, @remainder)
9436 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9438 # If more or less on the line than we are expecting, warn and skip
9441 $file->carp_bad_line('Extra fields');
9444 elsif ( ! defined $property_name) {
9445 $file->carp_bad_line('Missing property');
9449 # Examine the range.
9450 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9452 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9456 my $high = (defined $2) ? hex $2 : $low;
9458 # For the very specialized case of comparing two Unicode
9460 if (DEBUG && $compare_versions) {
9461 if ($property_name eq 'Age') {
9463 # Only allow code points at least as old as the version
9465 my $age = pack "C*", split(/\./, $map); # v string
9466 next LINE if $age gt $compare_versions;
9470 # Again, we throw out code points younger than those of
9471 # the specified version. By now, the Age property is
9472 # populated. We use the intersection of each input range
9473 # with this property to find what code points in it are
9474 # valid. To do the intersection, we have to convert the
9475 # Age property map to a Range_list. We only have to do
9477 if (! defined $compare_versions_range_list) {
9478 my $age = property_ref('Age');
9479 if (! -e 'DAge.txt') {
9480 croak "Need to have 'DAge.txt' file to do version comparison";
9482 elsif ($age->count == 0) {
9483 croak "The 'Age' table is empty, but its file exists";
9485 $compare_versions_range_list
9486 = Range_List->new(Initialize => $age);
9489 # An undefined map is always 'Y'
9490 $map = 'Y' if ! defined $map;
9492 # Calculate the intersection of the input range with the
9493 # code points that are known in the specified version
9494 my @ranges = ($compare_versions_range_list
9495 & Range->new($low, $high))->ranges;
9497 # If the intersection is empty, throw away this range
9498 next LINE unless @ranges;
9500 # Only examine the first range this time through the loop.
9501 my $this_range = shift @ranges;
9503 # Put any remaining ranges in the queue to be processed
9504 # later. Note that there is unnecessary work here, as we
9505 # will do the intersection again for each of these ranges
9506 # during some future iteration of the LINE loop, but this
9507 # code is not used in production. The later intersections
9508 # are guaranteed to not splinter, so this will not become
9510 my $line = join ';', $property_name, $map;
9511 foreach my $range (@ranges) {
9512 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9518 # And process the first range, like any other.
9519 $low = $this_range->start;
9520 $high = $this_range->end;
9522 } # End of $compare_versions
9524 # If changing to a new property, get the things constant per
9526 if ($previous_property_name ne $property_name) {
9528 $property_object = property_ref($property_name);
9529 if (! defined $property_object) {
9530 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9533 { no overloading; $property_addr = pack 'J', $property_object; }
9535 # Defer changing names until have a line that is acceptable
9536 # (the 'next' statement above means is unacceptable)
9537 $previous_property_name = $property_name;
9539 # If not the first time for this property, retrieve info about
9541 if (defined ($property_info{$property_addr}{$TYPE})) {
9542 $property_type = $property_info{$property_addr}{$TYPE};
9543 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
9545 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
9547 = $property_info{$property_addr}{$DEFAULT_TABLE};
9551 # Here, is the first time for this property. Set up the
9553 $property_type = $property_info{$property_addr}{$TYPE}
9554 = $property_object->type;
9556 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
9557 = $property_object->pseudo_map_type;
9559 # The Unicode files are set up so that if the map is not
9560 # defined, it is a binary property
9561 if (! defined $map && $property_type != $BINARY) {
9562 if ($property_type != $UNKNOWN
9563 && $property_type != $NON_STRING)
9565 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9568 $property_object->set_type($BINARY);
9570 = $property_info{$property_addr}{$TYPE}
9575 # Get any @missings default for this property. This
9576 # should precede the first entry for the property in the
9577 # input file, and is located in a comment that has been
9578 # stored by the Input_file class until we access it here.
9579 # It's possible that there is more than one such line
9580 # waiting for us; collect them all, and parse
9581 my @missings_list = $file->get_missings
9582 if $file->has_missings_defaults;
9583 foreach my $default_ref (@missings_list) {
9584 my $default = $default_ref->[0];
9585 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9587 # For string properties, the default is just what the
9588 # file says, but non-string properties should already
9589 # have set up a table for the default property value;
9590 # use the table for these, so can resolve synonyms
9591 # later to a single standard one.
9592 if ($property_type == $STRING
9593 || $property_type == $UNKNOWN)
9595 $property_info{$addr}{$MISSINGS} = $default;
9598 $property_info{$addr}{$MISSINGS}
9599 = $property_object->table($default);
9603 # Finished storing all the @missings defaults in the input
9604 # file so far. Get the one for the current property.
9605 my $missings = $property_info{$property_addr}{$MISSINGS};
9607 # But we likely have separately stored what the default
9608 # should be. (This is to accommodate versions of the
9609 # standard where the @missings lines are absent or
9610 # incomplete.) Hopefully the two will match. But check
9612 $default_map = $property_object->default_map;
9614 # If the map is a ref, it means that the default won't be
9615 # processed until later, so undef it, so next few lines
9616 # will redefine it to something that nothing will match
9617 undef $default_map if ref $default_map;
9619 # Create a $default_map if don't have one; maybe a dummy
9620 # that won't match anything.
9621 if (! defined $default_map) {
9623 # Use any @missings line in the file.
9624 if (defined $missings) {
9625 if (ref $missings) {
9626 $default_map = $missings->full_name;
9627 $default_table = $missings;
9630 $default_map = $missings;
9633 # And store it with the property for outside use.
9634 $property_object->set_default_map($default_map);
9638 # Neither an @missings nor a default map. Create
9639 # a dummy one, so won't have to test definedness
9641 $default_map = '_Perl This will never be in a file
9646 # Here, we have $default_map defined, possibly in terms of
9647 # $missings, but maybe not, and possibly is a dummy one.
9648 if (defined $missings) {
9650 # Make sure there is no conflict between the two.
9651 # $missings has priority.
9652 if (ref $missings) {
9654 = $property_object->table($default_map);
9655 if (! defined $default_table
9656 || $default_table != $missings)
9658 if (! defined $default_table) {
9659 $default_table = $UNDEF;
9661 $file->carp_bad_line(<<END
9662 The \@missings line for $property_name in $file says that missings default to
9663 $missings, but we expect it to be $default_table. $missings used.
9666 $default_table = $missings;
9667 $default_map = $missings->full_name;
9669 $property_info{$property_addr}{$DEFAULT_TABLE}
9672 elsif ($default_map ne $missings) {
9673 $file->carp_bad_line(<<END
9674 The \@missings line for $property_name in $file says that missings default to
9675 $missings, but we expect it to be $default_map. $missings used.
9678 $default_map = $missings;
9682 $property_info{$property_addr}{$DEFAULT_MAP}
9685 # If haven't done so already, find the table corresponding
9686 # to this map for non-string properties.
9687 if (! defined $default_table
9688 && $property_type != $STRING
9689 && $property_type != $UNKNOWN)
9691 $default_table = $property_info{$property_addr}
9693 = $property_object->table($default_map);
9695 } # End of is first time for this property
9696 } # End of switching properties.
9698 # Ready to process the line.
9699 # The Unicode files are set up so that if the map is not defined,
9700 # it is a binary property with value 'Y'
9701 if (! defined $map) {
9706 # If the map begins with a special command to us (enclosed in
9707 # delimiters), extract the command(s).
9708 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9710 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9713 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9717 $file->carp_bad_line("Unknown command line: '$1'");
9723 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9726 # Here, we have a map to a particular code point, and the
9727 # default map is to a code point itself. If the range
9728 # includes the particular code point, change that portion of
9729 # the range to the default. This makes sure that in the final
9730 # table only the non-defaults are listed.
9731 my $decimal_map = hex $map;
9732 if ($low <= $decimal_map && $decimal_map <= $high) {
9734 # If the range includes stuff before or after the map
9735 # we're changing, split it and process the split-off parts
9737 if ($low < $decimal_map) {
9738 $file->insert_adjusted_lines(
9739 sprintf("%04X..%04X; %s; %s",
9745 if ($high > $decimal_map) {
9746 $file->insert_adjusted_lines(
9747 sprintf("%04X..%04X; %s; %s",
9753 $low = $high = $decimal_map;
9758 # If we can tell that this is a synonym for the default map, use
9759 # the default one instead.
9760 if ($property_type != $STRING
9761 && $property_type != $UNKNOWN)
9763 my $table = $property_object->table($map);
9764 if (defined $table && $table == $default_table) {
9765 $map = $default_map;
9769 # And figure out the map type if not known.
9770 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9771 if ($map eq "") { # Nulls are always $NULL map type
9773 } # Otherwise, non-strings, and those that don't allow
9774 # $MULTI_CP, and those that aren't multiple code points are
9777 (($property_type != $STRING && $property_type != $UNKNOWN)
9778 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9779 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9784 $map_type = $MULTI_CP;
9788 $property_object->add_map($low, $high,
9791 Replace => $replace);
9792 } # End of loop through file's lines
9798 { # Closure for UnicodeData.txt handling
9800 # This file was the first one in the UCD; its design leads to some
9801 # awkwardness in processing. Here is a sample line:
9802 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9803 # The fields in order are:
9804 my $i = 0; # The code point is in field 0, and is shifted off.
9805 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
9806 my $CATEGORY = $i++; # category (e.g. "Lu")
9807 my $CCC = $i++; # Canonical combining class (e.g. "230")
9808 my $BIDI = $i++; # directional class (e.g. "L")
9809 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9810 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9811 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9812 # Dual-use in this program; see below
9813 my $NUMERIC = $i++; # numeric value
9814 my $MIRRORED = $i++; # ? mirrored
9815 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9816 my $COMMENT = $i++; # iso comment
9817 my $UPPER = $i++; # simple uppercase mapping
9818 my $LOWER = $i++; # simple lowercase mapping
9819 my $TITLE = $i++; # simple titlecase mapping
9820 my $input_field_count = $i;
9822 # This routine in addition outputs these extra fields:
9823 my $DECOMP_TYPE = $i++; # Decomposition type
9825 # These fields are modifications of ones above, and are usually
9826 # suppressed; they must come last, as for speed, the loop upper bound is
9827 # normally set to ignore them
9828 my $NAME = $i++; # This is the strict name field, not the one that
9830 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9831 # by Unicode::Normalize
9832 my $last_field = $i - 1;
9834 # All these are read into an array for each line, with the indices defined
9835 # above. The empty fields in the example line above indicate that the
9836 # value is defaulted. The handler called for each line of the input
9837 # changes these to their defaults.
9839 # Here are the official names of the properties, in a parallel array:
9841 $field_names[$BIDI] = 'Bidi_Class';
9842 $field_names[$CATEGORY] = 'General_Category';
9843 $field_names[$CCC] = 'Canonical_Combining_Class';
9844 $field_names[$CHARNAME] = 'Perl_Charnames';
9845 $field_names[$COMMENT] = 'ISO_Comment';
9846 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9847 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9848 $field_names[$LOWER] = 'Lowercase_Mapping';
9849 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9850 $field_names[$NAME] = 'Name';
9851 $field_names[$NUMERIC] = 'Numeric_Value';
9852 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9853 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9854 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9855 $field_names[$TITLE] = 'Titlecase_Mapping';
9856 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9857 $field_names[$UPPER] = 'Uppercase_Mapping';
9859 # Some of these need a little more explanation:
9860 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9861 # property, but is used in calculating the Numeric_Type. Perl however,
9862 # creates a file from this field, so a Perl property is created from it.
9863 # Similarly, the Other_Digit field is used only for calculating the
9864 # Numeric_Type, and so it can be safely re-used as the place to store
9865 # the value for Numeric_Type; hence it is referred to as
9866 # $NUMERIC_TYPE_OTHER_DIGIT.
9867 # The input field named $PERL_DECOMPOSITION is a combination of both the
9868 # decomposition mapping and its type. Perl creates a file containing
9869 # exactly this field, so it is used for that. The two properties are
9870 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9871 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9872 # output it), as Perl doesn't use it directly.
9873 # The input field named here $CHARNAME is used to construct the
9874 # Perl_Charnames property, which is a combination of the Name property
9875 # (which the input field contains), and the Unicode_1_Name property, and
9876 # others from other files. Since, the strict Name property is not used
9877 # by Perl, this field is used for the table that Perl does use. The
9878 # strict Name property table is usually suppressed (unless the lists are
9879 # changed to output it), so it is accumulated in a separate field,
9880 # $NAME, which to save time is discarded unless the table is actually to
9883 # This file is processed like most in this program. Control is passed to
9884 # process_generic_property_file() which calls filter_UnicodeData_line()
9885 # for each input line. This filter converts the input into line(s) that
9886 # process_generic_property_file() understands. There is also a setup
9887 # routine called before any of the file is processed, and a handler for
9888 # EOF processing, all in this closure.
9890 # A huge speed-up occurred at the cost of some added complexity when these
9891 # routines were altered to buffer the outputs into ranges. Almost all the
9892 # lines of the input file apply to just one code point, and for most
9893 # properties, the map for the next code point up is the same as the
9894 # current one. So instead of creating a line for each property for each
9895 # input line, filter_UnicodeData_line() remembers what the previous map
9896 # of a property was, and doesn't generate a line to pass on until it has
9897 # to, as when the map changes; and that passed-on line encompasses the
9898 # whole contiguous range of code points that have the same map for that
9899 # property. This means a slight amount of extra setup, and having to
9900 # flush these buffers on EOF, testing if the maps have changed, plus
9901 # remembering state information in the closure. But it means a lot less
9902 # real time in not having to change the data base for each property on
9905 # Another complication is that there are already a few ranges designated
9906 # in the input. There are two lines for each, with the same maps except
9907 # the code point and name on each line. This was actually the hardest
9908 # thing to design around. The code points in those ranges may actually
9909 # have real maps not given by these two lines. These maps will either
9910 # be algorithmically determinable, or be in the extracted files furnished
9911 # with the UCD. In the event of conflicts between these extracted files,
9912 # and this one, Unicode says that this one prevails. But it shouldn't
9913 # prevail for conflicts that occur in these ranges. The data from the
9914 # extracted files prevails in those cases. So, this program is structured
9915 # so that those files are processed first, storing maps. Then the other
9916 # files are processed, generally overwriting what the extracted files
9917 # stored. But just the range lines in this input file are processed
9918 # without overwriting. This is accomplished by adding a special string to
9919 # the lines output to tell process_generic_property_file() to turn off the
9920 # overwriting for just this one line.
9921 # A similar mechanism is used to tell it that the map is of a non-default
9924 sub setup_UnicodeData { # Called before any lines of the input are read
9926 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9928 # Create a new property specially located that is a combination of the
9929 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9930 # Name_Alias properties. (The final duplicates elements of the
9931 # first.) A comment for it will later be constructed based on the
9932 # actual properties present and used
9933 $perl_charname = Property->new('Perl_Charnames',
9935 Directory => File::Spec->curdir(),
9937 Fate => $INTERNAL_ONLY,
9938 Perl_Extension => 1,
9939 Range_Size_1 => \&output_perl_charnames_line,
9942 $perl_charname->set_proxy_for('Name', 'Name_Alias');
9944 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9945 Directory => File::Spec->curdir(),
9946 File => 'Decomposition',
9947 Format => $DECOMP_STRING_FORMAT,
9948 Fate => $INTERNAL_ONLY,
9949 Perl_Extension => 1,
9950 Default_Map => $CODE_POINT,
9952 # normalize.pm can't cope with these
9953 Output_Range_Counts => 0,
9955 # This is a specially formatted table
9956 # explicitly for normalize.pm, which
9957 # is expecting a particular format,
9958 # which means that mappings containing
9959 # multiple code points are in the main
9961 Map_Type => $COMPUTE_NO_MULTI_CP,
9964 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
9965 $Perl_decomp->add_comment(join_lines(<<END
9966 This mapping is a combination of the Unicode 'Decomposition_Type' and
9967 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9968 identical to the official Unicode 'Decomposition_Mapping' property except for
9970 1) It omits the algorithmically determinable Hangul syllable decompositions,
9971 which normalize.pm handles algorithmically.
9972 2) It contains the decomposition type as well. Non-canonical decompositions
9973 begin with a word in angle brackets, like <super>, which denotes the
9974 compatible decomposition type. If the map does not begin with the <angle
9975 brackets>, the decomposition is canonical.
9979 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9981 Perl_Extension => 1,
9982 File => 'Digit', # Trad. location
9983 Directory => $map_directory,
9987 $Decimal_Digit->add_comment(join_lines(<<END
9988 This file gives the mapping of all code points which represent a single
9989 decimal digit [0-9] to their respective digits. For example, the code point
9990 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9991 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9996 # These properties are not used for generating anything else, and are
9997 # usually not output. By making them last in the list, we can just
9998 # change the high end of the loop downwards to avoid the work of
9999 # generating a table(s) that is/are just going to get thrown away.
10000 if (! property_ref('Decomposition_Mapping')->to_output_map
10001 && ! property_ref('Name')->to_output_map)
10003 $last_field = min($NAME, $DECOMP_MAP) - 1;
10004 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10005 $last_field = $DECOMP_MAP;
10006 } elsif (property_ref('Name')->to_output_map) {
10007 $last_field = $NAME;
10012 my $first_time = 1; # ? Is this the first line of the file
10013 my $in_range = 0; # ? Are we in one of the file's ranges
10014 my $previous_cp; # hex code point of previous line
10015 my $decimal_previous_cp = -1; # And its decimal equivalent
10016 my @start; # For each field, the current starting
10017 # code point in hex for the range
10018 # being accumulated.
10019 my @fields; # The input fields;
10020 my @previous_fields; # And those from the previous call
10022 sub filter_UnicodeData_line {
10023 # Handle a single input line from UnicodeData.txt; see comments above
10024 # Conceptually this takes a single line from the file containing N
10025 # properties, and converts it into N lines with one property per line,
10026 # which is what the final handler expects. But there are
10027 # complications due to the quirkiness of the input file, and to save
10028 # time, it accumulates ranges where the property values don't change
10029 # and only emits lines when necessary. This is about an order of
10030 # magnitude fewer lines emitted.
10033 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10035 # $_ contains the input line.
10036 # -1 in split means retain trailing null fields
10037 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10039 #local $to_trace = 1 if main::DEBUG;
10040 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10041 if (@fields > $input_field_count) {
10042 $file->carp_bad_line('Extra fields');
10047 my $decimal_cp = hex $cp;
10049 # We have to output all the buffered ranges when the next code point
10050 # is not exactly one after the previous one, which means there is a
10051 # gap in the ranges.
10052 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10054 # The decomposition mapping field requires special handling. It looks
10057 # <compat> 0032 0020
10060 # The decomposition type is enclosed in <brackets>; if missing, it
10061 # means the type is canonical. There are two decomposition mapping
10062 # tables: the one for use by Perl's normalize.pm has a special format
10063 # which is this field intact; the other, for general use is of
10064 # standard format. In either case we have to find the decomposition
10065 # type. Empty fields have None as their type, and map to the code
10067 if ($fields[$PERL_DECOMPOSITION] eq "") {
10068 $fields[$DECOMP_TYPE] = 'None';
10069 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10072 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10073 =~ / < ( .+? ) > \s* ( .+ ) /x;
10074 if (! defined $fields[$DECOMP_TYPE]) {
10075 $fields[$DECOMP_TYPE] = 'Canonical';
10076 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10079 $fields[$DECOMP_MAP] = $map;
10083 # The 3 numeric fields also require special handling. The 2 digit
10084 # fields must be either empty or match the number field. This means
10085 # that if it is empty, they must be as well, and the numeric type is
10086 # None, and the numeric value is 'Nan'.
10087 # The decimal digit field must be empty or match the other digit
10088 # field. If the decimal digit field is non-empty, the code point is
10089 # a decimal digit, and the other two fields will have the same value.
10090 # If it is empty, but the other digit field is non-empty, the code
10091 # point is an 'other digit', and the number field will have the same
10092 # value as the other digit field. If the other digit field is empty,
10093 # but the number field is non-empty, the code point is a generic
10095 if ($fields[$NUMERIC] eq "") {
10096 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10097 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10099 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10101 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10102 $fields[$NUMERIC] = 'NaN';
10105 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10106 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10107 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10108 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10110 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10111 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10112 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10115 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10117 # Rationals require extra effort.
10118 register_fraction($fields[$NUMERIC])
10119 if $fields[$NUMERIC] =~ qr{/};
10123 # For the properties that have empty fields in the file, and which
10124 # mean something different from empty, change them to that default.
10125 # Certain fields just haven't been empty so far in any Unicode
10126 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10127 # $CATEGORY. This leaves just the two fields, and so we hard-code in
10128 # the defaults; which are very unlikely to ever change.
10129 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10130 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10132 # UAX44 says that if title is empty, it is the same as whatever upper
10134 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10136 # There are a few pairs of lines like:
10137 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10138 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10139 # that define ranges. These should be processed after the fields are
10140 # adjusted above, as they may override some of them; but mostly what
10141 # is left is to possibly adjust the $CHARNAME field. The names of all the
10142 # paired lines start with a '<', but this is also true of '<control>,
10143 # which isn't one of these special ones.
10144 if ($fields[$CHARNAME] eq '<control>') {
10146 # Some code points in this file have the pseudo-name
10147 # '<control>', but the official name for such ones is the null
10148 # string. For charnames.pm, we use the Unicode version 1 name
10149 $fields[$NAME] = "";
10150 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
10152 # We had better not be in between range lines.
10154 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10158 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10160 # Here is a non-range line. We had better not be in between range
10163 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10166 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10168 # These are code points whose names end in their code points,
10169 # which means the names are algorithmically derivable from the
10170 # code points. To shorten the output Name file, the algorithm
10171 # for deriving these is placed in the file instead of each
10172 # code point, so they have map type $CP_IN_NAME
10173 $fields[$CHARNAME] = $CMD_DELIM
10178 . $fields[$CHARNAME];
10180 $fields[$NAME] = $fields[$CHARNAME];
10182 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10183 $fields[$CHARNAME] = $fields[$NAME] = $1;
10185 # Here we are at the beginning of a range pair.
10187 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
10191 # Because the properties in the range do not overwrite any already
10192 # in the db, we must flush the buffers of what's already there, so
10193 # they get handled in the normal scheme.
10197 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10198 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
10202 else { # Here, we are at the last line of a range pair.
10205 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
10211 $fields[$NAME] = $fields[$CHARNAME];
10213 # Check that the input is valid: that the closing of the range is
10214 # the same as the beginning.
10215 foreach my $i (0 .. $last_field) {
10216 next if $fields[$i] eq $previous_fields[$i];
10217 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10220 # The processing differs depending on the type of range,
10221 # determined by its $CHARNAME
10222 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10224 # Check that the data looks right.
10225 if ($decimal_previous_cp != $SBase) {
10226 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10228 if ($decimal_cp != $SBase + $SCount - 1) {
10229 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10232 # The Hangul syllable range has a somewhat complicated name
10233 # generation algorithm. Each code point in it has a canonical
10234 # decomposition also computable by an algorithm. The
10235 # perl decomposition map table built from these is used only
10236 # by normalize.pm, which has the algorithm built in it, so the
10237 # decomposition maps are not needed, and are large, so are
10238 # omitted from it. If the full decomposition map table is to
10239 # be output, the decompositions are generated for it, in the
10240 # EOF handling code for this input file.
10242 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10244 # This range is stored in our internal structure with its
10245 # own map type, different from all others.
10246 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10252 . $fields[$CHARNAME];
10254 elsif ($fields[$CHARNAME] =~ /^CJK/) {
10256 # The name for these contains the code point itself, and all
10257 # are defined to have the same base name, regardless of what
10258 # is in the file. They are stored in our internal structure
10259 # with a map type of $CP_IN_NAME
10260 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10266 . 'CJK UNIFIED IDEOGRAPH';
10269 elsif ($fields[$CATEGORY] eq 'Co'
10270 || $fields[$CATEGORY] eq 'Cs')
10272 # The names of all the code points in these ranges are set to
10273 # null, as there are no names for the private use and
10274 # surrogate code points.
10276 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10279 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
10282 # The first line of the range caused everything else to be output,
10283 # and then its values were stored as the beginning values for the
10284 # next set of ranges, which this one ends. Now, for each value,
10285 # add a command to tell the handler that these values should not
10286 # replace any existing ones in our database.
10287 foreach my $i (0 .. $last_field) {
10288 $previous_fields[$i] = $CMD_DELIM
10293 . $previous_fields[$i];
10296 # And change things so it looks like the entire range has been
10297 # gone through with this being the final part of it. Adding the
10298 # command above to each field will cause this range to be flushed
10299 # during the next iteration, as it guaranteed that the stored
10300 # field won't match whatever value the next one has.
10301 $previous_cp = $cp;
10302 $decimal_previous_cp = $decimal_cp;
10304 # We are now set up for the next iteration; so skip the remaining
10305 # code in this subroutine that does the same thing, but doesn't
10306 # know about these ranges.
10312 # On the very first line, we fake it so the code below thinks there is
10313 # nothing to output, and initialize so that when it does get output it
10314 # uses the first line's values for the lowest part of the range.
10315 # (One could avoid this by using peek(), but then one would need to
10316 # know the adjustments done above and do the same ones in the setup
10317 # routine; not worth it)
10320 @previous_fields = @fields;
10321 @start = ($cp) x scalar @fields;
10322 $decimal_previous_cp = $decimal_cp - 1;
10325 # For each field, output the stored up ranges that this code point
10326 # doesn't fit in. Earlier we figured out if all ranges should be
10327 # terminated because of changing the replace or map type styles, or if
10328 # there is a gap between this new code point and the previous one, and
10329 # that is stored in $force_output. But even if those aren't true, we
10330 # need to output the range if this new code point's value for the
10331 # given property doesn't match the stored range's.
10332 #local $to_trace = 1 if main::DEBUG;
10333 foreach my $i (0 .. $last_field) {
10334 my $field = $fields[$i];
10335 if ($force_output || $field ne $previous_fields[$i]) {
10337 # Flush the buffer of stored values.
10338 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10340 # Start a new range with this code point and its value
10342 $previous_fields[$i] = $field;
10346 # Set the values for the next time.
10347 $previous_cp = $cp;
10348 $decimal_previous_cp = $decimal_cp;
10350 # The input line has generated whatever adjusted lines are needed, and
10351 # should not be looked at further.
10356 sub EOF_UnicodeData {
10357 # Called upon EOF to flush the buffers, and create the Hangul
10358 # decomposition mappings if needed.
10361 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10363 # Flush the buffers.
10364 foreach my $i (1 .. $last_field) {
10365 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10368 if (-e 'Jamo.txt') {
10370 # The algorithm is published by Unicode, based on values in
10371 # Jamo.txt, (which should have been processed before this
10372 # subroutine), and the results left in %Jamo
10374 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10378 # If the full decomposition map table is being output, insert
10379 # into it the Hangul syllable mappings. This is to avoid having
10380 # to publish a subroutine in it to compute them. (which would
10381 # essentially be this code.) This uses the algorithm published by
10383 if (property_ref('Decomposition_Mapping')->to_output_map) {
10384 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10386 my $SIndex = $S - $SBase;
10387 my $L = $LBase + $SIndex / $NCount;
10388 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10389 my $T = $TBase + $SIndex % $TCount;
10391 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10392 my $decomposition = sprintf("%04X %04X", $L, $V);
10393 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10394 $file->insert_adjusted_lines(
10395 sprintf("%04X; Decomposition_Mapping; %s",
10405 sub filter_v1_ucd {
10406 # Fix UCD lines in version 1. This is probably overkill, but this
10407 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10408 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10409 # removed. This program retains them
10410 # 2) didn't include ranges, which it should have, and which are now
10411 # added in @corrected_lines below. It was hand populated by
10412 # taking the data from Version 2, verified by analyzing
10414 # 3) There is a syntax error in the entry for U+09F8 which could
10415 # cause problems for utf8_heavy, and so is changed. It's
10416 # numeric value was simply a minus sign, without any number.
10417 # (Eventually Unicode changed the code point to non-numeric.)
10418 # 4) The decomposition types often don't match later versions
10419 # exactly, and the whole syntax of that field is different; so
10420 # the syntax is changed as well as the types to their later
10421 # terminology. Otherwise normalize.pm would be very unhappy
10422 # 5) Many ccc classes are different. These are left intact.
10423 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10424 # fields. These are unchanged because it doesn't really cause
10425 # problems for Perl.
10426 # 7) A number of code points, such as controls, don't have their
10427 # Unicode Version 1 Names in this file. These are unchanged.
10429 my @corrected_lines = split /\n/, <<'END';
10430 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10431 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10432 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10433 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10434 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10435 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10439 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10441 #local $to_trace = 1 if main::DEBUG;
10442 trace $_ if main::DEBUG && $to_trace;
10444 # -1 => retain trailing null fields
10445 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10447 # At the first place that is wrong in the input, insert all the
10448 # corrections, replacing the wrong line.
10449 if ($code_point eq '4E00') {
10450 my @copy = @corrected_lines;
10452 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10454 $file->insert_lines(@copy);
10458 if ($fields[$NUMERIC] eq '-') {
10459 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10462 if ($fields[$PERL_DECOMPOSITION] ne "") {
10464 # Several entries have this change to superscript 2 or 3 in the
10465 # middle. Convert these to the modern version, which is to use
10466 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10467 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10468 # 'HHHH HHHH 00B3 HHHH'.
10469 # It turns out that all of these that don't have another
10470 # decomposition defined at the beginning of the line have the
10471 # <square> decomposition in later releases.
10472 if ($code_point ne '00B2' && $code_point ne '00B3') {
10473 if ($fields[$PERL_DECOMPOSITION]
10474 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10476 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10477 $fields[$PERL_DECOMPOSITION] = '<square> '
10478 . $fields[$PERL_DECOMPOSITION];
10483 # If is like '<+circled> 0052 <-circled>', convert to
10485 $fields[$PERL_DECOMPOSITION] =~
10486 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10488 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10489 $fields[$PERL_DECOMPOSITION] =~
10490 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10491 or $fields[$PERL_DECOMPOSITION] =~
10492 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10493 or $fields[$PERL_DECOMPOSITION] =~
10494 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10495 or $fields[$PERL_DECOMPOSITION] =~
10496 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10498 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10499 $fields[$PERL_DECOMPOSITION] =~
10500 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10502 # Change names to modern form.
10503 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10504 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10505 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10506 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10508 # One entry has weird braces
10509 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10512 $_ = join ';', $code_point, @fields;
10513 trace $_ if main::DEBUG && $to_trace;
10517 sub filter_v2_1_5_ucd {
10518 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10519 # columns swapped; These all had mirrored be 'N'. So if the numeric
10520 # column appears to be N, swap it back.
10522 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10523 if ($fields[$NUMERIC] eq 'N') {
10524 $fields[$NUMERIC] = $fields[$MIRRORED];
10525 $fields[$MIRRORED] = 'N';
10526 $_ = join ';', $code_point, @fields;
10531 sub filter_v6_ucd {
10533 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10534 # accepted that yet to allow for some deprecation cycles.
10536 return if $_ !~ /^(?:0007|1F514|070F);/;
10538 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10539 if ($code_point eq '0007') {
10540 $fields[$CHARNAME] = "";
10542 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10543 # http://www.unicode.org/versions/corrigendum8.html
10544 $fields[$BIDI] = "AL";
10546 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
10547 $fields[$CHARNAME] = "";
10550 $_ = join ';', $code_point, @fields;
10554 } # End closure for UnicodeData
10556 sub process_GCB_test {
10559 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10561 while ($file->next_line) {
10562 push @backslash_X_tests, $_;
10568 sub process_NamedSequences {
10569 # NamedSequences.txt entries are just added to an array. Because these
10570 # don't look like the other tables, they have their own handler.
10572 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10574 # This just adds the sequence to an array for later handling
10577 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10579 while ($file->next_line) {
10580 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10582 $file->carp_bad_line(
10583 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10587 # Note single \t in keeping with special output format of
10588 # Perl_charnames. But it turns out that the code points don't have to
10589 # be 5 digits long, like the rest, based on the internal workings of
10590 # charnames.pm. This could be easily changed for consistency.
10591 push @named_sequences, "$sequence\t$name";
10600 sub filter_early_ea_lb {
10601 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10602 # third field be the name of the code point, which can be ignored in
10603 # most cases. But it can be meaningful if it marks a range:
10604 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10605 # 3400;W;<CJK Ideograph Extension A, First>
10607 # We need to see the First in the example above to know it's a range.
10608 # They did not use the later range syntaxes. This routine changes it
10609 # to use the modern syntax.
10610 # $1 is the Input_file object.
10612 my @fields = split /\s*;\s*/;
10613 if ($fields[2] =~ /^<.*, First>/) {
10614 $first_range = $fields[0];
10617 elsif ($fields[2] =~ /^<.*, Last>/) {
10618 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10621 undef $first_range;
10622 $_ = "$fields[0]; $fields[1]";
10629 sub filter_old_style_arabic_shaping {
10630 # Early versions used a different term for the later one.
10632 my @fields = split /\s*;\s*/;
10633 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10634 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10635 $_ = join ';', @fields;
10639 sub filter_arabic_shaping_line {
10640 # ArabicShaping.txt has entries that look like:
10641 # 062A; TEH; D; BEH
10642 # The field containing 'TEH' is not used. The next field is Joining_Type
10643 # and the last is Joining_Group
10644 # This generates two lines to pass on, one for each property on the input
10648 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10650 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10653 $file->carp_bad_line('Extra fields');
10658 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10659 $_ = "$fields[0]; Joining_Type; $fields[2]";
10665 my $lc; # Table for lowercase mapping
10669 sub setup_special_casing {
10670 # SpecialCasing.txt contains the non-simple case change mappings. The
10671 # simple ones are in UnicodeData.txt, which should already have been
10672 # read in to the full property data structures, so as to initialize
10673 # these with the simple ones. Then the SpecialCasing.txt entries
10674 # overwrite the ones which have different full mappings.
10676 # This routine sees if the simple mappings are to be output, and if
10677 # so, copies what has already been put into the full mapping tables,
10678 # while they still contain only the simple mappings.
10680 # The reason it is done this way is that the simple mappings are
10681 # probably not going to be output, so it saves work to initialize the
10682 # full tables with the simple mappings, and then overwrite those
10683 # relatively few entries in them that have different full mappings,
10684 # and thus skip the simple mapping tables altogether.
10686 # New tables with just the simple mappings that are overridden by the
10687 # full ones are constructed. These are for Unicode::UCD, which
10688 # requires the simple mappings. The Case_Folding table is a combined
10689 # table of both the simple and full mappings, with the full ones being
10690 # in the hash, and the simple ones, even those overridden by the hash,
10691 # being in the base table. That same mechanism could have been
10692 # employed here, except that the docs have said that the generated
10693 # files are usuable directly by programs, so we dare not change the
10694 # format in any way.
10697 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10699 $lc = property_ref('lc');
10700 $tc = property_ref('tc');
10701 $uc = property_ref('uc');
10703 # For each of the case change mappings...
10704 foreach my $case_table ($lc, $tc, $uc) {
10705 my $case = $case_table->name;
10706 my $full = property_ref($case);
10707 unless (defined $full && ! $full->is_empty) {
10708 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10711 # The simple version's name in each mapping merely has an 's' in
10712 # front of the full one's
10713 my $simple_name = 's' . $case;
10714 my $simple = property_ref($simple_name);
10715 $simple->initialize($full) if $simple->to_output_map();
10717 my $simple_only = Property->new("_s$case",
10719 Default_Map => $CODE_POINT,
10720 Perl_Extension => 1,
10721 Fate => $INTERNAL_ONLY,
10722 Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
10723 $simple_only->set_to_output_map($INTERNAL_MAP);
10724 $simple_only->add_comment(join_lines( <<END
10725 This file is for UCD.pm so that it can construct simple mappings that would
10726 otherwise be lost because they are overridden by full mappings.
10730 unless ($simple->to_output_map()) {
10731 $simple_only->set_proxy_for($simple_name);
10738 sub filter_special_casing_line {
10739 # Change the format of $_ from SpecialCasing.txt into something that
10740 # the generic handler understands. Each input line contains three
10741 # case mappings. This will generate three lines to pass to the
10742 # generic handler for each of those.
10744 # The input syntax (after stripping comments and trailing white space
10745 # is like one of the following (with the final two being entries that
10747 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10748 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10749 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10750 # Note the trailing semi-colon, unlike many of the input files. That
10751 # means that there will be an extra null field generated by the split
10754 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10756 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10759 # field #4 is when this mapping is conditional. If any of these get
10760 # implemented, it would be by hard-coding in the casing functions in
10761 # the Perl core, not through tables. But if there is a new condition
10762 # we don't know about, output a warning. We know about all the
10763 # conditions through 6.0
10764 if ($fields[4] ne "") {
10765 my @conditions = split ' ', $fields[4];
10766 if ($conditions[0] ne 'tr' # We know that these languages have
10767 # conditions, and some are multiple
10768 && $conditions[0] ne 'az'
10769 && $conditions[0] ne 'lt'
10771 # And, we know about a single condition Final_Sigma, but
10773 && ($v_version gt v5.2.0
10774 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10776 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10778 elsif ($conditions[0] ne 'Final_Sigma') {
10780 # Don't print out a message for Final_Sigma, because we
10781 # have hard-coded handling for it. (But the standard
10782 # could change what the rule should be, but it wouldn't
10783 # show up here anyway.
10785 print "# SKIPPING Special Casing: $_\n"
10786 if $verbosity >= $VERBOSE;
10791 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10792 $file->carp_bad_line('Extra fields');
10797 $_ = "$fields[0]; lc; $fields[1]";
10798 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10799 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
10801 # Copy any simple case change to the special tables constructed if
10802 # being overridden by a multi-character case change.
10803 if ($fields[1] ne $fields[0]
10804 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10806 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10808 if ($fields[2] ne $fields[0]
10809 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10811 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10813 if ($fields[3] ne $fields[0]
10814 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10816 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10823 sub filter_old_style_case_folding {
10824 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
10825 # and later style. Different letters were used in the earlier.
10828 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10830 my @fields = split /\s*;\s*/;
10831 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10834 elsif ($fields[1] eq 'L') {
10835 $fields[1] = 'C'; # L => C always
10837 elsif ($fields[1] eq 'E') {
10838 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10846 $file->carp_bad_line("Expecting L or E in second field");
10850 $_ = join("; ", @fields) . ';';
10854 { # Closure for case folding
10856 # Create the map for simple only if are going to output it, for otherwise
10857 # it takes no part in anything we do.
10858 my $to_output_simple;
10860 sub setup_case_folding($) {
10861 # Read in the case foldings in CaseFolding.txt. This handles both
10862 # simple and full case folding.
10865 = property_ref('Simple_Case_Folding')->to_output_map;
10867 if (! $to_output_simple) {
10868 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
10871 # If we ever wanted to show that these tables were combined, a new
10872 # property method could be created, like set_combined_props()
10873 property_ref('Case_Folding')->add_comment(join_lines( <<END
10874 This file includes both the simple and full case folding maps. The simple
10875 ones are in the main body of the table below, and the full ones adding to or
10876 overriding them are in the hash.
10882 sub filter_case_folding_line {
10883 # Called for each line in CaseFolding.txt
10884 # Input lines look like:
10885 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10886 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10887 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10889 # 'C' means that folding is the same for both simple and full
10890 # 'F' that it is only for full folding
10891 # 'S' that it is only for simple folding
10892 # 'T' is locale-dependent, and ignored
10893 # 'I' is a type of 'F' used in some early releases.
10894 # Note the trailing semi-colon, unlike many of the input files. That
10895 # means that there will be an extra null field generated by the split
10896 # below, which we ignore and hence is not an error.
10899 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10901 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10902 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10903 $file->carp_bad_line('Extra fields');
10908 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10913 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10914 # I are all full foldings; S is single-char. For S, there is always
10915 # an F entry, so we must allow multiple values for the same code
10916 # point. Fortunately this table doesn't need further manipulation
10917 # which would preclude using multiple-values. The S is now included
10918 # so that _swash_inversion_hash() is able to construct closures
10919 # without having to worry about F mappings.
10920 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10921 $_ = "$range; Case_Folding; "
10922 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
10926 $file->carp_bad_line('Expecting C F I S or T in second field');
10929 # C and S are simple foldings, but simple case folding is not needed
10930 # unless we explicitly want its map table output.
10931 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10932 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10938 } # End case fold closure
10940 sub filter_jamo_line {
10941 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10942 # from this file that is used in generating the Name property for Jamo
10943 # code points. But, it also is used to convert early versions' syntax
10944 # into the modern form. Here are two examples:
10945 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10946 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10948 # The input is $_, the output is $_ filtered.
10950 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10952 # Let the caller handle unexpected input. In earlier versions, there was
10953 # a third field which is supposed to be a comment, but did not have a '#'
10955 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10957 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10960 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10961 $fields[1] = 'R' if $fields[0] eq '1105';
10963 # Add to structure so can generate Names from it.
10964 my $cp = hex $fields[0];
10965 my $short_name = $fields[1];
10966 $Jamo{$cp} = $short_name;
10967 if ($cp <= $LBase + $LCount) {
10968 $Jamo_L{$short_name} = $cp - $LBase;
10970 elsif ($cp <= $VBase + $VCount) {
10971 $Jamo_V{$short_name} = $cp - $VBase;
10973 elsif ($cp <= $TBase + $TCount) {
10974 $Jamo_T{$short_name} = $cp - $TBase;
10977 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10981 # Reassemble using just the first two fields to look like a typical
10982 # property file line
10983 $_ = "$fields[0]; $fields[1]";
10988 sub register_fraction($) {
10989 # This registers the input rational number so that it can be passed on to
10990 # utf8_heavy.pl, both in rational and floating forms.
10992 my $rational = shift;
10994 my $float = eval $rational;
10995 $nv_floating_to_rational{$float} = $rational;
10999 sub filter_numeric_value_line {
11000 # DNumValues contains lines of a different syntax than the typical
11002 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
11004 # This routine transforms $_ containing the anomalous syntax to the
11005 # typical, by filtering out the extra columns, and convert early version
11006 # decimal numbers to strings that look like rational numbers.
11009 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11011 # Starting in 5.1, there is a rational field. Just use that, omitting the
11012 # extra columns. Otherwise convert the decimal number in the second field
11013 # to a rational, and omit extraneous columns.
11014 my @fields = split /\s*;\s*/, $_, -1;
11017 if ($v_version ge v5.1.0) {
11018 if (@fields != 4) {
11019 $file->carp_bad_line('Not 4 semi-colon separated fields');
11023 $rational = $fields[3];
11024 $_ = join '; ', @fields[ 0, 3 ];
11028 # Here, is an older Unicode file, which has decimal numbers instead of
11029 # rationals in it. Use the fraction to calculate the denominator and
11030 # convert to rational.
11032 if (@fields != 2 && @fields != 3) {
11033 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11038 my $codepoints = $fields[0];
11039 my $decimal = $fields[1];
11040 if ($decimal =~ s/\.0+$//) {
11042 # Anything ending with a decimal followed by nothing but 0's is an
11044 $_ = "$codepoints; $decimal";
11045 $rational = $decimal;
11050 if ($decimal =~ /\.50*$/) {
11054 # Here have the hardcoded repeating decimals in the fraction, and
11055 # the denominator they imply. There were only a few denominators
11056 # in the older Unicode versions of this file which this code
11057 # handles, so it is easy to convert them.
11059 # The 4 is because of a round-off error in the Unicode 3.2 files
11060 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11063 elsif ($decimal =~ /\.[27]50*$/) {
11066 elsif ($decimal =~ /\.[2468]0*$/) {
11069 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11072 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11075 if ($denominator) {
11076 my $sign = ($decimal < 0) ? "-" : "";
11077 my $numerator = int((abs($decimal) * $denominator) + .5);
11078 $rational = "$sign$numerator/$denominator";
11079 $_ = "$codepoints; $rational";
11082 $file->carp_bad_line("Can't cope with number '$decimal'.");
11089 register_fraction($rational) if $rational =~ qr{/};
11094 my %unihan_properties;
11097 # Do any special setup for Unihan properties.
11099 # This property gives the wrong computed type, so override.
11100 my $usource = property_ref('kIRG_USource');
11101 $usource->set_type($STRING) if defined $usource;
11103 # This property is to be considered binary (it says so in
11104 # http://www.unicode.org/reports/tr38/)
11105 my $iicore = property_ref('kIICore');
11106 if (defined $iicore) {
11107 $iicore->set_type($FORCED_BINARY);
11108 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11110 # Unicode doesn't include the maps for this property, so don't
11111 # warn that they are missing.
11112 $iicore->set_pre_declared_maps(0);
11113 $iicore->add_comment(join_lines( <<END
11114 This property contains enum values, but Unicode UAX #38 says it should be
11115 interpreted as binary, so Perl creates tables for both 1) its enum values,
11116 plus 2) true/false tables in which it is considered true for all code points
11117 that have a non-null value
11125 sub filter_unihan_line {
11126 # Change unihan db lines to look like the others in the db. Here is
11128 # U+341C kCangjie IEKN
11130 # Tabs are used instead of semi-colons to separate fields; therefore
11131 # they may have semi-colons embedded in them. Change these to periods
11132 # so won't screw up the rest of the code.
11135 # Remove lines that don't look like ones we accept.
11136 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11141 # Extract the property, and save a reference to its object.
11143 if (! exists $unihan_properties{$property}) {
11144 $unihan_properties{$property} = property_ref($property);
11147 # Don't do anything unless the property is one we're handling, which
11148 # we determine by seeing if there is an object defined for it or not
11149 if (! defined $unihan_properties{$property}) {
11154 # Convert the tab separators to our standard semi-colons, and convert
11155 # the U+HHHH notation to the rest of the standard's HHHH
11157 s/\b U \+ (?= $code_point_re )//xg;
11159 #local $to_trace = 1 if main::DEBUG;
11160 trace $_ if main::DEBUG && $to_trace;
11166 sub filter_blocks_lines {
11167 # In the Blocks.txt file, the names of the blocks don't quite match the
11168 # names given in PropertyValueAliases.txt, so this changes them so they
11169 # do match: Blanks and hyphens are changed into underscores. Also makes
11170 # early release versions look like later ones
11172 # $_ is transformed to the correct value.
11175 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11177 if ($v_version lt v3.2.0) {
11178 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11183 # Old versions used a different syntax to mark the range.
11184 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11187 my @fields = split /\s*;\s*/, $_, -1;
11188 if (@fields != 2) {
11189 $file->carp_bad_line("Expecting exactly two fields");
11194 # Change hyphens and blanks in the block name field only
11195 $fields[1] =~ s/[ -]/_/g;
11196 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11198 $_ = join("; ", @fields);
11203 my $current_property;
11205 sub filter_old_style_proplist {
11206 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11207 # was in a completely different syntax. Ken Whistler of Unicode says
11208 # that it was something he used as an aid for his own purposes, but
11209 # was never an official part of the standard. However, comments in
11210 # DAge.txt indicate that non-character code points were available in
11211 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11212 # there except through this file (but on the other hand, they first
11213 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11214 # not. But the claim is that it was published as an aid to others who
11215 # might want some more information than was given in the official UCD
11216 # of the time. Many of the properties in it were incorporated into
11217 # the later PropList.txt, but some were not. This program uses this
11218 # early file to generate property tables that are otherwise not
11219 # accessible in the early UCD's, and most were probably not really
11220 # official at that time, so one could argue that it should be ignored,
11221 # and you can easily modify things to skip this. And there are bugs
11222 # in this file in various versions. (For example, the 2.1.9 version
11223 # removes from Alphabetic the CJK range starting at 4E00, and they
11224 # weren't added back in until 3.1.0.) Many of this file's properties
11225 # were later sanctioned, so this code generates tables for those
11226 # properties that aren't otherwise in the UCD of the time but
11227 # eventually did become official, and throws away the rest. Here is a
11228 # list of all the ones that are thrown away:
11229 # Bidi=* duplicates UnicodeData.txt
11230 # Combining never made into official property;
11232 # Composite never made into official property.
11233 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11234 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11235 # Delimiter never made into official property;
11237 # Format Control never made into official property;
11239 # High Surrogate duplicates Blocks.txt
11240 # Ignorable Control never made into official property;
11242 # ISO Control duplicates UnicodeData.txt: gc=cc
11243 # Left of Pair never made into official property;
11244 # Line Separator duplicates UnicodeData.txt: gc=zl
11245 # Low Surrogate duplicates Blocks.txt
11246 # Non-break was actually listed as a property
11247 # in 3.2, but without any code
11248 # points. Unicode denies that this
11249 # was ever an official property
11250 # Non-spacing duplicate UnicodeData.txt: gc=mn
11251 # Numeric duplicates UnicodeData.txt: gc=cc
11252 # Paired Punctuation never made into official property;
11253 # appears to be gc=ps + gc=pe
11254 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11255 # Private Use duplicates UnicodeData.txt: gc=co
11256 # Private Use High Surrogate duplicates Blocks.txt
11257 # Punctuation duplicates UnicodeData.txt: gc=p
11258 # Space different definition than eventual
11260 # Titlecase duplicates UnicodeData.txt: gc=lt
11261 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
11262 # Zero-width never made into official property;
11264 # Most of the properties have the same names in this file as in later
11265 # versions, but a couple do not.
11267 # This subroutine filters $_, converting it from the old style into
11268 # the new style. Here's a sample of the old-style
11270 # *******************************************
11272 # Property dump for: 0x100000A0 (Join Control)
11274 # 200C..200D (2 chars)
11276 # In the example, the property is "Join Control". It is kept in this
11277 # closure between calls to the subroutine. The numbers beginning with
11278 # 0x were internal to Ken's program that generated this file.
11280 # If this line contains the property name, extract it.
11281 if (/^Property dump for: [^(]*\((.*)\)/) {
11284 # Convert white space to underscores.
11287 # Convert the few properties that don't have the same name as
11288 # their modern counterparts
11289 s/Identifier_Part/ID_Continue/
11290 or s/Not_a_Character/NChar/;
11292 # If the name matches an existing property, use it.
11293 if (defined property_ref($_)) {
11294 trace "new property=", $_ if main::DEBUG && $to_trace;
11295 $current_property = $_;
11297 else { # Otherwise discard it
11298 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11299 undef $current_property;
11301 $_ = ""; # The property is saved for the next lines of the
11302 # file, but this defining line is of no further use,
11303 # so clear it so that the caller won't process it
11306 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11308 # Here, the input line isn't a header defining a property for the
11309 # following section, and either we aren't in such a section, or
11310 # the line doesn't look like one that defines the code points in
11311 # such a section. Ignore this line.
11316 # Here, we have a line defining the code points for the current
11317 # stashed property. Anything starting with the first blank is
11318 # extraneous. Otherwise, it should look like a normal range to
11319 # the caller. Append the property name so that it looks just like
11320 # a modern PropList entry.
11323 $_ .= "; $current_property";
11325 trace $_ if main::DEBUG && $to_trace;
11328 } # End closure for old style proplist
11330 sub filter_old_style_normalization_lines {
11331 # For early releases of Unicode, the lines were like:
11332 # 74..2A76 ; NFKD_NO
11333 # For later releases this became:
11334 # 74..2A76 ; NFKD_QC; N
11335 # Filter $_ to look like those in later releases.
11336 # Similarly for MAYBEs
11338 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11340 # Also, the property FC_NFKC was abbreviated to FNC
11345 sub setup_script_extensions {
11346 # The Script_Extensions property starts out with a clone of the Script
11349 my $scx = property_ref("Script_Extensions");
11350 $scx = Property->new("scx", Full_Name => "Script_Extensions")
11352 $scx->_set_format($STRING_WHITE_SPACE_LIST);
11353 $scx->initialize($script);
11354 $scx->set_default_map($script->default_map);
11355 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
11356 $scx->add_comment(join_lines( <<END
11357 The values for code points that appear in one script are just the same as for
11358 the 'Script' property. Likewise the values for those that appear in many
11359 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11360 values of code points that appear in a few scripts are a space separated list
11365 # Initialize scx's tables and the aliases for them to be the same as sc's
11366 foreach my $table ($script->tables) {
11367 my $scx_table = $scx->add_match_table($table->name,
11368 Full_Name => $table->full_name);
11369 foreach my $alias ($table->aliases) {
11370 $scx_table->add_alias($alias->name);
11375 sub filter_script_extensions_line {
11376 # The Scripts file comes with the full name for the scripts; the
11377 # ScriptExtensions, with the short name. The final mapping file is a
11378 # combination of these, and without adjustment, would have inconsistent
11379 # entries. This filters the latter file to convert to full names.
11380 # Entries look like this:
11381 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11383 my @fields = split /\s*;\s*/;
11385 foreach my $short_name (split " ", $fields[1]) {
11386 push @full_names, $script->table($short_name)->full_name;
11388 $fields[1] = join " ", @full_names;
11389 $_ = join "; ", @fields;
11394 sub setup_v6_name_alias {
11395 property_ref('Name_Alias')->add_map(7, 7, "ALERT");
11398 sub finish_Unicode() {
11399 # This routine should be called after all the Unicode files have been read
11401 # 1) Adds the mappings for code points missing from the files which have
11402 # defaults specified for them.
11403 # 2) At this this point all mappings are known, so it computes the type of
11404 # each property whose type hasn't been determined yet.
11405 # 3) Calculates all the regular expression match tables based on the
11407 # 3) Calculates and adds the tables which are defined by Unicode, but
11408 # which aren't derived by them
11410 # For each property, fill in any missing mappings, and calculate the re
11411 # match tables. If a property has more than one missing mapping, the
11412 # default is a reference to a data structure, and requires data from other
11413 # properties to resolve. The sort is used to cause these to be processed
11414 # last, after all the other properties have been calculated.
11415 # (Fortunately, the missing properties so far don't depend on each other.)
11416 foreach my $property
11417 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11420 # $perl has been defined, but isn't one of the Unicode properties that
11421 # need to be finished up.
11422 next if $property == $perl;
11424 # Nor do we need to do anything with properties that aren't going to
11426 next if $property->fate == $SUPPRESSED;
11428 # Handle the properties that have more than one possible default
11429 if (ref $property->default_map) {
11430 my $default_map = $property->default_map;
11432 # These properties have stored in the default_map:
11434 # 1) A default map which applies to all code points in a
11436 # 2) an expression which will evaluate to the list of code
11437 # points in that class
11439 # 3) the default map which applies to every other missing code
11442 # Go through each list.
11443 while (my ($default, $eval) = $default_map->get_next_defaults) {
11445 # Get the class list, and intersect it with all the so-far
11446 # unspecified code points yielding all the code points
11447 # in the class that haven't been specified.
11448 my $list = eval $eval;
11450 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11454 # Narrow down the list to just those code points we don't have
11456 $list = $list & $property->inverse_list;
11458 # Add mappings to the property for each code point in the list
11459 foreach my $range ($list->ranges) {
11460 $property->add_map($range->start, $range->end, $default,
11461 Replace => $CROAK);
11465 # All remaining code points have the other mapping. Set that up
11466 # so the normal single-default mapping code will work on them
11467 $property->set_default_map($default_map->other_default);
11469 # And fall through to do that
11472 # We should have enough data now to compute the type of the property.
11473 $property->compute_type;
11474 my $property_type = $property->type;
11476 next if ! $property->to_create_match_tables;
11478 # Here want to create match tables for this property
11480 # The Unicode db always (so far, and they claim into the future) have
11481 # the default for missing entries in binary properties be 'N' (unless
11482 # there is a '@missing' line that specifies otherwise)
11483 if ($property_type == $BINARY && ! defined $property->default_map) {
11484 $property->set_default_map('N');
11487 # Add any remaining code points to the mapping, using the default for
11488 # missing code points.
11490 if (defined (my $default_map = $property->default_map)) {
11492 # Make sure there is a match table for the default
11493 if (! defined ($default_table = $property->table($default_map))) {
11494 $default_table = $property->add_match_table($default_map);
11497 # And, if the property is binary, the default table will just
11498 # be the complement of the other table.
11499 if ($property_type == $BINARY) {
11500 my $non_default_table;
11502 # Find the non-default table.
11503 for my $table ($property->tables) {
11504 next if $table == $default_table;
11505 $non_default_table = $table;
11507 $default_table->set_complement($non_default_table);
11511 # This fills in any missing values with the default. It's not
11512 # necessary to do this with binary properties, as the default
11513 # is defined completely in terms of the Y table.
11514 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
11515 $default_map, Replace => $NO);
11519 # Have all we need to populate the match tables.
11520 my $property_name = $property->name;
11521 my $maps_should_be_defined = $property->pre_declared_maps;
11522 foreach my $range ($property->ranges) {
11523 my $map = $range->value;
11524 my $table = $property->table($map);
11525 if (! defined $table) {
11527 # Integral and rational property values are not necessarily
11528 # defined in PropValueAliases, but whether all the other ones
11529 # should be depends on the property.
11530 if ($maps_should_be_defined
11531 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11533 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11535 $table = $property->add_match_table($map);
11538 next if $table->complement != 0; # Don't need to populate these
11539 $table->add_range($range->start, $range->end);
11542 # A forced binary property has additional true/false tables which
11543 # should have been set up when it was forced into binary. The false
11544 # table matches exactly the same set as the property's default table.
11545 # The true table matches the complement of that. The false table is
11546 # not the same as an additional set of aliases on top of the default
11547 # table, so use 'set_equivalent_to'. If it were implemented as
11548 # additional aliases, various things would have to be adjusted, but
11549 # especially, if the user wants to get a list of names for the table
11550 # using Unicode::UCD::prop_value_aliases(), s/he should get a
11551 # different set depending on whether they want the default table or
11553 if ($property_type == $FORCED_BINARY) {
11554 $property->table('N')->set_equivalent_to($default_table,
11556 $property->table('Y')->set_complement($default_table);
11559 # For Perl 5.6 compatibility, all properties matchable in regexes can
11560 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11561 # But warn if this creates a conflict with a (new) Unicode property
11562 # name, although it appears that Unicode has made a decision never to
11563 # begin a property name with 'Is_', so this shouldn't happen.
11564 foreach my $alias ($property->aliases) {
11565 my $Is_name = 'Is_' . $alias->name;
11566 if (defined (my $pre_existing = property_ref($Is_name))) {
11567 Carp::my_carp(<<END
11568 There is already an alias named $Is_name (from " . $pre_existing . "), so
11569 creating one for $property won't work. This is bad news. If it is not too
11570 late, get Unicode to back off. Otherwise go back to the old scheme (findable
11571 from the git blame log for this area of the code that suppressed individual
11572 aliases that conflict with the new Unicode names. Proceeding anyway.
11576 } # End of loop through aliases for this property
11577 } # End of loop through all Unicode properties.
11579 # Fill in the mappings that Unicode doesn't completely furnish. First the
11580 # single letter major general categories. If Unicode were to start
11581 # delivering the values, this would be redundant, but better that than to
11582 # try to figure out if should skip and not get it right. Ths could happen
11583 # if a new major category were to be introduced, and the hard-coded test
11584 # wouldn't know about it.
11585 # This routine depends on the standard names for the general categories
11586 # being what it thinks they are, like 'Cn'. The major categories are the
11587 # union of all the general category tables which have the same first
11588 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11589 foreach my $minor_table ($gc->tables) {
11590 my $minor_name = $minor_table->name;
11591 next if length $minor_name == 1;
11592 if (length $minor_name != 2) {
11593 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11597 my $major_name = uc(substr($minor_name, 0, 1));
11598 my $major_table = $gc->table($major_name);
11599 $major_table += $minor_table;
11602 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11603 # defines it as LC)
11604 my $LC = $gc->table('LC');
11605 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11606 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11609 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11610 # deliver the correct values in it
11611 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11613 # Lt not in release 1.
11614 if (defined $gc->table('Lt')) {
11615 $LC += $gc->table('Lt');
11616 $gc->table('Lt')->set_caseless_equivalent($LC);
11619 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11621 $gc->table('Ll')->set_caseless_equivalent($LC);
11622 $gc->table('Lu')->set_caseless_equivalent($LC);
11624 my $Cs = $gc->table('Cs');
11627 # Folding information was introduced later into Unicode data. To get
11628 # Perl's case ignore (/i) to work at all in releases that don't have
11629 # folding, use the best available alternative, which is lower casing.
11630 my $fold = property_ref('Simple_Case_Folding');
11631 if ($fold->is_empty) {
11632 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11633 $fold->add_note(join_lines(<<END
11634 WARNING: This table uses lower case as a substitute for missing fold
11640 # Multiple-character mapping was introduced later into Unicode data. If
11641 # missing, use the single-characters maps as best available alternative
11642 foreach my $map (qw { Uppercase_Mapping
11647 my $full = property_ref($map);
11648 if ($full->is_empty) {
11649 my $simple = property_ref('Simple_' . $map);
11650 $full->initialize($simple);
11651 $full->add_comment($simple->comment) if ($simple->comment);
11652 $full->add_note(join_lines(<<END
11653 WARNING: This table uses simple mapping (single-character only) as a
11654 substitute for missing multiple-character information
11660 # The Script_Extensions property started out as a clone of the Script
11661 # property. But processing its data file caused some elements to be
11662 # replaced with different data. (These elements were for the Common and
11663 # Inherited properties.) This data is a qw() list of all the scripts that
11664 # the code points in the given range are in. An example line is:
11665 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11667 # The code above has created a new match table named "Arab Syrc Thaa"
11668 # which contains 060C. (The cloned table started out with this code point
11669 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11670 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11671 # match table. This is repeated for all these tables and ranges. The map
11672 # data is retained in the map table for reference, but the spurious match
11673 # tables are deleted.
11675 my $scx = property_ref("Script_Extensions");
11676 if (defined $scx) {
11677 foreach my $table ($scx->tables) {
11678 next unless $table->name =~ /\s/; # All the new and only the new
11679 # tables have a space in their
11681 my @scripts = split /\s+/, $table->name;
11682 foreach my $script (@scripts) {
11683 my $script_table = $scx->table($script);
11684 $script_table += $table;
11686 $scx->delete_match_table($table);
11693 sub compile_perl() {
11694 # Create perl-defined tables. Almost all are part of the pseudo-property
11695 # named 'perl' internally to this program. Many of these are recommended
11696 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11697 # on those found there.
11698 # Almost all of these are equivalent to some Unicode property.
11699 # A number of these properties have equivalents restricted to the ASCII
11700 # range, with their names prefaced by 'Posix', to signify that these match
11701 # what the Posix standard says they should match. A couple are
11702 # effectively this, but the name doesn't have 'Posix' in it because there
11703 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11704 # to the full Unicode range, by our guesses as to what is appropriate.
11706 # 'Any' is all code points. As an error check, instead of just setting it
11707 # to be that, construct it to be the union of all the major categories
11708 $Any = $perl->add_match_table('Any',
11709 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
11712 foreach my $major_table ($gc->tables) {
11714 # Major categories are the ones with single letter names.
11715 next if length($major_table->name) != 1;
11717 $Any += $major_table;
11720 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
11721 Carp::my_carp_bug("Generated highest code point ("
11722 . sprintf("%X", $Any->max)
11723 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
11725 if ($Any->range_count != 1 || $Any->min != 0) {
11726 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11729 $Any->add_alias('All');
11731 # Assigned is the opposite of gc=unassigned
11732 my $Assigned = $perl->add_match_table('Assigned',
11733 Description => "All assigned code points",
11734 Initialize => ~ $gc->table('Unassigned'),
11737 # Our internal-only property should be treated as more than just a
11738 # synonym; grandfather it in to the pod.
11739 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
11740 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
11741 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11744 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11745 if (defined $block) { # This is equivalent to the block if have it.
11746 my $Unicode_ASCII = $block->table('Basic_Latin');
11747 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11748 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11752 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11754 if ($ASCII->is_empty) {
11755 $ASCII->initialize([ 0..127 ]);
11758 # Get the best available case definitions. Early Unicode versions didn't
11759 # have Uppercase and Lowercase defined, so use the general category
11760 # instead for them.
11761 my $Lower = $perl->add_match_table('Lower');
11762 my $Unicode_Lower = property_ref('Lowercase');
11763 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11764 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
11765 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11766 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11767 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11771 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11774 $Lower->add_alias('XPosixLower');
11775 my $Posix_Lower = $perl->add_match_table("PosixLower",
11776 Description => "[a-z]",
11777 Initialize => $Lower & $ASCII,
11780 my $Upper = $perl->add_match_table('Upper');
11781 my $Unicode_Upper = property_ref('Uppercase');
11782 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11783 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
11784 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11785 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11786 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11789 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11792 $Upper->add_alias('XPosixUpper');
11793 my $Posix_Upper = $perl->add_match_table("PosixUpper",
11794 Description => "[A-Z]",
11795 Initialize => $Upper & $ASCII,
11798 # Earliest releases didn't have title case. Initialize it to empty if not
11799 # otherwise present
11800 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11801 Description => '(= \p{Gc=Lt})');
11802 my $lt = $gc->table('Lt');
11804 # Earlier versions of mktables had this related to $lt since they have
11805 # identical code points, but their caseless equivalents are not the same,
11806 # one being 'Cased' and the other being 'LC', and so now must be kept as
11807 # separate entities.
11808 $Title += $lt if defined $lt;
11810 # If this Unicode version doesn't have Cased, set up our own. From
11811 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11812 # and only if C has the Lowercase or Uppercase property or has a
11813 # General_Category value of Titlecase_Letter.
11814 my $Unicode_Cased = property_ref('Cased');
11815 unless (defined $Unicode_Cased) {
11816 my $cased = $perl->add_match_table('Cased',
11817 Initialize => $Lower + $Upper + $Title,
11818 Description => 'Uppercase or Lowercase or Titlecase',
11820 $Unicode_Cased = $cased;
11822 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
11824 # Similarly, set up our own Case_Ignorable property if this Unicode
11825 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11826 # C is defined to be case-ignorable if C has the value MidLetter or the
11827 # value MidNumLet for the Word_Break property or its General_Category is
11828 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11829 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11831 # Perl has long had an internal-only alias for this property; grandfather
11832 # it in to the pod, but discourage its use.
11833 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
11835 Fate => $INTERNAL_ONLY,
11836 Status => $DISCOURAGED);
11837 my $case_ignorable = property_ref('Case_Ignorable');
11838 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11839 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11844 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11846 # The following three properties are not in early releases
11847 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11848 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11849 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11851 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11852 # correspondingly the case-ignorable definition lacks that one. For
11853 # 4.0, it appears that it was meant to be the same definition, but was
11854 # inadvertently omitted from the standard's text, so add it if the
11855 # property actually is there
11856 my $wb = property_ref('Word_Break');
11858 my $midlet = $wb->table('MidLetter');
11859 $perl_case_ignorable += $midlet if defined $midlet;
11860 my $midnumlet = $wb->table('MidNumLet');
11861 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11865 # In earlier versions of the standard, instead of the above two
11866 # properties , just the following characters were used:
11867 $perl_case_ignorable += 0x0027 # APOSTROPHE
11868 + 0x00AD # SOFT HYPHEN (SHY)
11869 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11873 # The remaining perl defined tables are mostly based on Unicode TR 18,
11874 # "Annex C: Compatibility Properties". All of these have two versions,
11875 # one whose name generally begins with Posix that is posix-compliant, and
11876 # one that matches Unicode characters beyond the Posix, ASCII range
11878 my $Alpha = $perl->add_match_table('Alpha');
11880 # Alphabetic was not present in early releases
11881 my $Alphabetic = property_ref('Alphabetic');
11882 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11883 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11887 # For early releases, we don't get it exactly right. The below
11888 # includes more than it should, which in 5.2 terms is: L + Nl +
11889 # Other_Alphabetic. Other_Alphabetic contains many characters from
11890 # Mn and Mc. It's better to match more than we should, than less than
11892 $Alpha->initialize($gc->table('Letter')
11894 + $gc->table('Mc'));
11895 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
11896 $Alpha->add_description('Alphabetic');
11898 $Alpha->add_alias('XPosixAlpha');
11899 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
11900 Description => "[A-Za-z]",
11901 Initialize => $Alpha & $ASCII,
11903 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11904 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
11906 my $Alnum = $perl->add_match_table('Alnum',
11907 Description => 'Alphabetic and (decimal) Numeric',
11908 Initialize => $Alpha + $gc->table('Decimal_Number'),
11910 $Alnum->add_alias('XPosixAlnum');
11911 $perl->add_match_table("PosixAlnum",
11912 Description => "[A-Za-z0-9]",
11913 Initialize => $Alnum & $ASCII,
11916 my $Word = $perl->add_match_table('Word',
11917 Description => '\w, including beyond ASCII;'
11918 . ' = \p{Alnum} + \pM + \p{Pc}',
11919 Initialize => $Alnum + $gc->table('Mark'),
11921 $Word->add_alias('XPosixWord');
11922 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11923 $Word += $Pc if defined $Pc;
11925 # This is a Perl extension, so the name doesn't begin with Posix.
11926 my $PerlWord = $perl->add_match_table('PerlWord',
11927 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11928 Initialize => $Word & $ASCII,
11930 $PerlWord->add_alias('PosixWord');
11932 my $Blank = $perl->add_match_table('Blank',
11933 Description => '\h, Horizontal white space',
11935 # 200B is Zero Width Space which is for line
11936 # break control, and was listed as
11937 # Space_Separator in early releases
11938 Initialize => $gc->table('Space_Separator')
11942 $Blank->add_alias('HorizSpace'); # Another name for it.
11943 $Blank->add_alias('XPosixBlank');
11944 $perl->add_match_table("PosixBlank",
11945 Description => "\\t and ' '",
11946 Initialize => $Blank & $ASCII,
11949 my $VertSpace = $perl->add_match_table('VertSpace',
11950 Description => '\v',
11951 Initialize => $gc->table('Line_Separator')
11952 + $gc->table('Paragraph_Separator')
11953 + 0x000A # LINE FEED
11954 + 0x000B # VERTICAL TAB
11955 + 0x000C # FORM FEED
11956 + 0x000D # CARRIAGE RETURN
11959 # No Posix equivalent for vertical space
11961 my $Space = $perl->add_match_table('Space',
11962 Description => '\s including beyond ASCII plus vertical tab',
11963 Initialize => $Blank + $VertSpace,
11965 $Space->add_alias('XPosixSpace');
11966 $perl->add_match_table("PosixSpace",
11967 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
11968 Initialize => $Space & $ASCII,
11971 # Perl's traditional space doesn't include Vertical Tab
11972 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
11973 Description => '\s, including beyond ASCII',
11974 Initialize => $Space - 0x000B,
11976 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11977 my $PerlSpace = $perl->add_match_table('PerlSpace',
11978 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11979 Initialize => $XPerlSpace & $ASCII,
11983 my $Cntrl = $perl->add_match_table('Cntrl',
11984 Description => 'Control characters');
11985 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
11986 $Cntrl->add_alias('XPosixCntrl');
11987 $perl->add_match_table("PosixCntrl",
11988 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
11989 Initialize => $Cntrl & $ASCII,
11992 # $controls is a temporary used to construct Graph.
11993 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11994 + $gc->table('Control'));
11995 # Cs not in release 1
11996 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11998 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11999 my $Graph = $perl->add_match_table('Graph',
12000 Description => 'Characters that are graphical',
12001 Initialize => ~ ($Space + $controls),
12003 $Graph->add_alias('XPosixGraph');
12004 $perl->add_match_table("PosixGraph",
12006 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
12007 Initialize => $Graph & $ASCII,
12010 $print = $perl->add_match_table('Print',
12011 Description => 'Characters that are graphical plus space characters (but no controls)',
12012 Initialize => $Blank + $Graph - $gc->table('Control'),
12014 $print->add_alias('XPosixPrint');
12015 $perl->add_match_table("PosixPrint",
12017 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12018 Initialize => $print & $ASCII,
12021 my $Punct = $perl->add_match_table('Punct');
12022 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
12024 # \p{punct} doesn't include the symbols, which posix does
12025 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
12026 Description => '\p{Punct} + ASCII-range \p{Symbol}',
12027 Initialize => $gc->table('Punctuation')
12028 + ($ASCII & $gc->table('Symbol')),
12029 Perl_Extension => 1
12031 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
12032 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12033 Initialize => $ASCII & $XPosixPunct,
12036 my $Digit = $perl->add_match_table('Digit',
12037 Description => '[0-9] + all other decimal digits');
12038 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
12039 $Digit->add_alias('XPosixDigit');
12040 my $PosixDigit = $perl->add_match_table("PosixDigit",
12041 Description => '[0-9]',
12042 Initialize => $Digit & $ASCII,
12045 # Hex_Digit was not present in first release
12046 my $Xdigit = $perl->add_match_table('XDigit');
12047 $Xdigit->add_alias('XPosixXDigit');
12048 my $Hex = property_ref('Hex_Digit');
12049 if (defined $Hex && ! $Hex->is_empty) {
12050 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
12053 # (Have to use hex instead of e.g. '0', because could be running on an
12054 # non-ASCII machine, and we want the Unicode (ASCII) values)
12055 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
12056 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
12057 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
12060 # AHex was not present in early releases
12061 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
12062 my $AHex = property_ref('ASCII_Hex_Digit');
12063 if (defined $AHex && ! $AHex->is_empty) {
12064 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
12067 $PosixXDigit->initialize($Xdigit & $ASCII);
12069 $PosixXDigit->add_description('[0-9A-Fa-f]');
12071 my $dt = property_ref('Decomposition_Type');
12072 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
12073 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
12074 Perl_Extension => 1,
12075 Note => 'Union of all non-canonical decompositions',
12078 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
12079 # than SD appeared, construct it ourselves, based on the first release SD
12080 # was in. A pod entry is grandfathered in for it
12081 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
12082 Perl_Extension => 1,
12083 Fate => $INTERNAL_ONLY,
12084 Status => $DISCOURAGED);
12085 my $soft_dotted = property_ref('Soft_Dotted');
12086 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
12087 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
12091 # This list came from 3.2 Soft_Dotted.
12092 $CanonDCIJ->initialize([ 0x0069,
12101 $CanonDCIJ = $CanonDCIJ & $Assigned;
12104 # These are used in Unicode's definition of \X
12105 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
12106 Fate => $INTERNAL_ONLY);
12107 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
12108 Fate => $INTERNAL_ONLY);
12110 # For backward compatibility, Perl has its own definition for IDStart
12111 # First, we include the underscore, and then the regular XID_Start also
12113 $perl->add_match_table('_Perl_IDStart',
12114 Perl_Extension => 1,
12115 Fate => $INTERNAL_ONLY,
12118 + (property_ref('XID_Start')->table('Y') & $Word)
12121 my $gcb = property_ref('Grapheme_Cluster_Break');
12123 # The 'extended' grapheme cluster came in 5.1. The non-extended
12124 # definition differs too much from the traditional Perl one to use.
12125 if (defined $gcb && defined $gcb->table('SpacingMark')) {
12127 # Note that assumes HST is defined; it came in an earlier release than
12128 # GCB. In the line below, two negatives means: yes hangul
12129 $begin += ~ property_ref('Hangul_Syllable_Type')
12130 ->table('Not_Applicable')
12131 + ~ ($gcb->table('Control')
12132 + $gcb->table('CR')
12133 + $gcb->table('LF'));
12134 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12136 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12137 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
12139 else { # Old definition, used on early releases.
12140 $extend += $gc->table('Mark')
12143 $begin += ~ $extend;
12145 # Here we may have a release that has the regular grapheme cluster
12146 # defined, or a release that doesn't have anything defined.
12147 # We set things up so the Perl core degrades gracefully, possibly with
12148 # placeholders that match nothing.
12150 if (! defined $gcb) {
12151 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12153 my $hst = property_ref('HST');
12154 if (!defined $hst) {
12155 $hst = Property->new('HST', Status => $PLACEHOLDER);
12156 $hst->add_match_table('Not_Applicable',
12157 Initialize => $Any,
12161 # On some releases, here we may not have the needed tables for the
12162 # perl core, in some releases we may.
12163 foreach my $name (qw{ L LV LVT T V prepend }) {
12164 my $table = $gcb->table($name);
12165 if (! defined $table) {
12166 $table = $gcb->add_match_table($name);
12167 push @tables_that_may_be_empty, $table->complete_name;
12170 # The HST property predates the GCB one, and has identical tables
12171 # for some of them, so use it if we can.
12172 if ($table->is_empty
12174 && defined $hst->table($name))
12176 $table += $hst->table($name);
12181 # More GCB. If we found some hangul syllables, populate a combined
12183 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12184 Perl_Extension => 1,
12185 Fate => $INTERNAL_ONLY);
12186 my $LV = $gcb->table('LV');
12187 if ($LV->is_empty) {
12188 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12190 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12191 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
12194 # Was previously constructed to contain both Name and Unicode_1_Name
12195 my @composition = ('Name', 'Unicode_1_Name');
12197 if (@named_sequences) {
12198 push @composition, 'Named_Sequence';
12199 foreach my $sequence (@named_sequences) {
12200 $perl_charname->add_anomalous_entry($sequence);
12204 my $alias_sentence = "";
12205 my $alias = property_ref('Name_Alias');
12206 if (defined $alias) {
12207 push @composition, 'Name_Alias';
12208 $alias->reset_each_range;
12209 while (my ($range) = $alias->each_range) {
12210 next if $range->value eq "";
12211 if ($range->start != $range->end) {
12212 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
12214 $perl_charname->add_duplicate($range->start, $range->value);
12216 $alias_sentence = <<END;
12217 The Name_Alias property adds duplicate code point entries with a corrected
12218 name. The original (less correct, but still valid) name will be physically
12223 if (@composition <= 2) { # Always at least 2
12224 $comment = join " and ", @composition;
12227 $comment = join ", ", @composition[0 .. scalar @composition - 2];
12228 $comment .= ", and $composition[-1]";
12231 $perl_charname->add_comment(join_lines( <<END
12232 This file is for charnames.pm. It is the union of the $comment properties.
12233 Unicode_1_Name entries are used only for otherwise nameless code
12236 This file doesn't include the algorithmically determinable names. For those,
12237 use 'unicore/Name.pm'
12240 property_ref('Name')->add_comment(join_lines( <<END
12241 This file doesn't include the algorithmically determinable names. For those,
12242 use 'unicore/Name.pm'
12246 # Construct the Present_In property from the Age property.
12247 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12248 my $default_map = $age->default_map;
12249 my $in = Property->new('In',
12250 Default_Map => $default_map,
12251 Full_Name => "Present_In",
12252 Perl_Extension => 1,
12254 Initialize => $age,
12256 $in->add_comment(join_lines(<<END
12257 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
12258 same as for $age, and not for what $in really means. This is because anything
12259 defined in a given release should have multiple values: that release and all
12260 higher ones. But only one value per code point can be represented in a table
12265 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12266 # lowest numbered (earliest) come first, with the non-numeric one
12268 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12270 : ($b->name !~ /^[\d.]*$/)
12272 : $a->name <=> $b->name
12275 # The Present_In property is the cumulative age properties. The first
12276 # one hence is identical to the first age one.
12277 my $previous_in = $in->add_match_table($first_age->name);
12278 $previous_in->set_equivalent_to($first_age, Related => 1);
12280 my $description_start = "Code point's usage introduced in version ";
12281 $first_age->add_description($description_start . $first_age->name);
12283 # To construct the accumulated values, for each of the age tables
12284 # starting with the 2nd earliest, merge the earliest with it, to get
12285 # all those code points existing in the 2nd earliest. Repeat merging
12286 # the new 2nd earliest with the 3rd earliest to get all those existing
12287 # in the 3rd earliest, and so on.
12288 foreach my $current_age (@rest_ages) {
12289 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12291 my $current_in = $in->add_match_table(
12292 $current_age->name,
12293 Initialize => $current_age + $previous_in,
12294 Description => $description_start
12295 . $current_age->name
12298 $previous_in = $current_in;
12300 # Add clarifying material for the corresponding age file. This is
12301 # in part because of the confusing and contradictory information
12302 # given in the Standard's documentation itself, as of 5.2.
12303 $current_age->add_description(
12304 "Code point's usage was introduced in version "
12305 . $current_age->name);
12306 $current_age->add_note("See also $in");
12310 # And finally the code points whose usages have yet to be decided are
12311 # the same in both properties. Note that permanently unassigned code
12312 # points actually have their usage assigned (as being permanently
12313 # unassigned), so that these tables are not the same as gc=cn.
12314 my $unassigned = $in->add_match_table($default_map);
12315 my $age_default = $age->table($default_map);
12316 $age_default->add_description(<<END
12317 Code point's usage has not been assigned in any Unicode release thus far.
12320 $unassigned->set_equivalent_to($age_default, Related => 1);
12324 # Finished creating all the perl properties. All non-internal non-string
12325 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12326 # an underscore.) These do not get a separate entry in the pod file
12327 foreach my $table ($perl->tables) {
12328 foreach my $alias ($table->aliases) {
12329 next if $alias->name =~ /^_/;
12330 $table->add_alias('Is_' . $alias->name,
12333 Status => $alias->status,
12334 OK_as_Filename => 0);
12338 # Here done with all the basic stuff. Ready to populate the information
12339 # about each character if annotating them.
12342 # See comments at its declaration
12343 $annotate_ranges = Range_Map->new;
12345 # This separates out the non-characters from the other unassigneds, so
12346 # can give different annotations for each.
12347 $unassigned_sans_noncharacters = Range_List->new(
12348 Initialize => $gc->table('Unassigned')
12349 & property_ref('Noncharacter_Code_Point')->table('N'));
12351 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
12352 $i = populate_char_info($i); # Note sets $i so may cause skips
12359 sub add_perl_synonyms() {
12360 # A number of Unicode tables have Perl synonyms that are expressed in
12361 # the single-form, \p{name}. These are:
12362 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12363 # \p{Is_Name} as synonyms
12364 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12365 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12366 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12367 # conflict, \p{Value} and \p{Is_Value} as well
12369 # This routine generates these synonyms, warning of any unexpected
12372 # Construct the list of tables to get synonyms for. Start with all the
12373 # binary and the General_Category ones.
12374 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
12376 push @tables, $gc->tables;
12378 # If the version of Unicode includes the Script property, add its tables
12379 push @tables, $script->tables if defined $script;
12381 # The Block tables are kept separate because they are treated differently.
12382 # And the earliest versions of Unicode didn't include them, so add only if
12385 push @blocks, $block->tables if defined $block;
12387 # Here, have the lists of tables constructed. Process blocks last so that
12388 # if there are name collisions with them, blocks have lowest priority.
12389 # Should there ever be other collisions, manual intervention would be
12390 # required. See the comments at the beginning of the program for a
12391 # possible way to handle those semi-automatically.
12392 foreach my $table (@tables, @blocks) {
12394 # For non-binary properties, the synonym is just the name of the
12395 # table, like Greek, but for binary properties the synonym is the name
12396 # of the property, and means the code points in its 'Y' table.
12397 my $nominal = $table;
12398 my $nominal_property = $nominal->property;
12400 if (! $nominal->isa('Property')) {
12405 # Here is a binary property. Use the 'Y' table. Verify that is
12407 my $yes = $nominal->table('Y');
12408 unless (defined $yes) { # Must be defined, but is permissible to
12410 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12416 foreach my $alias ($nominal->aliases) {
12418 # Attempt to create a table in the perl directory for the
12419 # candidate table, using whatever aliases in it that don't
12420 # conflict. Also add non-conflicting aliases for all these
12421 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12423 foreach my $prefix ("", 'Is_', 'In_') {
12425 # Only Block properties can have added 'In_' aliases.
12426 next if $prefix eq 'In_' and $nominal_property != $block;
12428 my $proposed_name = $prefix . $alias->name;
12430 # No Is_Is, In_In, nor combinations thereof
12431 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12432 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12434 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12436 # Get a reference to any existing table in the perl
12437 # directory with the desired name.
12438 my $pre_existing = $perl->table($proposed_name);
12440 if (! defined $pre_existing) {
12442 # No name collision, so ok to add the perl synonym.
12444 my $make_re_pod_entry;
12445 my $ok_as_filename;
12446 my $status = $alias->status;
12447 if ($nominal_property == $block) {
12449 # For block properties, the 'In' form is preferred for
12450 # external use; the pod file contains wild cards for
12451 # this and the 'Is' form so no entries for those; and
12452 # we don't want people using the name without the
12453 # 'In', so discourage that.
12454 if ($prefix eq "") {
12455 $make_re_pod_entry = 1;
12456 $status = $status || $DISCOURAGED;
12457 $ok_as_filename = 0;
12459 elsif ($prefix eq 'In_') {
12460 $make_re_pod_entry = 0;
12461 $status = $status || $NORMAL;
12462 $ok_as_filename = 1;
12465 $make_re_pod_entry = 0;
12466 $status = $status || $DISCOURAGED;
12467 $ok_as_filename = 0;
12470 elsif ($prefix ne "") {
12472 # The 'Is' prefix is handled in the pod by a wild
12473 # card, and we won't use it for an external name
12474 $make_re_pod_entry = 0;
12475 $status = $status || $NORMAL;
12476 $ok_as_filename = 0;
12480 # Here, is an empty prefix, non block. This gets its
12481 # own pod entry and can be used for an external name.
12482 $make_re_pod_entry = 1;
12483 $status = $status || $NORMAL;
12484 $ok_as_filename = 1;
12487 # Here, there isn't a perl pre-existing table with the
12488 # name. Look through the list of equivalents of this
12489 # table to see if one is a perl table.
12490 foreach my $equivalent ($actual->leader->equivalents) {
12491 next if $equivalent->property != $perl;
12493 # Here, have found a table for $perl. Add this alias
12494 # to it, and are done with this prefix.
12495 $equivalent->add_alias($proposed_name,
12496 Re_Pod_Entry => $make_re_pod_entry,
12498 # Currently don't output these in the
12499 # ucd pod, as are strongly discouraged
12504 OK_as_Filename => $ok_as_filename);
12505 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12509 # Here, $perl doesn't already have a table that is a
12510 # synonym for this property, add one.
12511 my $added_table = $perl->add_match_table($proposed_name,
12512 Re_Pod_Entry => $make_re_pod_entry,
12514 # See UCD comment just above
12518 OK_as_Filename => $ok_as_filename);
12519 # And it will be related to the actual table, since it is
12521 $added_table->set_equivalent_to($actual, Related => 1);
12522 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12524 } # End of no pre-existing.
12526 # Here, there is a pre-existing table that has the proposed
12527 # name. We could be in trouble, but not if this is just a
12528 # synonym for another table that we have already made a child
12529 # of the pre-existing one.
12530 if ($pre_existing->is_set_equivalent_to($actual)) {
12531 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12532 $pre_existing->add_alias($proposed_name);
12536 # Here, there is a name collision, but it still could be ok if
12537 # the tables match the identical set of code points, in which
12538 # case, we can combine the names. Compare each table's code
12539 # point list to see if they are identical.
12540 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12541 if ($pre_existing->matches_identically_to($actual)) {
12543 # Here, they do match identically. Not a real conflict.
12544 # Make the perl version a child of the Unicode one, except
12545 # in the non-obvious case of where the perl name is
12546 # already a synonym of another Unicode property. (This is
12547 # excluded by the test for it being its own parent.) The
12548 # reason for this exclusion is that then the two Unicode
12549 # properties become related; and we don't really know if
12550 # they are or not. We generate documentation based on
12551 # relatedness, and this would be misleading. Code
12552 # later executed in the process will cause the tables to
12553 # be represented by a single file anyway, without making
12554 # it look in the pod like they are necessarily related.
12555 if ($pre_existing->parent == $pre_existing
12556 && ($pre_existing->property == $perl
12557 || $actual->property == $perl))
12559 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12560 $pre_existing->set_equivalent_to($actual, Related => 1);
12562 elsif (main::DEBUG && $to_trace) {
12563 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12564 trace $pre_existing->parent;
12569 # Here they didn't match identically, there is a real conflict
12570 # between our new name and a pre-existing property.
12571 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12572 $pre_existing->add_conflicting($nominal->full_name,
12576 # Don't output a warning for aliases for the block
12577 # properties (unless they start with 'In_') as it is
12578 # expected that there will be conflicts and the block
12580 if ($verbosity >= $NORMAL_VERBOSITY
12581 && ($actual->property != $block || $prefix eq 'In_'))
12583 print simple_fold(join_lines(<<END
12584 There is already an alias named $proposed_name (from " . $pre_existing . "),
12585 so not creating this alias for " . $actual
12590 # Keep track for documentation purposes.
12591 $has_In_conflicts++ if $prefix eq 'In_';
12592 $has_Is_conflicts++ if $prefix eq 'Is_';
12597 # There are some properties which have No and Yes (and N and Y) as
12598 # property values, but aren't binary, and could possibly be confused with
12599 # binary ones. So create caveats for them. There are tables that are
12600 # named 'No', and tables that are named 'N', but confusion is not likely
12601 # unless they are the same table. For example, N meaning Number or
12602 # Neutral is not likely to cause confusion, so don't add caveats to things
12604 foreach my $property (grep { $_->type != $BINARY
12605 && $_->type != $FORCED_BINARY }
12608 my $yes = $property->table('Yes');
12609 if (defined $yes) {
12610 my $y = $property->table('Y');
12611 if (defined $y && $yes == $y) {
12612 foreach my $alias ($property->aliases) {
12613 $yes->add_conflicting($alias->name);
12617 my $no = $property->table('No');
12619 my $n = $property->table('N');
12620 if (defined $n && $no == $n) {
12621 foreach my $alias ($property->aliases) {
12622 $no->add_conflicting($alias->name, 'P');
12631 sub register_file_for_name($$$) {
12632 # Given info about a table and a datafile that it should be associated
12633 # with, register that association
12636 my $directory_ref = shift; # Array of the directory path for the file
12637 my $file = shift; # The file name in the final directory.
12638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12640 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12642 if ($table->isa('Property')) {
12643 $table->set_file_path(@$directory_ref, $file);
12644 push @map_properties, $table;
12646 # No swash means don't do the rest of this.
12647 return if $table->fate != $ORDINARY;
12649 # Get the path to the file
12650 my @path = $table->file_path;
12652 # Use just the file name if no subdirectory.
12653 shift @path if $path[0] eq File::Spec->curdir();
12655 my $file = join '/', @path;
12657 # Create a hash entry for utf8_heavy to get the file that stores this
12658 # property's map table
12659 foreach my $alias ($table->aliases) {
12660 my $name = $alias->name;
12661 $loose_property_to_file_of{standardize($name)} = $file;
12664 # And a way for utf8_heavy to find the proper key in the SwashInfo
12665 # hash for this property.
12666 $file_to_swash_name{$file} = "To" . $table->swash_name;
12670 # Do all of the work for all equivalent tables when called with the leader
12671 # table, so skip if isn't the leader.
12672 return if $table->leader != $table;
12674 # If this is a complement of another file, use that other file instead,
12675 # with a ! prepended to it.
12677 if (($complement = $table->complement) != 0) {
12678 my @directories = $complement->file_path;
12680 # This assumes that the 0th element is something like 'lib',
12681 # the 1th element the property name (in its own directory), like
12682 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12683 # appended to it later.
12684 $directories[1] =~ s/^/!/;
12685 $file = pop @directories;
12686 $directory_ref =\@directories;
12689 # Join all the file path components together, using slashes.
12690 my $full_filename = join('/', @$directory_ref, $file);
12692 # All go in the same subdirectory of unicore
12693 if ($directory_ref->[0] ne $matches_directory) {
12694 Carp::my_carp("Unexpected directory in "
12695 . join('/', @{$directory_ref}, $file));
12698 # For this table and all its equivalents ...
12699 foreach my $table ($table, $table->equivalents) {
12701 # Associate it with its file internally. Don't include the
12702 # $matches_directory first component
12703 $table->set_file_path(@$directory_ref, $file);
12705 # No swash means don't do the rest of this.
12706 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
12708 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12710 my $property = $table->property;
12711 my $property_name = ($property == $perl)
12712 ? "" # 'perl' is never explicitly stated
12713 : standardize($property->name) . '=';
12715 my $is_default = 0; # Is this table the default one for the property?
12717 # To calculate $is_default, we find if this table is the same as the
12718 # default one for the property. But this is complicated by the
12719 # possibility that there is a master table for this one, and the
12720 # information is stored there instead of here.
12721 my $parent = $table->parent;
12722 my $leader_prop = $parent->property;
12723 my $default_map = $leader_prop->default_map;
12724 if (defined $default_map) {
12725 my $default_table = $leader_prop->table($default_map);
12726 $is_default = 1 if defined $default_table && $parent == $default_table;
12729 # Calculate the loose name for this table. Mostly it's just its name,
12730 # standardized. But in the case of Perl tables that are single-form
12731 # equivalents to Unicode properties, it is the latter's name.
12732 my $loose_table_name =
12733 ($property != $perl || $leader_prop == $perl)
12734 ? standardize($table->name)
12735 : standardize($parent->name);
12737 my $deprecated = ($table->status eq $DEPRECATED)
12738 ? $table->status_info
12740 my $caseless_equivalent = $table->caseless_equivalent;
12742 # And for each of the table's aliases... This inner loop eventually
12743 # goes through all aliases in the UCD that we generate regex match
12745 foreach my $alias ($table->aliases) {
12746 my $standard = utf8_heavy_name($table, $alias);
12748 # Generate an entry in either the loose or strict hashes, which
12749 # will translate the property and alias names combination into the
12750 # file where the table for them is stored.
12751 if ($alias->loose_match) {
12752 if (exists $loose_to_file_of{$standard}) {
12753 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12756 $loose_to_file_of{$standard} = $sub_filename;
12760 if (exists $stricter_to_file_of{$standard}) {
12761 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12764 $stricter_to_file_of{$standard} = $sub_filename;
12766 # Tightly coupled with how utf8_heavy.pl works, for a
12767 # floating point number that is a whole number, get rid of
12768 # the trailing decimal point and 0's, so that utf8_heavy
12769 # will work. Also note that this assumes that such a
12770 # number is matched strictly; so if that were to change,
12771 # this would be wrong.
12772 if ((my $integer_name = $alias->name)
12773 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12775 $stricter_to_file_of{$property_name . $integer_name}
12781 # For Unicode::UCD, create a mapping of the prop=value to the
12782 # canonical =value for that property.
12783 if ($standard =~ /=/) {
12785 # This could happen if a strict name mapped into an existing
12786 # loose name. In that event, the strict names would have to
12787 # be moved to a new hash.
12788 if (exists($loose_to_standard_value{$standard})) {
12789 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
12791 $loose_to_standard_value{$standard} = $loose_table_name;
12794 # Keep a list of the deprecated properties and their filenames
12795 if ($deprecated && $complement == 0) {
12796 $utf8::why_deprecated{$sub_filename} = $deprecated;
12799 # And a substitute table, if any, for case-insensitive matching
12800 if ($caseless_equivalent != 0) {
12801 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12804 # Add to defaults list if the table this alias belongs to is the
12806 $loose_defaults{$standard} = 1 if $is_default;
12814 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12816 my %full_dir_name_of; # Full length names of directories used.
12818 sub construct_filename($$$) {
12819 # Return a file name for a table, based on the table name, but perhaps
12820 # changed to get rid of non-portable characters in it, and to make
12821 # sure that it is unique on a file system that allows the names before
12822 # any period to be at most 8 characters (DOS). While we're at it
12823 # check and complain if there are any directory conflicts.
12825 my $name = shift; # The name to start with
12826 my $mutable = shift; # Boolean: can it be changed? If no, but
12827 # yet it must be to work properly, a warning
12829 my $directories_ref = shift; # A reference to an array containing the
12830 # path to the file, with each element one path
12831 # component. This is used because the same
12832 # name can be used in different directories.
12833 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12835 my $warn = ! defined wantarray; # If true, then if the name is
12836 # changed, a warning is issued as well.
12838 if (! defined $name) {
12839 Carp::my_carp("Undefined name in directory "
12840 . File::Spec->join(@$directories_ref)
12845 # Make sure that no directory names conflict with each other. Look at
12846 # each directory in the input file's path. If it is already in use,
12847 # assume it is correct, and is merely being re-used, but if we
12848 # truncate it to 8 characters, and find that there are two directories
12849 # that are the same for the first 8 characters, but differ after that,
12850 # then that is a problem.
12851 foreach my $directory (@$directories_ref) {
12852 my $short_dir = substr($directory, 0, 8);
12853 if (defined $full_dir_name_of{$short_dir}) {
12854 next if $full_dir_name_of{$short_dir} eq $directory;
12855 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12858 $full_dir_name_of{$short_dir} = $directory;
12862 my $path = join '/', @$directories_ref;
12863 $path .= '/' if $path;
12865 # Remove interior underscores.
12866 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12868 # Change any non-word character into an underscore, and truncate to 8.
12869 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12870 substr($filename, 8) = "" if length($filename) > 8;
12872 # Make sure the basename doesn't conflict with something we
12873 # might have already written. If we have, say,
12880 while (my $num = $base_names{$path}{lc $filename}++) {
12881 $num++; # so basenames with numbers start with '2', which
12882 # just looks more natural.
12884 # Want to append $num, but if it'll make the basename longer
12885 # than 8 characters, pre-truncate $filename so that the result
12887 my $delta = length($filename) + length($num) - 8;
12889 substr($filename, -$delta) = $num;
12894 if ($warn && ! $warned) {
12896 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12900 return $filename if $mutable;
12902 # If not changeable, must return the input name, but warn if needed to
12903 # change it beyond shortening it.
12904 if ($name ne $filename
12905 && substr($name, 0, length($filename)) ne $filename) {
12906 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12912 # The pod file contains a very large table. Many of the lines in that table
12913 # would exceed a typical output window's size, and so need to be wrapped with
12914 # a hanging indent to make them look good. The pod language is really
12915 # insufficient here. There is no general construct to do that in pod, so it
12916 # is done here by beginning each such line with a space to cause the result to
12917 # be output without formatting, and doing all the formatting here. This leads
12918 # to the result that if the eventual display window is too narrow it won't
12919 # look good, and if the window is too wide, no advantage is taken of that
12920 # extra width. A further complication is that the output may be indented by
12921 # the formatter so that there is less space than expected. What I (khw) have
12922 # done is to assume that that indent is a particular number of spaces based on
12923 # what it is in my Linux system; people can always resize their windows if
12924 # necessary, but this is obviously less than desirable, but the best that can
12926 my $automatic_pod_indent = 8;
12928 # Try to format so that uses fewest lines, but few long left column entries
12929 # slide into the right column. An experiment on 5.1 data yielded the
12930 # following percentages that didn't cut into the other side along with the
12931 # associated first-column widths
12933 # 80% not too bad except for a few blocks
12934 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12936 my $indent_info_column = 27; # 75% of lines didn't have overlap
12938 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12939 # The 3 is because of:
12940 # 1 for the leading space to tell the pod formatter to
12943 # 1 for the space between the flag and the main data
12945 sub format_pod_line ($$$;$$) {
12946 # Take a pod line and return it, formatted properly
12948 my $first_column_width = shift;
12949 my $entry = shift; # Contents of left column
12950 my $info = shift; # Contents of right column
12952 my $status = shift || ""; # Any flag
12954 my $loose_match = shift; # Boolean.
12955 $loose_match = 1 unless defined $loose_match;
12957 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12960 $flags .= $STRICTER if ! $loose_match;
12962 $flags .= $status if $status;
12964 # There is a blank in the left column to cause the pod formatter to
12965 # output the line as-is.
12966 return sprintf " %-*s%-*s %s\n",
12967 # The first * in the format is replaced by this, the -1 is
12968 # to account for the leading blank. There isn't a
12969 # hard-coded blank after this to separate the flags from
12970 # the rest of the line, so that in the unlikely event that
12971 # multiple flags are shown on the same line, they both
12972 # will get displayed at the expense of that separation,
12973 # but since they are left justified, a blank will be
12974 # inserted in the normal case.
12978 # The other * in the format is replaced by this number to
12979 # cause the first main column to right fill with blanks.
12980 # The -1 is for the guaranteed blank following it.
12981 $first_column_width - $FILLER - 1,
12986 my @zero_match_tables; # List of tables that have no matches in this release
12988 sub make_re_pod_entries($) {
12989 # This generates the entries for the pod file for a given table.
12990 # Also done at this time are any children tables. The output looks like:
12991 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12993 my $input_table = shift; # Table the entry is for
12994 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12996 # Generate parent and all its children at the same time.
12997 return if $input_table->parent != $input_table;
12999 my $property = $input_table->property;
13000 my $type = $property->type;
13001 my $full_name = $property->full_name;
13003 my $count = $input_table->count;
13004 my $string_count = clarify_number($count);
13005 my $status = $input_table->status;
13006 my $status_info = $input_table->status_info;
13007 my $caseless_equivalent = $input_table->caseless_equivalent;
13009 my $entry_for_first_table; # The entry for the first table output.
13010 # Almost certainly, it is the parent.
13012 # For each related table (including itself), we will generate a pod entry
13013 # for each name each table goes by
13014 foreach my $table ($input_table, $input_table->children) {
13016 # utf8_heavy.pl cannot deal with null string property values, so skip
13017 # any tables that have no non-null names.
13018 next if ! grep { $_->name ne "" } $table->aliases;
13020 # First, gather all the info that applies to this table as a whole.
13022 push @zero_match_tables, $table if $count == 0;
13024 my $table_property = $table->property;
13026 # The short name has all the underscores removed, while the full name
13027 # retains them. Later, we decide whether to output a short synonym
13028 # for the full one, we need to compare apples to apples, so we use the
13029 # short name's length including underscores.
13030 my $table_property_short_name_length;
13031 my $table_property_short_name
13032 = $table_property->short_name(\$table_property_short_name_length);
13033 my $table_property_full_name = $table_property->full_name;
13035 # Get how much savings there is in the short name over the full one
13036 # (delta will always be <= 0)
13037 my $table_property_short_delta = $table_property_short_name_length
13038 - length($table_property_full_name);
13039 my @table_description = $table->description;
13040 my @table_note = $table->note;
13042 # Generate an entry for each alias in this table.
13043 my $entry_for_first_alias; # saves the first one encountered.
13044 foreach my $alias ($table->aliases) {
13046 # Skip if not to go in pod.
13047 next unless $alias->make_re_pod_entry;
13049 # Start gathering all the components for the entry
13050 my $name = $alias->name;
13052 # Skip if name is empty, as can't be accessed by regexes.
13053 next if $name eq "";
13055 my $entry; # Holds the left column, may include extras
13056 my $entry_ref; # To refer to the left column's contents from
13057 # another entry; has no extras
13059 # First the left column of the pod entry. Tables for the $perl
13060 # property always use the single form.
13061 if ($table_property == $perl) {
13062 $entry = "\\p{$name}";
13063 $entry_ref = "\\p{$name}";
13065 else { # Compound form.
13067 # Only generate one entry for all the aliases that mean true
13068 # or false in binary properties. Append a '*' to indicate
13069 # some are missing. (The heading comment notes this.)
13071 if ($type == $BINARY) {
13072 next if $name ne 'N' && $name ne 'Y';
13075 elsif ($type != $FORCED_BINARY) {
13080 # Forced binary properties require special handling. It
13081 # has two sets of tables, one set is true/false; and the
13082 # other set is everything else. Entries are generated for
13083 # each set. Use the Bidi_Mirrored property (which appears
13084 # in all Unicode versions) to get a list of the aliases
13085 # for the true/false tables. Of these, only output the N
13086 # and Y ones, the same as, a regular binary property. And
13087 # output all the rest, same as a non-binary property.
13088 my $bm = property_ref("Bidi_Mirrored");
13089 if ($name eq 'N' || $name eq 'Y') {
13091 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
13092 $bm->table("N")->aliases)
13101 # Colon-space is used to give a little more space to be easier
13104 . $table_property_full_name
13107 # But for the reference to this entry, which will go in the
13108 # right column, where space is at a premium, use equals
13110 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
13113 # Then the right (info) column. This is stored as components of
13114 # an array for the moment, then joined into a string later. For
13115 # non-internal only properties, begin the info with the entry for
13116 # the first table we encountered (if any), as things are ordered
13117 # so that that one is the most descriptive. This leads to the
13118 # info column of an entry being a more descriptive version of the
13121 if ($name =~ /^_/) {
13123 '(For internal use by Perl, not necessarily stable)';
13125 elsif ($entry_for_first_alias) {
13126 push @info, $entry_for_first_alias;
13129 # If this entry is equivalent to another, add that to the info,
13130 # using the first such table we encountered
13131 if ($entry_for_first_table) {
13133 push @info, "(= $entry_for_first_table)";
13136 push @info, $entry_for_first_table;
13140 # If the name is a large integer, add an equivalent with an
13141 # exponent for better readability
13142 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
13143 push @info, sprintf "(= %.1e)", $name
13146 my $parenthesized = "";
13147 if (! $entry_for_first_alias) {
13149 # This is the first alias for the current table. The alias
13150 # array is ordered so that this is the fullest, most
13151 # descriptive alias, so it gets the fullest info. The other
13152 # aliases are mostly merely pointers to this one, using the
13153 # information already added above.
13155 # Display any status message, but only on the parent table
13156 if ($status && ! $entry_for_first_table) {
13157 push @info, $status_info;
13160 # Put out any descriptive info
13161 if (@table_description || @table_note) {
13162 push @info, join "; ", @table_description, @table_note;
13165 # Look to see if there is a shorter name we can point people
13167 my $standard_name = standardize($name);
13169 my $proposed_short = $table->short_name;
13170 if (defined $proposed_short) {
13171 my $standard_short = standardize($proposed_short);
13173 # If the short name is shorter than the standard one, or
13174 # even it it's not, but the combination of it and its
13175 # short property name (as in \p{prop=short} ($perl doesn't
13176 # have this form)) saves at least two characters, then,
13177 # cause it to be listed as a shorter synonym.
13178 if (length $standard_short < length $standard_name
13179 || ($table_property != $perl
13180 && (length($standard_short)
13181 - length($standard_name)
13182 + $table_property_short_delta) # (<= 0)
13185 $short_name = $proposed_short;
13186 if ($table_property != $perl) {
13187 $short_name = $table_property_short_name
13190 $short_name = "\\p{$short_name}";
13194 # And if this is a compound form name, see if there is a
13195 # single form equivalent
13197 if ($table_property != $perl) {
13199 # Special case the binary N tables, so that will print
13200 # \P{single}, but use the Y table values to populate
13201 # 'single', as we haven't likewise populated the N table.
13202 # For forced binary tables, we can't just look at the N
13203 # table, but must see if this table is equivalent to the N
13204 # one, as there are two equivalent beasts in these
13208 if ( ($type == $BINARY
13209 && $input_table == $property->table('No'))
13210 || ($type == $FORCED_BINARY
13211 && $property->table('No')->
13212 is_set_equivalent_to($input_table)))
13214 $test_table = $property->table('Yes');
13218 $test_table = $input_table;
13222 # Look for a single form amongst all the children.
13223 foreach my $table ($test_table->children) {
13224 next if $table->property != $perl;
13225 my $proposed_name = $table->short_name;
13226 next if ! defined $proposed_name;
13228 # Don't mention internal-only properties as a possible
13229 # single form synonym
13230 next if substr($proposed_name, 0, 1) eq '_';
13232 $proposed_name = "\\$p\{$proposed_name}";
13233 if (! defined $single_form
13234 || length($proposed_name) < length $single_form)
13236 $single_form = $proposed_name;
13238 # The goal here is to find a single form; not the
13239 # shortest possible one. We've already found a
13240 # short name. So, stop at the first single form
13241 # found, which is likely to be closer to the
13248 # Ouput both short and single in the same parenthesized
13249 # expression, but with only one of 'Single', 'Short' if there
13251 if ($short_name || $single_form || $table->conflicting) {
13252 $parenthesized .= "Short: $short_name" if $short_name;
13253 if ($short_name && $single_form) {
13254 $parenthesized .= ', ';
13256 elsif ($single_form) {
13257 $parenthesized .= 'Single: ';
13259 $parenthesized .= $single_form if $single_form;
13263 if ($caseless_equivalent != 0) {
13264 $parenthesized .= '; ' if $parenthesized ne "";
13265 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13269 # Warn if this property isn't the same as one that a
13270 # semi-casual user might expect. The other components of this
13271 # parenthesized structure are calculated only for the first entry
13272 # for this table, but the conflicting is deemed important enough
13273 # to go on every entry.
13274 my $conflicting = join " NOR ", $table->conflicting;
13275 if ($conflicting) {
13276 $parenthesized .= '; ' if $parenthesized ne "";
13277 $parenthesized .= "NOT $conflicting";
13280 push @info, "($parenthesized)" if $parenthesized;
13282 if ($name =~ /_$/ && $alias->loose_match) {
13283 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13286 if ($table_property != $perl && $table->perl_extension) {
13287 push @info, '(Perl extension)';
13289 push @info, "($string_count)";
13291 # Now, we have both the entry and info so add them to the
13292 # list of all the properties.
13293 push @match_properties,
13294 format_pod_line($indent_info_column,
13298 $alias->loose_match);
13300 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13301 } # End of looping through the aliases for this table.
13303 if (! $entry_for_first_table) {
13304 $entry_for_first_table = $entry_for_first_alias;
13306 } # End of looping through all the related tables
13310 sub make_ucd_table_pod_entries {
13313 # Generate the entries for the UCD section of the pod for $table. This
13314 # also calculates if names are ambiguous, so has to be called even if the
13315 # pod is not being output
13317 my $short_name = $table->name;
13318 my $standard_short_name = standardize($short_name);
13319 my $full_name = $table->full_name;
13320 my $standard_full_name = standardize($full_name);
13322 my $full_info = ""; # Text of info column for full-name entries
13323 my $other_info = ""; # Text of info column for short-name entries
13324 my $short_info = ""; # Text of info column for other entries
13325 my $meaning = ""; # Synonym of this table
13327 my $property = ($table->isa('Property'))
13329 : $table->parent->property;
13331 my $perl_extension = $table->perl_extension;
13333 # Get the more official name for for perl extensions that aren't
13334 # stand-alone properties
13335 if ($perl_extension && $property != $table) {
13336 if ($property == $perl ||$property->type == $BINARY) {
13337 $meaning = $table->complete_name;
13340 $meaning = $property->full_name . "=$full_name";
13344 # There are three types of info column. One for the short name, one for
13345 # the full name, and one for everything else. They mostly are the same,
13346 # so initialize in the same loop.
13347 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
13348 if ($perl_extension && $property != $table) {
13350 # Add the synonymous name for the non-full name entries; and to
13351 # the full-name entry if it adds extra information
13352 if ($info_ref == \$other_info
13353 || ($info_ref == \$short_info
13354 && $standard_short_name ne $standard_full_name)
13355 || standardize($meaning) ne $standard_full_name
13357 $$info_ref .= "$meaning.";
13360 elsif ($info_ref != \$full_info) {
13362 # Otherwise, the non-full name columns include the full name
13363 $$info_ref .= $full_name;
13366 # And the full-name entry includes the short name, if different
13367 if ($info_ref == \$full_info
13368 && $standard_short_name ne $standard_full_name)
13370 $full_info =~ s/\.\Z//;
13371 $full_info .= " " if $full_info;
13372 $full_info .= "(Short: $short_name)";
13375 if ($table->perl_extension) {
13376 $$info_ref =~ s/\.\Z//;
13377 $$info_ref .= ". " if $$info_ref;
13378 $$info_ref .= "(Perl extension)";
13382 # Add any extra annotations to the full name entry
13383 foreach my $more_info ($table->description,
13385 $table->status_info)
13387 next unless $more_info;
13388 $full_info =~ s/\.\Z//;
13389 $full_info .= ". " if $full_info;
13390 $full_info .= $more_info;
13393 # These keep track if have created full and short name pod entries for the
13396 my $done_short = 0;
13398 # Every possible name is kept track of, even those that aren't going to be
13399 # output. This way we can be sure to find the ambiguities.
13400 foreach my $alias ($table->aliases) {
13401 my $name = $alias->name;
13402 my $standard = standardize($name);
13404 my $output_this = $alias->ucd;
13406 # If the full and short names are the same, we want to output the full
13407 # one's entry, so it has priority.
13408 if ($standard eq $standard_full_name) {
13409 next if $done_full;
13411 $info = $full_info;
13413 elsif ($standard eq $standard_short_name) {
13414 next if $done_short;
13416 next if $standard_short_name eq $standard_full_name;
13417 $info = $short_info;
13420 $info = $other_info;
13423 # Here, we have set up the two columns for this entry. But if an
13424 # entry already exists for this name, we have to decide which one
13425 # we're going to later output.
13426 if (exists $ucd_pod{$standard}) {
13428 # If the two entries refer to the same property, it's not going to
13429 # be ambiguous. (Likely it's because the names when standardized
13430 # are the same.) But that means if they are different properties,
13431 # there is ambiguity.
13432 if ($ucd_pod{$standard}->{'property'} != $property) {
13434 # Here, we have an ambiguity. This code assumes that one is
13435 # scheduled to be output and one not and that one is a perl
13436 # extension (which is not to be output) and the other isn't.
13437 # If those assumptions are wrong, things have to be rethought.
13438 if ($ucd_pod{$standard}{'output_this'} == $output_this
13439 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
13440 || $output_this == $perl_extension)
13442 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
13445 # We modifiy the info column of the one being output to
13446 # indicate the ambiguity. Set $which to point to that one's
13449 if ($ucd_pod{$standard}{'output_this'}) {
13450 $which = \$ucd_pod{$standard}->{'info'};
13454 $meaning = $ucd_pod{$standard}{'meaning'};
13458 $$which =~ s/\.\Z//;
13459 $$which .= "; NOT '$standard' meaning '$meaning'";
13461 $ambiguous_names{$standard} = 1;
13464 # Use the non-perl-extension variant
13465 next unless $ucd_pod{$standard}{'perl_extension'};
13468 # Store enough information about this entry that we can later look for
13469 # ambiguities, and output it properly.
13470 $ucd_pod{$standard} = { 'name' => $name,
13472 'meaning' => $meaning,
13473 'output_this' => $output_this,
13474 'perl_extension' => $perl_extension,
13475 'property' => $property,
13476 'status' => $alias->status,
13478 } # End of looping through all this table's aliases
13483 sub pod_alphanumeric_sort {
13484 # Sort pod entries alphanumerically.
13486 # The first few character columns are filler, plus the '\p{'; and get rid
13487 # of all the trailing stuff, starting with the trailing '}', so as to sort
13488 # on just 'Name=Value'
13489 (my $a = lc $a) =~ s/^ .*? { //x;
13491 (my $b = lc $b) =~ s/^ .*? { //x;
13494 # Determine if the two operands are both internal only or both not.
13495 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13496 # should be the underscore that begins internal only
13497 my $a_is_internal = (substr($a, 0, 1) eq '_');
13498 my $b_is_internal = (substr($b, 0, 1) eq '_');
13500 # Sort so the internals come last in the table instead of first (which the
13501 # leading underscore would otherwise indicate).
13502 if ($a_is_internal != $b_is_internal) {
13503 return 1 if $a_is_internal;
13507 # Determine if the two operands are numeric property values or not.
13508 # A numeric property will look like xyz: 3. But the number
13509 # can begin with an optional minus sign, and may have a
13510 # fraction or rational component, like xyz: 3/2. If either
13511 # isn't numeric, use alphabetic sort.
13512 my ($a_initial, $a_number) =
13513 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13514 return $a cmp $b unless defined $a_number;
13515 my ($b_initial, $b_number) =
13516 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13517 return $a cmp $b unless defined $b_number;
13519 # Here they are both numeric, but use alphabetic sort if the
13520 # initial parts don't match
13521 return $a cmp $b if $a_initial ne $b_initial;
13523 # Convert rationals to floating for the comparison.
13524 $a_number = eval $a_number if $a_number =~ qr{/};
13525 $b_number = eval $b_number if $b_number =~ qr{/};
13527 return $a_number <=> $b_number;
13531 # Create the .pod file. This generates the various subsections and then
13532 # combines them in one big HERE document.
13534 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13536 return unless defined $pod_directory;
13537 print "Making pod file\n" if $verbosity >= $PROGRESS;
13539 my $exception_message =
13540 '(Any exceptions are individually noted beginning with the word NOT.)';
13542 if (-e 'Blocks.txt') {
13544 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13545 # if the global $has_In_conflicts indicates we have them.
13546 push @match_properties, format_pod_line($indent_info_column,
13549 . (($has_In_conflicts)
13550 ? " $exception_message"
13552 @block_warning = << "END";
13554 Matches in the Block property have shortcuts that begin with "In_". For
13555 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13556 backward compatibility, if there is no conflict with another shortcut, these
13557 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13558 are numerous such conflicting shortcuts. Use of these forms for Block is
13559 discouraged, and are flagged as such, not only because of the potential
13560 confusion as to what is meant, but also because a later release of Unicode may
13561 preempt the shortcut, and your program would no longer be correct. Use the
13562 "In_" form instead to avoid this, or even more clearly, use the compound form,
13563 e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13567 my $text = $Is_flags_text;
13568 $text = "$exception_message $text" if $has_Is_conflicts;
13570 # And the 'Is_ line';
13571 push @match_properties, format_pod_line($indent_info_column,
13575 # Sort the properties array for output. It is sorted alphabetically
13576 # except numerically for numeric properties, and only output unique lines.
13577 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13579 my $formatted_properties = simple_fold(\@match_properties,
13581 # indent succeeding lines by two extra
13582 # which looks better
13583 $indent_info_column + 2,
13585 # shorten the line length by how much
13586 # the formatter indents, so the folded
13587 # line will fit in the space
13588 # presumably available
13589 $automatic_pod_indent);
13590 # Add column headings, indented to be a little more centered, but not
13592 $formatted_properties = format_pod_line($indent_info_column,
13596 . $formatted_properties;
13598 # Generate pod documentation lines for the tables that match nothing
13599 my $zero_matches = "";
13600 if (@zero_match_tables) {
13601 @zero_match_tables = uniques(@zero_match_tables);
13602 $zero_matches = join "\n\n",
13603 map { $_ = '=item \p{' . $_->complete_name . "}" }
13604 sort { $a->complete_name cmp $b->complete_name }
13605 @zero_match_tables;
13607 $zero_matches = <<END;
13609 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
13611 Unicode has some property-value pairs that currently don't match anything.
13612 This happens generally either because they are obsolete, or they exist for
13613 symmetry with other forms, but no language has yet been encoded that uses
13614 them. In this version of Unicode, the following match zero code points:
13625 # Generate list of properties that we don't accept, grouped by the reasons
13626 # why. This is so only put out the 'why' once, and then list all the
13627 # properties that have that reason under it.
13629 my %why_list; # The keys are the reasons; the values are lists of
13630 # properties that have the key as their reason
13632 # For each property, add it to the list that are suppressed for its reason
13633 # The sort will cause the alphabetically first properties to be added to
13634 # each list first, so each list will be sorted.
13635 foreach my $property (sort keys %why_suppressed) {
13636 push @{$why_list{$why_suppressed{$property}}}, $property;
13639 # For each reason (sorted by the first property that has that reason)...
13640 my @bad_re_properties;
13641 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13644 # Add to the output, all the properties that have that reason.
13645 my $has_item = 0; # Flag if actually output anything.
13646 foreach my $name (@{$why_list{$why}}) {
13648 # Split compound names into $property and $table components
13649 my $property = $name;
13651 if ($property =~ / (.*) = (.*) /x) {
13656 # This release of Unicode may not have a property that is
13657 # suppressed, so don't reference a non-existent one.
13658 $property = property_ref($property);
13659 next if ! defined $property;
13661 # And since this list is only for match tables, don't list the
13662 # ones that don't have match tables.
13663 next if ! $property->to_create_match_tables;
13665 # Find any abbreviation, and turn it into a compound name if this
13666 # is a property=value pair.
13667 my $short_name = $property->name;
13668 $short_name .= '=' . $property->table($table)->name if $table;
13670 # Start with an empty line.
13671 push @bad_re_properties, "\n\n" unless $has_item;
13673 # And add the property as an item for the reason.
13674 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13678 # And add the reason under the list of properties, if such a list
13679 # actually got generated. Note that the header got added
13680 # unconditionally before. But pod ignores extra blank lines, so no
13682 push @bad_re_properties, "\n$why\n" if $has_item;
13684 } # End of looping through each reason.
13686 if (! @bad_re_properties) {
13687 push @bad_re_properties,
13688 "*** This installation accepts ALL non-Unihan properties ***";
13691 # Add =over only if non-empty to avoid an empty =over/=back section,
13692 # which is considered bad form.
13693 unshift @bad_re_properties, "\n=over 4\n";
13694 push @bad_re_properties, "\n=back\n";
13697 # Similiarly, generate a list of files that we don't use, grouped by the
13698 # reasons why. First, create a hash whose keys are the reasons, and whose
13699 # values are anonymous arrays of all the files that share that reason.
13700 my %grouped_by_reason;
13701 foreach my $file (keys %ignored_files) {
13702 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
13704 foreach my $file (keys %skipped_files) {
13705 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
13708 # Then, sort each group.
13709 foreach my $group (keys %grouped_by_reason) {
13710 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
13711 @{$grouped_by_reason{$group}} ;
13714 # Finally, create the output text. For each reason (sorted by the
13715 # alphabetically first file that has that reason)...
13717 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
13718 cmp lc $grouped_by_reason{$b}->[0]
13720 keys %grouped_by_reason)
13722 # Add all the files that have that reason to the output. Start
13723 # with an empty line.
13724 push @unused_files, "\n\n";
13725 push @unused_files, map { "\n=item F<$_> \n" }
13726 @{$grouped_by_reason{$reason}};
13727 # And add the reason under the list of files
13728 push @unused_files, "\n$reason\n";
13731 # Similarly, create the output text for the UCD section of the pod
13733 foreach my $key (keys %ucd_pod) {
13734 next unless $ucd_pod{$key}->{'output_this'};
13735 push @ucd_pod, format_pod_line($indent_info_column,
13736 $ucd_pod{$key}->{'name'},
13737 $ucd_pod{$key}->{'info'},
13738 $ucd_pod{$key}->{'status'},
13742 # Sort alphabetically, and fold for output
13743 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
13744 my $ucd_pod = simple_fold(\@ucd_pod,
13746 $indent_info_column,
13747 $automatic_pod_indent);
13748 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
13753 # Everything is ready to assemble.
13754 my @OUT = << "END";
13759 To change this file, edit $0 instead.
13765 $pod_file - Index of Unicode Version $string_version character properties in Perl
13769 This document provides information about the portion of the Unicode database
13770 that deals with character properties, that is the portion that is defined on
13771 single code points. (L</Other information in the Unicode data base>
13772 below briefly mentions other data that Unicode provides.)
13774 Perl can provide access to all non-provisional Unicode character properties,
13775 though not all are enabled by default. The omitted ones are the Unihan
13776 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
13777 deprecated or Unicode-internal properties. (An installation may choose to
13778 recompile Perl's tables to change this. See L<Unicode character
13779 properties that are NOT accepted by Perl>.)
13781 For most purposes, access to Unicode properties from the Perl core is through
13782 regular expression matches, as described in the next section.
13783 For some special purposes, and to access the properties that are not suitable
13784 for regular expression matching, all the Unicode character properties that
13785 Perl handles are accessible via the standard L<Unicode::UCD> module, as
13786 described in the section L</Properties accessible through Unicode::UCD>.
13788 Perl also provides some additional extensions and short-cut synonyms
13789 for Unicode properties.
13791 This document merely lists all available properties and does not attempt to
13792 explain what each property really means. There is a brief description of each
13793 Perl extension; see L<perlunicode/Other Properties> for more information on
13794 these. There is some detail about Blocks, Scripts, General_Category,
13795 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
13796 official Unicode properties, refer to the Unicode standard. A good starting
13797 place is L<$unicode_reference_url>.
13799 Note that you can define your own properties; see
13800 L<perlunicode/"User-Defined Character Properties">.
13802 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
13804 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13805 most of the Unicode character properties. The table below shows all these
13806 constructs, both single and compound forms.
13808 B<Compound forms> consist of two components, separated by an equals sign or a
13809 colon. The first component is the property name, and the second component is
13810 the particular value of the property to match against, for example,
13811 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
13812 whose Script property is Greek.
13814 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
13815 their equivalent compound forms. The table shows these equivalences. (In our
13816 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
13817 There are also a few Perl-defined single forms that are not shortcuts for a
13818 compound form. One such is C<\\p{Word}>. These are also listed in the table.
13820 In parsing these constructs, Perl always ignores Upper/lower case differences
13821 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13822 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13823 the left brace completely changes the meaning of the construct, from "match"
13824 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13825 for improved legibility.
13827 Also, white space, hyphens, and underscores are also normally ignored
13828 everywhere between the {braces}, and hence can be freely added or removed
13829 even if the C</x> modifier hasn't been specified on the regular expression.
13830 But $a_bold_stricter at the beginning of an entry in the table below
13831 means that tighter (stricter) rules are used for that entry:
13835 =item Single form (C<\\p{name}>) tighter rules:
13837 White space, hyphens, and underscores ARE significant
13842 =item * white space adjacent to a non-word character
13844 =item * underscores separating digits in numbers
13848 That means, for example, that you can freely add or remove white space
13849 adjacent to (but within) the braces without affecting the meaning.
13851 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
13853 The tighter rules given above for the single form apply to everything to the
13854 right of the colon or equals; the looser rules still apply to everything to
13857 That means, for example, that you can freely add or remove white space
13858 adjacent to (but within) the braces and the colon or equal sign.
13862 Some properties are considered obsolete by Unicode, but still available.
13863 There are several varieties of obsolescence:
13869 A property may be stabilized. Such a determination does not indicate
13870 that the property should or should not be used; instead it is a declaration
13871 that the property will not be maintained nor extended for newly encoded
13872 characters. Such properties are marked with $a_bold_stabilized in the
13877 A property may be deprecated, perhaps because its original intent
13878 has been replaced by another property, or because its specification was
13879 somehow defective. This means that its use is strongly
13880 discouraged, so much so that a warning will be issued if used, unless the
13881 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13882 statement. $A_bold_deprecated flags each such entry in the table, and
13883 the entry there for the longest, most descriptive version of the property will
13884 give the reason it is deprecated, and perhaps advice. Perl may issue such a
13885 warning, even for properties that aren't officially deprecated by Unicode,
13886 when there used to be characters or code points that were matched by them, but
13887 no longer. This is to warn you that your program may not work like it did on
13888 earlier Unicode releases.
13890 A deprecated property may be made unavailable in a future Perl version, so it
13891 is best to move away from them.
13893 A deprecated property may also be stabilized, but this fact is not shown.
13897 Properties marked with $a_bold_obsolete in the table are considered (plain)
13898 obsolete. Generally this designation is given to properties that Unicode once
13899 used for internal purposes (but not any longer).
13903 Some Perl extensions are present for backwards compatibility and are
13904 discouraged from being used, but are not obsolete. $A_bold_discouraged
13905 flags each such entry in the table. Future Unicode versions may force
13906 some of these extensions to be removed without warning, replaced by another
13907 property with the same name that means something different. Use the
13908 equivalent shown instead.
13912 The table below has two columns. The left column contains the C<\\p{}>
13913 constructs to look up, possibly preceded by the flags mentioned above; and
13914 the right column contains information about them, like a description, or
13915 synonyms. It shows both the single and compound forms for each property that
13916 has them. If the left column is a short name for a property, the right column
13917 will give its longer, more descriptive name; and if the left column is the
13918 longest name, the right column will show any equivalent shortest name, in both
13919 single and compound forms if applicable.
13921 The right column will also caution you if a property means something different
13922 than what might normally be expected.
13924 All single forms are Perl extensions; a few compound forms are as well, and
13927 Numbers in (parentheses) indicate the total number of code points matched by
13928 the property. For emphasis, those properties that match no code points at all
13929 are listed as well in a separate section following the table.
13931 Most properties match the same code points regardless of whether C<"/i">
13932 case-insensitive matching is specified or not. But a few properties are
13933 affected. These are shown with the notation
13935 (/i= other_property)
13937 in the second column. Under case-insensitive matching they match the
13938 same code pode points as the property "other_property".
13940 There is no description given for most non-Perl defined properties (See
13941 L<$unicode_reference_url> for that).
13943 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13944 combinations. For example, entries like:
13946 \\p{Gc: *} \\p{General_Category: *}
13948 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13949 for the latter is also valid for the former. Similarly,
13953 means that if and only if, for example, C<\\p{Foo}> exists, then
13954 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13955 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13956 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13959 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13960 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13961 'N*' to indicate this, and doesn't have separate entries for the other
13962 possibilities. Note that not all properties which have values 'Yes' and 'No'
13963 are binary, and they have all their values spelled out without using this wild
13964 card, and a C<NOT> clause in their description that highlights their not being
13965 binary. These also require the compound form to match them, whereas true
13966 binary properties have both single and compound forms available.
13968 Note that all non-essential underscores are removed in the display of the
13975 =item Z<>B<*> is a wild-card
13977 =item B<(\\d+)> in the info column gives the number of code points matched by
13980 =item B<$DEPRECATED> means this is deprecated.
13982 =item B<$OBSOLETE> means this is obsolete.
13984 =item B<$STABILIZED> means this is stabilized.
13986 =item B<$STRICTER> means tighter (stricter) name matching applies.
13988 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13993 $formatted_properties
13997 =head1 Properties accessible through Unicode::UCD
13999 All the Unicode character properties mentioned above (except for those marked
14000 as for internal use by Perl) are also accessible by
14001 L<Unicode::UCD/prop_invlist()>.
14003 Due to their nature, not all Unicode character properties are suitable for
14004 regular expression matches, nor C<prop_invlist()>. The remaining
14005 non-provisional, non-internal ones are accessible via
14006 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
14007 hasn't included; see L<below for which those are|/Unicode character properties
14008 that are NOT accepted by Perl>).
14010 For compatibility with other parts of Perl, all the single forms given in the
14011 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
14012 are recognized. BUT, there are some ambiguities between some Perl extensions
14013 and the Unicode properties, all of which are silently resolved in favor of the
14014 official Unicode property. To avoid surprises, you should only use
14015 C<prop_invmap()> for forms listed in the table below, which omits the
14016 non-recommended ones. The affected forms are the Perl single form equivalents
14017 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
14018 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
14019 whose short name is C<sc>. The table indicates the current ambiguities in the
14020 INFO column, beginning with the word C<"NOT">.
14022 The standard Unicode properties listed below are documented in
14023 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
14024 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
14025 L<perlunicode/Other Properties>;
14027 The first column in the table is a name for the property; the second column is
14028 an alternative name, if any, plus possibly some annotations. The alternative
14029 name is the property's full name, unless that would simply repeat the first
14030 column, in which case the second column indicates the property's short name
14031 (if different). The annotations are given only in the entry for the full
14032 name. If a property is obsolete, etc, the entry will be flagged with the same
14033 characters used in the table in the L<section above|/Properties accessible
14034 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
14038 =head1 Properties accessible through other means
14040 Certain properties are accessible also via core function calls. These are:
14042 Lowercase_Mapping lc() and lcfirst()
14043 Titlecase_Mapping ucfirst()
14044 Uppercase_Mapping uc()
14046 Also, Case_Folding is accessible through the C</i> modifier in regular
14049 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
14050 interpolation in double-quoted strings and regular expressions; and functions
14051 C<charnames::viacode()>, C<charnames::vianame()>, and
14052 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
14055 Finally, most properties related to decomposition are accessible via
14056 L<Unicode::Normalize>.
14058 =head1 Unicode character properties that are NOT accepted by Perl
14060 Perl will generate an error for a few character properties in Unicode when
14061 used in a regular expression. The non-Unihan ones are listed below, with the
14062 reasons they are not accepted, perhaps with work-arounds. The short names for
14063 the properties are listed enclosed in (parentheses).
14064 As described after the list, an installation can change the defaults and choose
14065 to accept any of these. The list is machine generated based on the
14066 choices made for the installation that generated this document.
14070 An installation can choose to allow any of these to be matched by downloading
14071 the Unicode database from L<http://www.unicode.org/Public/> to
14072 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
14073 controlling lists contained in the program
14074 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
14075 (C<\%Config> is available from the Config module).
14077 =head1 Other information in the Unicode data base
14079 The Unicode data base is delivered in two different formats. The XML version
14080 is valid for more modern Unicode releases. The other version is a collection
14081 of files. The two are intended to give equivalent information. Perl uses the
14082 older form; this allows you to recompile Perl to use early Unicode releases.
14084 The only non-character property that Perl currently supports is Named
14085 Sequences, in which a sequence of code points
14086 is given a name and generally treated as a single entity. (Perl supports
14087 these via the C<\\N{...}> double-quotish construct,
14088 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
14090 Below is a list of the files in the Unicode data base that Perl doesn't
14091 currently use, along with very brief descriptions of their purposes.
14092 Some of the names of the files have been shortened from those that Unicode
14093 uses, in order to allow them to be distinguishable from similarly named files
14094 on file systems for which only the first 8 characters of a name are
14105 L<$unicode_reference_url>
14113 # And write it. The 0 means no utf8.
14114 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
14118 sub make_Heavy () {
14119 # Create and write Heavy.pl, which passes info about the tables to
14122 # Stringify structures for output
14123 my $loose_property_name_of
14124 = simple_dumper(\%loose_property_name_of, ' ' x 4);
14125 chomp $loose_property_name_of;
14127 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
14128 chomp $stricter_to_file_of;
14130 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
14131 chomp $loose_to_file_of;
14133 my $nv_floating_to_rational
14134 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
14135 chomp $nv_floating_to_rational;
14137 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
14138 chomp $why_deprecated;
14140 # We set the key to the file when we associated files with tables, but we
14141 # couldn't do the same for the value then, as we might not have the file
14142 # for the alternate table figured out at that time.
14143 foreach my $cased (keys %caseless_equivalent_to) {
14144 my @path = $caseless_equivalent_to{$cased}->file_path;
14145 my $path = join '/', @path[1, -1];
14146 $caseless_equivalent_to{$cased} = $path;
14148 my $caseless_equivalent_to
14149 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
14150 chomp $caseless_equivalent_to;
14152 my $loose_property_to_file_of
14153 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
14154 chomp $loose_property_to_file_of;
14156 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
14157 chomp $file_to_swash_name;
14161 $INTERNAL_ONLY_HEADER
14163 # This file is for the use of utf8_heavy.pl and Unicode::UCD
14165 # Maps Unicode (not Perl single-form extensions) property names in loose
14166 # standard form to their corresponding standard names
14167 \%utf8::loose_property_name_of = (
14168 $loose_property_name_of
14171 # Maps property, table to file for those using stricter matching
14172 \%utf8::stricter_to_file_of = (
14173 $stricter_to_file_of
14176 # Maps property, table to file for those using loose matching
14177 \%utf8::loose_to_file_of = (
14181 # Maps floating point to fractional form
14182 \%utf8::nv_floating_to_rational = (
14183 $nv_floating_to_rational
14186 # If a floating point number doesn't have enough digits in it to get this
14187 # close to a fraction, it isn't considered to be that fraction even if all the
14188 # digits it does have match.
14189 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
14191 # Deprecated tables to generate a warning for. The key is the file containing
14192 # the table, so as to avoid duplication, as many property names can map to the
14193 # file, but we only need one entry for all of them.
14194 \%utf8::why_deprecated = (
14198 # A few properties have different behavior under /i matching. This maps
14199 # those to substitute files to use under /i.
14200 \%utf8::caseless_equivalent = (
14201 $caseless_equivalent_to
14204 # Property names to mapping files
14205 \%utf8::loose_property_to_file_of = (
14206 $loose_property_to_file_of
14209 # Files to the swash names within them.
14210 \%utf8::file_to_swash_name = (
14211 $file_to_swash_name
14217 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
14221 sub make_Name_pm () {
14222 # Create and write Name.pm, which contains subroutines and data to use in
14223 # conjunction with Name.pl
14225 # Maybe there's nothing to do.
14226 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
14230 $INTERNAL_ONLY_HEADER
14233 # Convert these structures to output format.
14234 my $code_points_ending_in_code_point =
14235 main::simple_dumper(\@code_points_ending_in_code_point,
14237 my $names = main::simple_dumper(\%names_ending_in_code_point,
14239 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
14242 # Do the same with the Hangul names,
14248 if ($has_hangul_syllables) {
14250 # Construct a regular expression of all the possible
14251 # combinations of the Hangul syllables.
14252 my @L_re; # Leading consonants
14253 for my $i ($LBase .. $LBase + $LCount - 1) {
14254 push @L_re, $Jamo{$i}
14256 my @V_re; # Middle vowels
14257 for my $i ($VBase .. $VBase + $VCount - 1) {
14258 push @V_re, $Jamo{$i}
14260 my @T_re; # Trailing consonants
14261 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
14262 push @T_re, $Jamo{$i}
14265 # The whole re is made up of the L V T combination.
14267 . join ('|', sort @L_re)
14269 . join ('|', sort @V_re)
14271 . join ('|', sort @T_re)
14274 # These hashes needed by the algorithm were generated
14275 # during reading of the Jamo.txt file
14276 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
14277 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
14278 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
14279 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
14286 # This module contains machine-generated tables and code for the
14287 # algorithmically-determinable Unicode character names. The following
14288 # routines can be used to translate between name and code point and vice versa
14292 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
14293 # two must be 10; if there are 5, the first must not be a 0. Written this
14294 # way to decrease backtracking. The first regex allows the code point to
14295 # be at the end of a word, but to work properly, the word shouldn't end
14296 # with a valid hex character. The second one won't match a code point at
14297 # the end of a word, and doesn't have the run-on issue
14298 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
14299 my \$code_point_re = qr/$code_point_re/;
14301 # In the following hash, the keys are the bases of names which includes
14302 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
14303 # of each key is another hash which is used to get the low and high ends
14304 # for each range of code points that apply to the name.
14305 my %names_ending_in_code_point = (
14309 # The following hash is a copy of the previous one, except is for loose
14310 # matching, so each name has blanks and dashes squeezed out
14311 my %loose_names_ending_in_code_point = (
14315 # And the following array gives the inverse mapping from code points to
14316 # names. Lowest code points are first
14317 my \@code_points_ending_in_code_point = (
14318 $code_points_ending_in_code_point
14321 # Earlier releases didn't have Jamos. No sense outputting
14322 # them unless will be used.
14323 if ($has_hangul_syllables) {
14326 # Convert from code point to Jamo short name for use in composing Hangul
14332 # Leading consonant (can be null)
14342 # Optional trailing consonant
14347 # Computed re that splits up a Hangul name into LVT or LV syllables
14348 my \$syllable_re = qr/$jamo_re/;
14350 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
14351 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
14353 # These constants names and values were taken from the Unicode standard,
14354 # version 5.1, section 3.12. They are used in conjunction with Hangul
14356 my \$SBase = $SBase_string;
14357 my \$LBase = $LBase_string;
14358 my \$VBase = $VBase_string;
14359 my \$TBase = $TBase_string;
14360 my \$SCount = $SCount;
14361 my \$LCount = $LCount;
14362 my \$VCount = $VCount;
14363 my \$TCount = $TCount;
14364 my \$NCount = \$VCount * \$TCount;
14366 } # End of has Jamos
14368 push @name, << 'END';
14370 sub name_to_code_point_special {
14371 my ($name, $loose) = @_;
14373 # Returns undef if not one of the specially handled names; otherwise
14374 # returns the code point equivalent to the input name
14375 # $loose is non-zero if to use loose matching, 'name' in that case
14376 # must be input as upper case with all blanks and dashes squeezed out.
14378 if ($has_hangul_syllables) {
14379 push @name, << 'END';
14381 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
14382 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
14384 return if $name !~ qr/^$syllable_re$/;
14385 my $L = $Jamo_L{$1};
14386 my $V = $Jamo_V{$2};
14387 my $T = (defined $3) ? $Jamo_T{$3} : 0;
14388 return ($L * $VCount + $V) * $TCount + $T + $SBase;
14392 push @name, << 'END';
14394 # Name must end in 'code_point' for this to handle.
14395 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
14396 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
14399 my $code_point = CORE::hex $2;
14403 $names_ref = \%loose_names_ending_in_code_point;
14406 return if $base !~ s/-$//;
14407 $names_ref = \%names_ending_in_code_point;
14410 # Name must be one of the ones which has the code point in it.
14411 return if ! $names_ref->{$base};
14413 # Look through the list of ranges that apply to this name to see if
14414 # the code point is in one of them.
14415 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
14416 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
14417 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
14419 # Here, the code point is in the range.
14420 return $code_point;
14423 # Here, looked like the name had a code point number in it, but
14424 # did not match one of the valid ones.
14428 sub code_point_to_name_special {
14429 my $code_point = shift;
14431 # Returns the name of a code point if algorithmically determinable;
14434 if ($has_hangul_syllables) {
14435 push @name, << 'END';
14437 # If in the Hangul range, calculate the name based on Unicode's
14439 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
14441 my $SIndex = $code_point - $SBase;
14442 my $L = $LBase + $SIndex / $NCount;
14443 my $V = $VBase + ($SIndex % $NCount) / $TCount;
14444 my $T = $TBase + $SIndex % $TCount;
14445 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
14446 $name .= $Jamo{$T} if $T != $TBase;
14451 push @name, << 'END';
14453 # Look through list of these code points for one in range.
14454 foreach my $hash (@code_points_ending_in_code_point) {
14455 return if $code_point < $hash->{'low'};
14456 if ($code_point <= $hash->{'high'}) {
14457 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
14460 return; # None found
14467 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
14472 # Create and write UCD.pl, which passes info about the tables to
14475 # Create a mapping from each alias of Perl single-form extensions to all
14476 # its equivalent aliases, for quick look-up.
14477 my %perlprop_to_aliases;
14478 foreach my $table ($perl->tables) {
14480 # First create the list of the aliases of each extension
14481 my @aliases_list; # List of legal aliases for this extension
14483 my $table_name = $table->name;
14484 my $standard_table_name = standardize($table_name);
14485 my $table_full_name = $table->full_name;
14486 my $standard_table_full_name = standardize($table_full_name);
14488 # Make sure that the list has both the short and full names
14489 push @aliases_list, $table_name, $table_full_name;
14491 my $found_ucd = 0; # ? Did we actually get an alias that should be
14492 # output for this table
14494 # Go through all the aliases (including the two just added), and add
14495 # any new unique ones to the list
14496 foreach my $alias ($table->aliases) {
14498 # Skip non-legal names
14499 next unless $alias->ok_as_filename;
14500 next unless $alias->ucd;
14502 $found_ucd = 1; # have at least one legal name
14504 my $name = $alias->name;
14505 my $standard = standardize($name);
14507 # Don't repeat a name that is equivalent to one already on the
14509 next if $standard eq $standard_table_name;
14510 next if $standard eq $standard_table_full_name;
14512 push @aliases_list, $name;
14515 # If there were no legal names, don't output anything.
14516 next unless $found_ucd;
14518 # To conserve memory in the program reading these in, omit full names
14519 # that are identical to the short name, when those are the only two
14520 # aliases for the property.
14521 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
14525 # Here, @aliases_list is the list of all the aliases that this
14526 # extension legally has. Now can create a map to it from each legal
14527 # standardized alias
14528 foreach my $alias ($table->aliases) {
14529 next unless $alias->ucd;
14530 next unless $alias->ok_as_filename;
14531 push @{$perlprop_to_aliases{standardize($alias->name)}},
14536 # Make a list of all combinations of properties/values that are suppressed.
14538 foreach my $property_name (keys %why_suppressed) {
14541 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
14543 # The hash may contain properties not in this release of Unicode
14544 next unless defined (my $property = property_ref($property_name));
14546 # Find all combinations
14547 foreach my $prop_alias ($property->aliases) {
14548 my $prop_alias_name = standardize($prop_alias->name);
14550 # If no =value, there's just one combination possibe for this
14551 if (! $value_name) {
14553 # The property may be suppressed, but there may be a proxy for
14554 # it, so it shouldn't be listed as suppressed
14555 next if $prop_alias->ucd;
14556 push @suppressed, $prop_alias_name;
14559 foreach my $value_alias ($property->table($value_name)->aliases)
14561 next if $value_alias->ucd;
14563 push @suppressed, "$prop_alias_name="
14564 . standardize($value_alias->name);
14570 # Convert the structure below (designed for Name.pm) to a form that UCD
14571 # wants, so it doesn't have to modify it at all; i.e. so that it includes
14572 # an element for the Hangul syllables in the appropriate place, and
14573 # otherwise changes the name to include the "-<code point>" suffix.
14574 my @algorithm_names;
14575 my $done_hangul = 0;
14577 # Copy it linearly.
14578 for my $i (0 .. @code_points_ending_in_code_point - 1) {
14580 # Insert the hanguls in the correct place.
14582 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
14585 push @algorithm_names, { low => $SBase,
14586 high => $SBase + $SCount - 1,
14587 name => '<hangul syllable>',
14591 # Copy the current entry, modified.
14592 push @algorithm_names, {
14593 low => $code_points_ending_in_code_point[$i]->{'low'},
14594 high => $code_points_ending_in_code_point[$i]->{'high'},
14596 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
14600 # Serialize these structures for output.
14601 my $loose_to_standard_value
14602 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
14603 chomp $loose_to_standard_value;
14605 my $string_property_loose_to_name
14606 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
14607 chomp $string_property_loose_to_name;
14609 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
14610 chomp $perlprop_to_aliases;
14612 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
14613 chomp $prop_aliases;
14615 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
14616 chomp $prop_value_aliases;
14618 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
14621 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
14622 chomp $algorithm_names;
14624 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
14625 chomp $ambiguous_names;
14627 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
14628 chomp $loose_defaults;
14632 $INTERNAL_ONLY_HEADER
14634 # This file is for the use of Unicode::UCD
14636 # Highest legal Unicode code point
14637 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
14640 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
14641 \$Unicode::UCD::HANGUL_COUNT = $SCount;
14643 # Keys are all the possible "prop=value" combinations, in loose form; values
14644 # are the standard loose name for the 'value' part of the key
14645 \%Unicode::UCD::loose_to_standard_value = (
14646 $loose_to_standard_value
14649 # String property loose names to standard loose name
14650 \%Unicode::UCD::string_property_loose_to_name = (
14651 $string_property_loose_to_name
14654 # Keys are Perl extensions in loose form; values are each one's list of
14656 \%Unicode::UCD::loose_perlprop_to_name = (
14657 $perlprop_to_aliases
14660 # Keys are standard property name; values are each one's aliases
14661 \%Unicode::UCD::prop_aliases = (
14665 # Keys of top level are standard property name; values are keys to another
14666 # hash, Each one is one of the property's values, in standard form. The
14667 # values are that prop-val's aliases. If only one specified, the short and
14668 # long alias are identical.
14669 \%Unicode::UCD::prop_value_aliases = (
14670 $prop_value_aliases
14673 # Ordered (by code point ordinal) list of the ranges of code points whose
14674 # names are algorithmically determined. Each range entry is an anonymous hash
14675 # of the start and end points and a template for the names within it.
14676 \@Unicode::UCD::algorithmic_named_code_points = (
14680 # The properties that as-is have two meanings, and which must be disambiguated
14681 \%Unicode::UCD::ambiguous_names = (
14685 # Keys are the prop-val combinations which are the default values for the
14686 # given property, expressed in standard loose form
14687 \%Unicode::UCD::loose_defaults = (
14691 # All combinations of names that are suppressed.
14692 # This is actually for UCD.t, so it knows which properties shouldn't have
14693 # entries. If it got any bigger, would probably want to put it in its own
14694 # file to use memory only when it was needed, in testing.
14695 \@Unicode::UCD::suppressed_properties = (
14702 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
14706 sub write_all_tables() {
14707 # Write out all the tables generated by this program to files, as well as
14708 # the supporting data structures, pod file, and .t file.
14710 my @writables; # List of tables that actually get written
14711 my %match_tables_to_write; # Used to collapse identical match tables
14712 # into one file. Each key is a hash function
14713 # result to partition tables into buckets.
14714 # Each value is an array of the tables that
14715 # fit in the bucket.
14717 # For each property ...
14718 # (sort so that if there is an immutable file name, it has precedence, so
14719 # some other property can't come in and take over its file name. If b's
14720 # file name is defined, will return 1, meaning to take it first; don't
14721 # care if both defined, as they had better be different anyway. And the
14722 # property named 'Perl' needs to be first (it doesn't have any immutable
14723 # file name) because empty properties are defined in terms of it's table
14726 foreach my $property (sort { return -1 if $a == $perl;
14727 return 1 if $b == $perl;
14728 return defined $b->file
14729 } property_ref('*'))
14731 my $type = $property->type;
14733 # And for each table for that property, starting with the mapping
14736 foreach my $table($property,
14738 # and all the match tables for it (if any), sorted so
14739 # the ones with the shortest associated file name come
14740 # first. The length sorting prevents problems of a
14741 # longer file taking a name that might have to be used
14742 # by a shorter one. The alphabetic sorting prevents
14743 # differences between releases
14744 sort { my $ext_a = $a->external_name;
14745 return 1 if ! defined $ext_a;
14746 my $ext_b = $b->external_name;
14747 return -1 if ! defined $ext_b;
14749 # But return the non-complement table before
14750 # the complement one, as the latter is defined
14751 # in terms of the former, and needs to have
14752 # the information for the former available.
14753 return 1 if $a->complement != 0;
14754 return -1 if $b->complement != 0;
14756 # Similarly, return a subservient table after
14758 return 1 if $a->leader != $a;
14759 return -1 if $b->leader != $b;
14761 my $cmp = length $ext_a <=> length $ext_b;
14763 # Return result if lengths not equal
14764 return $cmp if $cmp;
14766 # Alphabetic if lengths equal
14767 return $ext_a cmp $ext_b
14768 } $property->tables
14772 # Here we have a table associated with a property. It could be
14773 # the map table (done first for each property), or one of the
14774 # other tables. Determine which type.
14775 my $is_property = $table->isa('Property');
14777 my $name = $table->name;
14778 my $complete_name = $table->complete_name;
14780 # See if should suppress the table if is empty, but warn if it
14781 # contains something.
14782 my $suppress_if_empty_warn_if_not
14783 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
14785 # Calculate if this table should have any code points associated
14787 my $expected_empty =
14789 # $perl should be empty, as well as properties that we just
14790 # don't do anything with
14792 && ($table == $perl
14793 || grep { $complete_name eq $_ }
14794 @unimplemented_properties
14798 # Match tables in properties we skipped populating should be
14800 || (! $is_property && ! $property->to_create_match_tables)
14802 # Tables and properties that are expected to have no code
14803 # points should be empty
14804 || $suppress_if_empty_warn_if_not
14807 # Set a boolean if this table is the complement of an empty binary
14809 my $is_complement_of_empty_binary =
14810 $type == $BINARY &&
14811 (($table == $property->table('Y')
14812 && $property->table('N')->is_empty)
14813 || ($table == $property->table('N')
14814 && $property->table('Y')->is_empty));
14816 if ($table->is_empty) {
14818 if ($suppress_if_empty_warn_if_not) {
14819 $table->set_fate($SUPPRESSED,
14820 $suppress_if_empty_warn_if_not);
14823 # Suppress (by skipping them) expected empty tables.
14824 next TABLE if $expected_empty;
14826 # And setup to later output a warning for those that aren't
14827 # known to be allowed to be empty. Don't do the warning if
14828 # this table is a child of another one to avoid duplicating
14829 # the warning that should come from the parent one.
14830 if (($table == $property || $table->parent == $table)
14831 && $table->fate != $SUPPRESSED
14832 && $table->fate != $MAP_PROXIED
14833 && ! grep { $complete_name =~ /^$_$/ }
14834 @tables_that_may_be_empty)
14836 push @unhandled_properties, "$table";
14839 # An empty table is just the complement of everything.
14840 $table->set_complement($Any) if $table != $property;
14842 elsif ($expected_empty) {
14844 if ($suppress_if_empty_warn_if_not) {
14845 $because = " because $suppress_if_empty_warn_if_not";
14848 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
14851 # Some tables should match everything
14852 my $expected_full =
14853 ($table->fate == $SUPPRESSED)
14856 ? # All these types of map tables will be full because
14857 # they will have been populated with defaults
14858 ($type == $ENUM || $type == $FORCED_BINARY)
14860 : # A match table should match everything if its method
14862 ($table->matches_all
14864 # The complement of an empty binary table will match
14866 || $is_complement_of_empty_binary
14870 my $count = $table->count;
14871 if ($expected_full) {
14872 if ($count != $MAX_UNICODE_CODEPOINTS) {
14873 Carp::my_carp("$table matches only "
14874 . clarify_number($count)
14875 . " Unicode code points but should match "
14876 . clarify_number($MAX_UNICODE_CODEPOINTS)
14878 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
14879 . "). Proceeding anyway.");
14882 # Here is expected to be full. If it is because it is the
14883 # complement of an (empty) binary table that is to be
14884 # suppressed, then suppress this one as well.
14885 if ($is_complement_of_empty_binary) {
14886 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
14887 my $opposing = $property->table($opposing_name);
14888 my $opposing_status = $opposing->status;
14889 if ($opposing_status) {
14890 $table->set_status($opposing_status,
14891 $opposing->status_info);
14895 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
14896 if ($table == $property || $table->leader == $table) {
14897 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
14901 if ($table->fate == $SUPPRESSED) {
14902 if (! $is_property) {
14903 my @children = $table->children;
14904 foreach my $child (@children) {
14905 if ($child->fate != $SUPPRESSED) {
14906 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
14914 if (! $is_property) {
14916 make_ucd_table_pod_entries($table) if $table->property == $perl;
14918 # Several things need to be done just once for each related
14919 # group of match tables. Do them on the parent.
14920 if ($table->parent == $table) {
14922 # Add an entry in the pod file for the table; it also does
14924 make_re_pod_entries($table) if defined $pod_directory;
14926 # See if the the table matches identical code points with
14927 # something that has already been output. In that case,
14928 # no need to have two files with the same code points in
14929 # them. We use the table's hash() method to store these
14930 # in buckets, so that it is quite likely that if two
14931 # tables are in the same bucket they will be identical, so
14932 # don't have to compare tables frequently. The tables
14933 # have to have the same status to share a file, so add
14934 # this to the bucket hash. (The reason for this latter is
14935 # that Heavy.pl associates a status with a file.)
14936 # We don't check tables that are inverses of others, as it
14937 # would lead to some coding complications, and checking
14938 # all the regular ones should find everything.
14939 if ($table->complement == 0) {
14940 my $hash = $table->hash . ';' . $table->status;
14942 # Look at each table that is in the same bucket as
14943 # this one would be.
14944 foreach my $comparison
14945 (@{$match_tables_to_write{$hash}})
14947 if ($table->matches_identically_to($comparison)) {
14948 $table->set_equivalent_to($comparison,
14954 # Here, not equivalent, add this table to the bucket.
14955 push @{$match_tables_to_write{$hash}}, $table;
14961 # Here is the property itself.
14962 # Don't write out or make references to the $perl property
14963 next if $table == $perl;
14965 make_ucd_table_pod_entries($table);
14967 # There is a mapping stored of the various synonyms to the
14968 # standardized name of the property for utf8_heavy.pl.
14969 # Also, the pod file contains entries of the form:
14970 # \p{alias: *} \p{full: *}
14971 # rather than show every possible combination of things.
14973 my @property_aliases = $property->aliases;
14975 my $full_property_name = $property->full_name;
14976 my $property_name = $property->name;
14977 my $standard_property_name = standardize($property_name);
14978 my $standard_property_full_name
14979 = standardize($full_property_name);
14981 # We also create for Unicode::UCD a list of aliases for
14982 # the property. The list starts with the property name;
14983 # then its full name.
14986 if ( $property->fate <= $MAP_PROXIED) {
14987 @property_list = ($property_name, $full_property_name);
14988 @standard_list = ($standard_property_name,
14989 $standard_property_full_name);
14992 # For each synonym ...
14993 for my $i (0 .. @property_aliases - 1) {
14994 my $alias = $property_aliases[$i];
14995 my $alias_name = $alias->name;
14996 my $alias_standard = standardize($alias_name);
14999 # Add other aliases to the list of property aliases
15000 if ($property->fate <= $MAP_PROXIED
15001 && ! grep { $alias_standard eq $_ } @standard_list)
15003 push @property_list, $alias_name;
15004 push @standard_list, $alias_standard;
15007 # For utf8_heavy, set the mapping of the alias to the
15009 if ($type == $STRING) {
15010 if ($property->fate <= $MAP_PROXIED) {
15011 $string_property_loose_to_name{$alias_standard}
15012 = $standard_property_name;
15016 if (exists ($loose_property_name_of{$alias_standard}))
15018 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
15021 $loose_property_name_of{$alias_standard}
15022 = $standard_property_name;
15025 # Now for the re pod entry for this alias. Skip if not
15026 # outputting a pod; skip the first one, which is the
15027 # full name so won't have an entry like: '\p{full: *}
15028 # \p{full: *}', and skip if don't want an entry for
15031 || ! defined $pod_directory
15032 || ! $alias->make_re_pod_entry;
15034 my $rhs = "\\p{$full_property_name: *}";
15035 if ($property != $perl && $table->perl_extension) {
15036 $rhs .= ' (Perl extension)';
15038 push @match_properties,
15039 format_pod_line($indent_info_column,
15040 '\p{' . $alias->name . ': *}',
15046 # The list of all possible names is attached to each alias, so
15048 if (@property_list) {
15049 push @{$prop_aliases{$standard_list[0]}}, @property_list;
15052 if ($property->fate <= $MAP_PROXIED) {
15054 # Similarly, we create for Unicode::UCD a list of
15055 # property-value aliases.
15057 my $property_full_name = $property->full_name;
15059 # Look at each table in the property...
15060 foreach my $table ($property->tables) {
15062 my $table_full_name = $table->full_name;
15063 my $standard_table_full_name
15064 = standardize($table_full_name);
15065 my $table_name = $table->name;
15066 my $standard_table_name = standardize($table_name);
15068 # The list starts with the table name and its full
15070 push @values_list, $table_name, $table_full_name;
15072 # We add to the table each unique alias that isn't
15073 # discouraged from use.
15074 foreach my $alias ($table->aliases) {
15075 next if $alias->status
15076 && $alias->status eq $DISCOURAGED;
15077 my $name = $alias->name;
15078 my $standard = standardize($name);
15079 next if $standard eq $standard_table_name;
15080 next if $standard eq $standard_table_full_name;
15081 push @values_list, $name;
15084 # Here @values_list is a list of all the aliases for
15085 # the table. That is, all the property-values given
15086 # by this table. By agreement with Unicode::UCD,
15087 # if the name and full name are identical, and there
15088 # are no other names, drop the duplcate entry to save
15090 if (@values_list == 2
15091 && $values_list[0] eq $values_list[1])
15096 # To save memory, unlike the similar list for property
15097 # aliases above, only the standard forms hve the list.
15098 # This forces an extra step of converting from input
15099 # name to standard name, but the savings are
15100 # considerable. (There is only marginal savings if we
15101 # did this with the property aliases.)
15102 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
15106 # Don't write out a mapping file if not desired.
15107 next if ! $property->to_output_map;
15110 # Here, we know we want to write out the table, but don't do it
15111 # yet because there may be other tables that come along and will
15112 # want to share the file, and the file's comments will change to
15113 # mention them. So save for later.
15114 push @writables, $table;
15116 } # End of looping through the property and all its tables.
15117 } # End of looping through all properties.
15119 # Now have all the tables that will have files written for them. Do it.
15120 foreach my $table (@writables) {
15123 my $property = $table->property;
15124 my $is_property = ($table == $property);
15125 if (! $is_property) {
15127 # Match tables for the property go in lib/$subdirectory, which is
15128 # the property's name. Don't use the standard file name for this,
15129 # as may get an unfamiliar alias
15130 @directory = ($matches_directory, $property->external_name);
15134 @directory = $table->directory;
15135 $filename = $table->file;
15138 # Use specified filename if available, or default to property's
15139 # shortest name. We need an 8.3 safe filename (which means "an 8
15140 # safe" filename, since after the dot is only 'pl', which is < 3)
15141 # The 2nd parameter is if the filename shouldn't be changed, and
15142 # it shouldn't iff there is a hard-coded name for this table.
15143 $filename = construct_filename(
15144 $filename || $table->external_name,
15145 ! $filename, # mutable if no filename
15148 register_file_for_name($table, \@directory, $filename);
15150 # Only need to write one file when shared by more than one
15152 next if ! $is_property
15153 && ($table->leader != $table || $table->complement != 0);
15155 # Construct a nice comment to add to the file
15156 $table->set_final_comment;
15162 # Write out the pod file
15165 # And Heavy.pl, Name.pm, UCD.pl
15170 make_property_test_script() if $make_test_script;
15174 my @white_space_separators = ( # This used only for making the test script.
15181 sub generate_separator($) {
15182 # This used only for making the test script. It generates the colon or
15183 # equal separator between the property and property value, with random
15184 # white space surrounding the separator
15188 return "" if $lhs eq ""; # No separator if there's only one (the r) side
15190 # Choose space before and after randomly
15191 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
15192 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
15194 # And return the whole complex, half the time using a colon, half the
15196 return $spaces_before
15197 . (rand() < 0.5) ? '=' : ':'
15201 sub generate_tests($$$$$) {
15202 # This used only for making the test script. It generates test cases that
15203 # are expected to compile successfully in perl. Note that the lhs and
15204 # rhs are assumed to already be as randomized as the caller wants.
15206 my $lhs = shift; # The property: what's to the left of the colon
15207 # or equals separator
15208 my $rhs = shift; # The property value; what's to the right
15209 my $valid_code = shift; # A code point that's known to be in the
15210 # table given by lhs=rhs; undef if table is
15212 my $invalid_code = shift; # A code point known to not be in the table;
15213 # undef if the table is all code points
15214 my $warning = shift;
15216 # Get the colon or equal
15217 my $separator = generate_separator($lhs);
15219 # The whole 'property=value'
15220 my $name = "$lhs$separator$rhs";
15223 # Create a complete set of tests, with complements.
15224 if (defined $valid_code) {
15225 push @output, <<"EOC"
15226 Expect(1, $valid_code, '\\p{$name}', $warning);
15227 Expect(0, $valid_code, '\\p{^$name}', $warning);
15228 Expect(0, $valid_code, '\\P{$name}', $warning);
15229 Expect(1, $valid_code, '\\P{^$name}', $warning);
15232 if (defined $invalid_code) {
15233 push @output, <<"EOC"
15234 Expect(0, $invalid_code, '\\p{$name}', $warning);
15235 Expect(1, $invalid_code, '\\p{^$name}', $warning);
15236 Expect(1, $invalid_code, '\\P{$name}', $warning);
15237 Expect(0, $invalid_code, '\\P{^$name}', $warning);
15243 sub generate_error($$$) {
15244 # This used only for making the test script. It generates test cases that
15245 # are expected to not only not match, but to be syntax or similar errors
15247 my $lhs = shift; # The property: what's to the left of the
15248 # colon or equals separator
15249 my $rhs = shift; # The property value; what's to the right
15250 my $already_in_error = shift; # Boolean; if true it's known that the
15251 # unmodified lhs and rhs will cause an error.
15252 # This routine should not force another one
15253 # Get the colon or equal
15254 my $separator = generate_separator($lhs);
15256 # Since this is an error only, don't bother to randomly decide whether to
15257 # put the error on the left or right side; and assume that the rhs is
15258 # loosely matched, again for convenience rather than rigor.
15259 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
15261 my $property = $lhs . $separator . $rhs;
15264 Error('\\p{$property}');
15265 Error('\\P{$property}');
15269 # These are used only for making the test script
15270 # XXX Maybe should also have a bad strict seps, which includes underscore.
15272 my @good_loose_seps = (
15279 my @bad_loose_seps = (
15284 sub randomize_stricter_name {
15285 # This used only for making the test script. Take the input name and
15286 # return a randomized, but valid version of it under the stricter matching
15290 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15292 # If the name looks like a number (integer, floating, or rational), do
15294 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
15297 my $separator = $3;
15299 # If there isn't a sign, part of the time add a plus
15300 # Note: Not testing having any denominator having a minus sign
15302 $sign = '+' if rand() <= .3;
15305 # And add 0 or more leading zeros.
15306 $name = $sign . ('0' x int rand(10)) . $number;
15308 if (defined $separator) {
15309 my $extra_zeros = '0' x int rand(10);
15311 if ($separator eq '.') {
15313 # Similarly, add 0 or more trailing zeros after a decimal
15315 $name .= $extra_zeros;
15319 # Or, leading zeros before the denominator
15320 $name =~ s,/,/$extra_zeros,;
15325 # For legibility of the test, only change the case of whole sections at a
15326 # time. To do this, first split into sections. The split returns the
15329 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
15330 trace $section if main::DEBUG && $to_trace;
15332 if (length $section > 1 && $section !~ /\D/) {
15334 # If the section is a sequence of digits, about half the time
15335 # randomly add underscores between some of them.
15338 # Figure out how many underscores to add. max is 1 less than
15339 # the number of digits. (But add 1 at the end to make sure
15340 # result isn't 0, and compensate earlier by subtracting 2
15342 my $num_underscores = int rand(length($section) - 2) + 1;
15344 # And add them evenly throughout, for convenience, not rigor
15346 my $spacing = (length($section) - 1)/ $num_underscores;
15347 my $temp = $section;
15349 for my $i (1 .. $num_underscores) {
15350 $section .= substr($temp, 0, $spacing, "") . '_';
15354 push @sections, $section;
15358 # Here not a sequence of digits. Change the case of the section
15360 my $switch = int rand(4);
15361 if ($switch == 0) {
15362 push @sections, uc $section;
15364 elsif ($switch == 1) {
15365 push @sections, lc $section;
15367 elsif ($switch == 2) {
15368 push @sections, ucfirst $section;
15371 push @sections, $section;
15375 trace "returning", join "", @sections if main::DEBUG && $to_trace;
15376 return join "", @sections;
15379 sub randomize_loose_name($;$) {
15380 # This used only for making the test script
15383 my $want_error = shift; # if true, make an error
15384 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15386 $name = randomize_stricter_name($name);
15389 push @parts, $good_loose_seps[rand(@good_loose_seps)];
15391 # Preserve trailing ones for the sake of not stripping the underscore from
15393 for my $part (split /[-\s_]+ (?= . )/, $name) {
15395 if ($want_error and rand() < 0.3) {
15396 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
15400 push @parts, $good_loose_seps[rand(@good_loose_seps)];
15403 push @parts, $part;
15405 my $new = join("", @parts);
15406 trace "$name => $new" if main::DEBUG && $to_trace;
15409 if (rand() >= 0.5) {
15410 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
15413 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
15419 # Used to make sure don't generate duplicate test cases.
15420 my %test_generated;
15422 sub make_property_test_script() {
15423 # This used only for making the test script
15424 # this written directly -- it's huge.
15426 print "Making test script\n" if $verbosity >= $PROGRESS;
15428 # This uses randomness to test different possibilities without testing all
15429 # possibilities. To ensure repeatability, set the seed to 0. But if
15430 # tests are added, it will perturb all later ones in the .t file
15433 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
15435 # Keep going down an order of magnitude
15436 # until find that adding this quantity to
15437 # 1 remains 1; but put an upper limit on
15438 # this so in case this algorithm doesn't
15439 # work properly on some platform, that we
15440 # won't loop forever.
15442 my $min_floating_slop = 1;
15443 while (1+ $min_floating_slop != 1
15446 my $next = $min_floating_slop / 10;
15447 last if $next == 0; # If underflows,
15449 $min_floating_slop = $next;
15452 # It doesn't matter whether the elements of this array contain single lines
15453 # or multiple lines. main::write doesn't count the lines.
15456 foreach my $property (property_ref('*')) {
15457 foreach my $table ($property->tables) {
15459 # Find code points that match, and don't match this table.
15460 my $valid = $table->get_valid_code_point;
15461 my $invalid = $table->get_invalid_code_point;
15462 my $warning = ($table->status eq $DEPRECATED)
15466 # Test each possible combination of the property's aliases with
15467 # the table's. If this gets to be too many, could do what is done
15468 # in the set_final_comment() for Tables
15469 my @table_aliases = $table->aliases;
15470 my @property_aliases = $table->property->aliases;
15472 # Every property can be optionally be prefixed by 'Is_', so test
15473 # that those work, by creating such a new alias for each
15474 # pre-existing one.
15475 push @property_aliases, map { Alias->new("Is_" . $_->name,
15477 $_->make_re_pod_entry,
15478 $_->ok_as_filename,
15482 } @property_aliases;
15483 my $max = max(scalar @table_aliases, scalar @property_aliases);
15484 for my $j (0 .. $max - 1) {
15486 # The current alias for property is the next one on the list,
15487 # or if beyond the end, start over. Similarly for table
15489 = $property_aliases[$j % @property_aliases]->name;
15491 $property_name = "" if $table->property == $perl;
15492 my $table_alias = $table_aliases[$j % @table_aliases];
15493 my $table_name = $table_alias->name;
15494 my $loose_match = $table_alias->loose_match;
15496 # If the table doesn't have a file, any test for it is
15497 # already guaranteed to be in error
15498 my $already_error = ! $table->file_path;
15500 # Generate error cases for this alias.
15501 push @output, generate_error($property_name,
15505 # If the table is guaranteed to always generate an error,
15506 # quit now without generating success cases.
15507 next if $already_error;
15509 # Now for the success cases.
15511 if ($loose_match) {
15513 # For loose matching, create an extra test case for the
15515 my $standard = standardize($table_name);
15517 # $test_name should be a unique combination for each test
15518 # case; used just to avoid duplicate tests
15519 my $test_name = "$property_name=$standard";
15521 # Don't output duplicate test cases.
15522 if (! exists $test_generated{$test_name}) {
15523 $test_generated{$test_name} = 1;
15524 push @output, generate_tests($property_name,
15531 $random = randomize_loose_name($table_name)
15533 else { # Stricter match
15534 $random = randomize_stricter_name($table_name);
15537 # Now for the main test case for this alias.
15538 my $test_name = "$property_name=$random";
15539 if (! exists $test_generated{$test_name}) {
15540 $test_generated{$test_name} = 1;
15541 push @output, generate_tests($property_name,
15548 # If the name is a rational number, add tests for the
15549 # floating point equivalent.
15550 if ($table_name =~ qr{/}) {
15552 # Calculate the float, and find just the fraction.
15553 my $float = eval $table_name;
15554 my ($whole, $fraction)
15555 = $float =~ / (.*) \. (.*) /x;
15557 # Starting with one digit after the decimal point,
15558 # create a test for each possible precision (number of
15559 # digits past the decimal point) until well beyond the
15560 # native number found on this machine. (If we started
15561 # with 0 digits, it would be an integer, which could
15562 # well match an unrelated table)
15564 for my $i (1 .. $min_floating_slop + 3) {
15565 my $table_name = sprintf("%.*f", $i, $float);
15566 if ($i < $MIN_FRACTION_LENGTH) {
15568 # If the test case has fewer digits than the
15569 # minimum acceptable precision, it shouldn't
15570 # succeed, so we expect an error for it.
15571 # E.g., 2/3 = .7 at one decimal point, and we
15572 # shouldn't say it matches .7. We should make
15573 # it be .667 at least before agreeing that the
15574 # intent was to match 2/3. But at the
15575 # less-than- acceptable level of precision, it
15576 # might actually match an unrelated number.
15577 # So don't generate a test case if this
15578 # conflating is possible. In our example, we
15579 # don't want 2/3 matching 7/10, if there is
15580 # a 7/10 code point.
15582 (keys %nv_floating_to_rational)
15585 if abs($table_name - $existing)
15586 < $MAX_FLOATING_SLOP;
15588 push @output, generate_error($property_name,
15590 1 # 1 => already an error
15595 # Here the number of digits exceeds the
15596 # minimum we think is needed. So generate a
15597 # success test case for it.
15598 push @output, generate_tests($property_name,
15616 (map {"Test_X('$_');\n"} @backslash_X_tests),
15621 # This is a list of the input files and how to handle them. The files are
15622 # processed in their order in this list. Some reordering is possible if
15623 # desired, but the v0 files should be first, and the extracted before the
15624 # others except DAge.txt (as data in an extracted file can be over-ridden by
15625 # the non-extracted. Some other files depend on data derived from an earlier
15626 # file, like UnicodeData requires data from Jamo, and the case changing and
15627 # folding requires data from Unicode. Mostly, it safest to order by first
15628 # version releases in (except the Jamo). DAge.txt is read before the
15629 # extracted ones because of the rarely used feature $compare_versions. In the
15630 # unlikely event that there were ever an extracted file that contained the Age
15631 # property information, it would have to go in front of DAge.
15633 # The version strings allow the program to know whether to expect a file or
15634 # not, but if a file exists in the directory, it will be processed, even if it
15635 # is in a version earlier than expected, so you can copy files from a later
15636 # release into an earlier release's directory.
15637 my @input_file_objects = (
15638 Input_file->new('PropertyAliases.txt', v0,
15639 Handler => \&process_PropertyAliases,
15641 Input_file->new(undef, v0, # No file associated with this
15642 Progress_Message => 'Finishing property setup',
15643 Handler => \&finish_property_setup,
15645 Input_file->new('PropValueAliases.txt', v0,
15646 Handler => \&process_PropValueAliases,
15647 Has_Missings_Defaults => $NOT_IGNORED,
15649 Input_file->new('DAge.txt', v3.2.0,
15650 Has_Missings_Defaults => $NOT_IGNORED,
15653 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
15654 Property => 'General_Category',
15656 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
15657 Property => 'Canonical_Combining_Class',
15658 Has_Missings_Defaults => $NOT_IGNORED,
15660 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
15661 Property => 'Numeric_Type',
15662 Has_Missings_Defaults => $NOT_IGNORED,
15664 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
15665 Property => 'East_Asian_Width',
15666 Has_Missings_Defaults => $NOT_IGNORED,
15668 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
15669 Property => 'Line_Break',
15670 Has_Missings_Defaults => $NOT_IGNORED,
15672 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
15673 Property => 'Bidi_Class',
15674 Has_Missings_Defaults => $NOT_IGNORED,
15676 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
15677 Property => 'Decomposition_Type',
15678 Has_Missings_Defaults => $NOT_IGNORED,
15680 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
15681 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
15682 Property => 'Numeric_Value',
15683 Each_Line_Handler => \&filter_numeric_value_line,
15684 Has_Missings_Defaults => $NOT_IGNORED,
15686 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
15687 Property => 'Joining_Group',
15688 Has_Missings_Defaults => $NOT_IGNORED,
15691 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
15692 Property => 'Joining_Type',
15693 Has_Missings_Defaults => $NOT_IGNORED,
15695 Input_file->new('Jamo.txt', v2.0.0,
15696 Property => 'Jamo_Short_Name',
15697 Each_Line_Handler => \&filter_jamo_line,
15699 Input_file->new('UnicodeData.txt', v1.1.5,
15700 Pre_Handler => \&setup_UnicodeData,
15702 # We clean up this file for some early versions.
15703 Each_Line_Handler => [ (($v_version lt v2.0.0 )
15705 : ($v_version eq v2.1.5)
15706 ? \&filter_v2_1_5_ucd
15708 # And for 5.14 Perls with 6.0,
15709 # have to also make changes
15710 : ($v_version ge v6.0.0)
15714 # And the main filter
15715 \&filter_UnicodeData_line,
15717 EOF_Handler => \&EOF_UnicodeData,
15719 Input_file->new('ArabicShaping.txt', v2.0.0,
15720 Each_Line_Handler =>
15721 [ ($v_version lt 4.1.0)
15722 ? \&filter_old_style_arabic_shaping
15724 \&filter_arabic_shaping_line,
15726 Has_Missings_Defaults => $NOT_IGNORED,
15728 Input_file->new('Blocks.txt', v2.0.0,
15729 Property => 'Block',
15730 Has_Missings_Defaults => $NOT_IGNORED,
15731 Each_Line_Handler => \&filter_blocks_lines
15733 Input_file->new('PropList.txt', v2.0.0,
15734 Each_Line_Handler => (($v_version lt v3.1.0)
15735 ? \&filter_old_style_proplist
15738 Input_file->new('Unihan.txt', v2.0.0,
15739 Pre_Handler => \&setup_unihan,
15741 Each_Line_Handler => \&filter_unihan_line,
15743 Input_file->new('SpecialCasing.txt', v2.1.8,
15744 Each_Line_Handler => \&filter_special_casing_line,
15745 Pre_Handler => \&setup_special_casing,
15746 Has_Missings_Defaults => $IGNORED,
15749 'LineBreak.txt', v3.0.0,
15750 Has_Missings_Defaults => $NOT_IGNORED,
15751 Property => 'Line_Break',
15752 # Early versions had problematic syntax
15753 Each_Line_Handler => (($v_version lt v3.1.0)
15754 ? \&filter_early_ea_lb
15757 Input_file->new('EastAsianWidth.txt', v3.0.0,
15758 Property => 'East_Asian_Width',
15759 Has_Missings_Defaults => $NOT_IGNORED,
15760 # Early versions had problematic syntax
15761 Each_Line_Handler => (($v_version lt v3.1.0)
15762 ? \&filter_early_ea_lb
15765 Input_file->new('CompositionExclusions.txt', v3.0.0,
15766 Property => 'Composition_Exclusion',
15768 Input_file->new('BidiMirroring.txt', v3.0.1,
15769 Property => 'Bidi_Mirroring_Glyph',
15771 Input_file->new("NormalizationTest.txt", v3.0.1,
15772 Skip => 'Validation Tests',
15774 Input_file->new('CaseFolding.txt', v3.0.1,
15775 Pre_Handler => \&setup_case_folding,
15776 Each_Line_Handler =>
15777 [ ($v_version lt v3.1.0)
15778 ? \&filter_old_style_case_folding
15780 \&filter_case_folding_line
15782 Has_Missings_Defaults => $IGNORED,
15784 Input_file->new('DCoreProperties.txt', v3.1.0,
15785 # 5.2 changed this file
15786 Has_Missings_Defaults => (($v_version ge v5.2.0)
15790 Input_file->new('Scripts.txt', v3.1.0,
15791 Property => 'Script',
15792 Has_Missings_Defaults => $NOT_IGNORED,
15794 Input_file->new('DNormalizationProps.txt', v3.1.0,
15795 Has_Missings_Defaults => $NOT_IGNORED,
15796 Each_Line_Handler => (($v_version lt v4.0.1)
15797 ? \&filter_old_style_normalization_lines
15800 Input_file->new('HangulSyllableType.txt', v4.0.0,
15801 Has_Missings_Defaults => $NOT_IGNORED,
15802 Property => 'Hangul_Syllable_Type'),
15803 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
15804 Property => 'Word_Break',
15805 Has_Missings_Defaults => $NOT_IGNORED,
15807 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
15808 Property => 'Grapheme_Cluster_Break',
15809 Has_Missings_Defaults => $NOT_IGNORED,
15811 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
15812 Handler => \&process_GCB_test,
15814 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
15815 Skip => 'Validation Tests',
15817 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
15818 Skip => 'Validation Tests',
15820 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
15821 Skip => 'Validation Tests',
15823 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
15824 Property => 'Sentence_Break',
15825 Has_Missings_Defaults => $NOT_IGNORED,
15827 Input_file->new('NamedSequences.txt', v4.1.0,
15828 Handler => \&process_NamedSequences
15830 Input_file->new('NameAliases.txt', v5.0.0,
15831 Property => 'Name_Alias',
15832 Pre_Handler => ($v_version ge v6.0.0)
15833 ? \&setup_v6_name_alias
15836 Input_file->new("BidiTest.txt", v5.2.0,
15837 Skip => 'Validation Tests',
15839 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
15841 Each_Line_Handler => \&filter_unihan_line,
15843 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
15845 Each_Line_Handler => \&filter_unihan_line,
15847 Input_file->new('UnihanIRGSources.txt', v5.2.0,
15849 Pre_Handler => \&setup_unihan,
15850 Each_Line_Handler => \&filter_unihan_line,
15852 Input_file->new('UnihanNumericValues.txt', v5.2.0,
15854 Each_Line_Handler => \&filter_unihan_line,
15856 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
15858 Each_Line_Handler => \&filter_unihan_line,
15860 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
15862 Each_Line_Handler => \&filter_unihan_line,
15864 Input_file->new('UnihanReadings.txt', v5.2.0,
15866 Each_Line_Handler => \&filter_unihan_line,
15868 Input_file->new('UnihanVariants.txt', v5.2.0,
15870 Each_Line_Handler => \&filter_unihan_line,
15872 Input_file->new('ScriptExtensions.txt', v6.0.0,
15873 Property => 'Script_Extensions',
15874 Pre_Handler => \&setup_script_extensions,
15875 Each_Line_Handler => \&filter_script_extensions_line,
15876 Has_Missings_Defaults => (($v_version le v6.0.0)
15880 # The two Indic files are actually available starting in v6.0.0, but their
15881 # property values are missing from PropValueAliases.txt in that release,
15882 # so that further work would have to be done to get them to work properly
15883 # for that release.
15884 Input_file->new('IndicMatraCategory.txt', v6.1.0,
15885 Property => 'Indic_Matra_Category',
15886 Has_Missings_Defaults => $NOT_IGNORED,
15887 Skip => "Provisional; for the analysis and processing of Indic scripts",
15889 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
15890 Property => 'Indic_Syllabic_Category',
15891 Has_Missings_Defaults => $NOT_IGNORED,
15892 Skip => "Provisional; for the analysis and processing of Indic scripts",
15896 # End of all the preliminaries.
15899 if ($compare_versions) {
15900 Carp::my_carp(<<END
15901 Warning. \$compare_versions is set. Output is not suitable for production
15906 # Put into %potential_files a list of all the files in the directory structure
15907 # that could be inputs to this program, excluding those that we should ignore.
15908 # Use absolute file names because it makes it easier across machine types.
15909 my @ignored_files_full_names = map { File::Spec->rel2abs(
15910 internal_file_to_platform($_))
15911 } keys %ignored_files;
15914 return unless /\.txt$/i; # Some platforms change the name's case
15915 my $full = lc(File::Spec->rel2abs($_));
15916 $potential_files{$full} = 1
15917 if ! grep { $full eq lc($_) } @ignored_files_full_names;
15920 }, File::Spec->curdir());
15922 my @mktables_list_output_files;
15923 my $old_start_time = 0;
15925 if (! -e $file_list) {
15926 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
15927 $write_unchanged_files = 1;
15928 } elsif ($write_unchanged_files) {
15929 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
15932 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
15934 if (! open $file_handle, "<", $file_list) {
15935 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
15941 # Read and parse mktables.lst, placing the results from the first part
15942 # into @input, and the second part into @mktables_list_output_files
15943 for my $list ( \@input, \@mktables_list_output_files ) {
15944 while (<$file_handle>) {
15945 s/^ \s+ | \s+ $//xg;
15946 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
15947 $old_start_time = $1;
15949 next if /^ \s* (?: \# .* )? $/x;
15951 my ( $file ) = split /\t/;
15952 push @$list, $file;
15954 @$list = uniques(@$list);
15958 # Look through all the input files
15959 foreach my $input (@input) {
15960 next if $input eq 'version'; # Already have checked this.
15962 # Ignore if doesn't exist. The checking about whether we care or
15963 # not is done via the Input_file object.
15964 next if ! file_exists($input);
15966 # The paths are stored with relative names, and with '/' as the
15967 # delimiter; convert to absolute on this machine
15968 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
15969 $potential_files{lc $full} = 1
15970 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
15974 close $file_handle;
15979 # Here wants to process all .txt files in the directory structure.
15980 # Convert them to full path names. They are stored in the platform's
15983 foreach my $object (@input_file_objects) {
15984 my $file = $object->file;
15985 next unless defined $file;
15986 push @known_files, File::Spec->rel2abs($file);
15989 my @unknown_input_files;
15990 foreach my $file (keys %potential_files) { # The keys are stored in lc
15991 next if grep { $file eq lc($_) } @known_files;
15993 # Here, the file is unknown to us. Get relative path name
15994 $file = File::Spec->abs2rel($file);
15995 push @unknown_input_files, $file;
15997 # What will happen is we create a data structure for it, and add it to
15998 # the list of input files to process. First get the subdirectories
16000 my (undef, $directories, undef) = File::Spec->splitpath($file);
16001 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16002 my @directories = File::Spec->splitdir($directories);
16004 # If the file isn't extracted (meaning none of the directories is the
16005 # extracted one), just add it to the end of the list of inputs.
16006 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
16007 push @input_file_objects, Input_file->new($file, v0);
16011 # Here, the file is extracted. It needs to go ahead of most other
16012 # processing. Search for the first input file that isn't a
16013 # special required property (that is, find one whose first_release
16014 # is non-0), and isn't extracted. Also, the Age property file is
16015 # processed before the extracted ones, just in case
16016 # $compare_versions is set.
16017 for (my $i = 0; $i < @input_file_objects; $i++) {
16018 if ($input_file_objects[$i]->first_released ne v0
16019 && lc($input_file_objects[$i]->file) ne 'dage.txt'
16020 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
16022 splice @input_file_objects, $i, 0,
16023 Input_file->new($file, v0);
16030 if (@unknown_input_files) {
16031 print STDERR simple_fold(join_lines(<<END
16033 The following files are unknown as to how to handle. Assuming they are
16034 typical property files. You'll know by later error messages if it worked or
16037 ) . " " . join(", ", @unknown_input_files) . "\n\n");
16039 } # End of looking through directory structure for more .txt files.
16041 # Create the list of input files from the objects we have defined, plus
16043 my @input_files = 'version';
16044 foreach my $object (@input_file_objects) {
16045 my $file = $object->file;
16046 next if ! defined $file; # Not all objects have files
16047 next if $object->optional && ! -e $file;
16048 push @input_files, $file;
16051 if ( $verbosity >= $VERBOSE ) {
16052 print "Expecting ".scalar( @input_files )." input files. ",
16053 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
16056 # We set $most_recent to be the most recently changed input file, including
16057 # this program itself (done much earlier in this file)
16058 foreach my $in (@input_files) {
16059 next unless -e $in; # Keep going even if missing a file
16060 my $mod_time = (stat $in)[9];
16061 $most_recent = $mod_time if $mod_time > $most_recent;
16063 # See that the input files have distinct names, to warn someone if they
16064 # are adding a new one
16066 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
16067 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16068 my @directories = File::Spec->splitdir($directories);
16069 my $base = $file =~ s/\.txt$//;
16070 construct_filename($file, 'mutable', \@directories);
16074 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
16075 || ! scalar @mktables_list_output_files # or if no outputs known
16076 || $old_start_time < $most_recent; # or out-of-date
16078 # Now we check to see if any output files are older than youngest, if
16079 # they are, we need to continue on, otherwise we can presumably bail.
16081 foreach my $out (@mktables_list_output_files) {
16082 if ( ! file_exists($out)) {
16083 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
16087 #local $to_trace = 1 if main::DEBUG;
16088 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
16089 if ( (stat $out)[9] <= $most_recent ) {
16090 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
16091 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
16098 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
16101 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
16103 # Ready to do the major processing. First create the perl pseudo-property.
16104 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
16106 # Process each input file
16107 foreach my $file (@input_file_objects) {
16111 # Finish the table generation.
16113 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
16116 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
16119 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
16120 add_perl_synonyms();
16122 print "Writing tables\n" if $verbosity >= $PROGRESS;
16123 write_all_tables();
16125 # Write mktables.lst
16126 if ( $file_list and $make_list ) {
16128 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
16129 foreach my $file (@input_files, @files_actually_output) {
16130 my (undef, $directories, $file) = File::Spec->splitpath($file);
16131 my @directories = File::Spec->splitdir($directories);
16132 $file = join '/', @directories, $file;
16136 if (! open $ofh,">",$file_list) {
16137 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
16141 my $localtime = localtime $start_time;
16142 print $ofh <<"END";
16144 # $file_list -- File list for $0.
16146 # Autogenerated starting on $start_time ($localtime)
16148 # - First section is input files
16149 # ($0 itself is not listed but is automatically considered an input)
16150 # - Section separator is /^=+\$/
16151 # - Second section is a list of output files.
16152 # - Lines matching /^\\s*#/ are treated as comments
16153 # which along with blank lines are ignored.
16159 print $ofh "$_\n" for sort(@input_files);
16160 print $ofh "\n=================================\n# Output files:\n\n";
16161 print $ofh "$_\n" for sort @files_actually_output;
16162 print $ofh "\n# ",scalar(@input_files)," input files\n",
16163 "# ",scalar(@files_actually_output)+1," output files\n\n",
16166 or Carp::my_carp("Failed to close $ofh: $!");
16168 print "Filelist has ",scalar(@input_files)," input files and ",
16169 scalar(@files_actually_output)+1," output files\n"
16170 if $verbosity >= $VERBOSE;
16174 # Output these warnings unless -q explicitly specified.
16175 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
16176 if (@unhandled_properties) {
16177 print "\nProperties and tables that unexpectedly have no code points\n";
16178 foreach my $property (sort @unhandled_properties) {
16179 print $property, "\n";
16183 if (%potential_files) {
16184 print "\nInput files that are not considered:\n";
16185 foreach my $file (sort keys %potential_files) {
16186 print File::Spec->abs2rel($file), "\n";
16189 print "\nAll done\n" if $verbosity >= $VERBOSE;
16193 # TRAILING CODE IS USED BY make_property_test_script()
16199 # If run outside the normal test suite on an ASCII platform, you can
16200 # just create a latin1_to_native() function that just returns its
16201 # inputs, because that's the only function used from test.pl
16204 # Test qr/\X/ and the \p{} regular expression constructs. This file is
16205 # constructed by mktables from the tables it generates, so if mktables is
16206 # buggy, this won't necessarily catch those bugs. Tests are generated for all
16207 # feasible properties; a few aren't currently feasible; see
16208 # is_code_point_usable() in mktables for details.
16210 # Standard test packages are not used because this manipulates SIG_WARN. It
16211 # exits 0 if every non-skipped test succeeded; -1 if any failed.
16217 my $expected = shift;
16220 my $warning_type = shift; # Type of warning message, like 'deprecated'
16222 my $line = (caller)[2];
16223 $ord = ord(latin1_to_native(chr($ord)));
16225 # Convert the code point to hex form
16226 my $string = sprintf "\"\\x{%04X}\"", $ord;
16230 # The first time through, use all warnings. If the input should generate
16231 # a warning, add another time through with them turned off
16232 push @tests, "no warnings '$warning_type';" if $warning_type;
16234 foreach my $no_warnings (@tests) {
16236 # Store any warning messages instead of outputting them
16237 local $SIG{__WARN__} = $SIG{__WARN__};
16238 my $warning_message;
16239 $SIG{__WARN__} = sub { $warning_message = $_[0] };
16243 # A string eval is needed because of the 'no warnings'.
16244 # Assumes no parens in the regular expression
16245 my $result = eval "$no_warnings
16246 my \$RegObj = qr($regex);
16247 $string =~ \$RegObj ? 1 : 0";
16248 if (not defined $result) {
16249 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
16252 elsif ($result ^ $expected) {
16253 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
16256 elsif ($warning_message) {
16257 if (! $warning_type || ($warning_type && $no_warnings)) {
16258 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
16262 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
16265 elsif ($warning_type && ! $no_warnings) {
16266 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
16270 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
16279 if (eval { 'x' =~ qr/$regex/; 1 }) {
16281 my $line = (caller)[2];
16282 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
16285 my $line = (caller)[2];
16286 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
16291 # GCBTest.txt character that separates grapheme clusters
16292 my $breakable_utf8 = my $breakable = chr(0xF7);
16293 utf8::upgrade($breakable_utf8);
16295 # GCBTest.txt character that indicates that the adjoining code points are part
16296 # of the same grapheme cluster
16297 my $nobreak_utf8 = my $nobreak = chr(0xD7);
16298 utf8::upgrade($nobreak_utf8);
16301 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
16302 # Each such line is a sequence of code points given by their hex numbers,
16303 # separated by the two characters defined just before this subroutine that
16304 # indicate that either there can or cannot be a break between the adjacent
16305 # code points. If there isn't a break, that means the sequence forms an
16306 # extended grapheme cluster, which means that \X should match the whole
16307 # thing. If there is a break, \X should stop there. This is all
16308 # converted by this routine into a match:
16309 # $string =~ /(\X)/,
16310 # Each \X should match the next cluster; and that is what is checked.
16312 my $template = shift;
16314 my $line = (caller)[2];
16316 # The line contains characters above the ASCII range, but in Latin1. It
16317 # may or may not be in utf8, and if it is, it may or may not know it. So,
16318 # convert these characters to 8 bits. If knows is in utf8, simply
16320 if (utf8::is_utf8($template)) {
16321 utf8::downgrade($template);
16324 # Otherwise, if it is in utf8, but doesn't know it, the next lines
16325 # convert the two problematic characters to their 8-bit equivalents.
16326 # If it isn't in utf8, they don't harm anything.
16328 $template =~ s/$nobreak_utf8/$nobreak/g;
16329 $template =~ s/$breakable_utf8/$breakable/g;
16332 # Get rid of the leading and trailing breakables
16333 $template =~ s/^ \s* $breakable \s* //x;
16334 $template =~ s/ \s* $breakable \s* $ //x;
16336 # And no-breaks become just a space.
16337 $template =~ s/ \s* $nobreak \s* / /xg;
16339 # Split the input into segments that are breakable between them.
16340 my @segments = split /\s*$breakable\s*/, $template;
16343 my $display_string = "";
16345 my @should_display;
16347 # Convert the code point sequence in each segment into a Perl string of
16349 foreach my $segment (@segments) {
16350 my @code_points = split /\s+/, $segment;
16351 my $this_string = "";
16352 my $this_display = "";
16353 foreach my $code_point (@code_points) {
16354 $this_string .= latin1_to_native(chr(hex $code_point));
16355 $this_display .= "\\x{$code_point}";
16358 # The next cluster should match the string in this segment.
16359 push @should_match, $this_string;
16360 push @should_display, $this_display;
16361 $string .= $this_string;
16362 $display_string .= $this_display;
16365 # If a string can be represented in both non-ut8 and utf8, test both cases
16367 for my $to_upgrade (0 .. 1) {
16371 # If already in utf8, would just be a repeat
16372 next UPGRADE if utf8::is_utf8($string);
16374 utf8::upgrade($string);
16377 # Finally, do the \X match.
16378 my @matches = $string =~ /(\X)/g;
16380 # Look through each matched cluster to verify that it matches what we
16382 my $min = (@matches < @should_match) ? @matches : @should_match;
16383 for my $i (0 .. $min - 1) {
16385 if ($matches[$i] eq $should_match[$i]) {
16386 print "ok $Tests - ";
16388 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
16390 print "And \\X #", $i + 1,
16392 print " correctly matched $should_display[$i]; line $line\n";
16394 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
16395 unpack("U*", $matches[$i]));
16396 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
16398 " should have matched $should_display[$i]",
16399 " but instead matched $matches[$i]",
16400 ". Abandoning rest of line $line\n";
16405 # And the number of matches should equal the number of expected matches.
16407 if (@matches == @should_match) {
16408 print "ok $Tests - Nothing was left over; line $line\n";
16410 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
16418 print "1..$Tests\n";
16419 exit($Fails ? -1 : 0);
16422 Error('\p{Script=InGreek}'); # Bug #69018
16423 Test_X("1100 $nobreak 1161"); # Bug #70940
16424 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
16425 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
16426 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726