3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr; { no overloading; $addr = 0+$self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted this change.)
27 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
29 ##########################################################################
31 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
32 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
33 # a pod file and a .t file
35 # The structure of this file is:
36 # First these introductory comments; then
37 # code needed for everywhere, such as debugging stuff; then
38 # code to handle input parameters; then
39 # data structures likely to be of external interest (some of which depend on
40 # the input parameters, so follows them; then
41 # more data structures and subroutine and package (class) definitions; then
42 # the small actual loop to process the input files and finish up; then
43 # a __DATA__ section, for the .t tests
45 # This program works on all releases of Unicode through at least 5.2. The
46 # outputs have been scrutinized most intently for release 5.1. The others
47 # have been checked for somewhat more than just sanity. It can handle all
48 # existing Unicode character properties in those releases.
50 # This program is mostly about Unicode character (or code point) properties.
51 # A property describes some attribute or quality of a code point, like if it
52 # is lowercase or not, its name, what version of Unicode it was first defined
53 # in, or what its uppercase equivalent is. Unicode deals with these disparate
54 # possibilities by making all properties into mappings from each code point
55 # into some corresponding value. In the case of it being lowercase or not,
56 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
57 # property maps each Unicode code point to a single value, called a "property
58 # value". (Hence each Unicode property is a true mathematical function with
59 # exactly one value per code point.)
61 # When using a property in a regular expression, what is desired isn't the
62 # mapping of the code point to its property's value, but the reverse (or the
63 # mathematical "inverse relation"): starting with the property value, "Does a
64 # code point map to it?" These are written in a "compound" form:
65 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
66 # files containing the lists of code points that map to each such regular
67 # expression property value, one file per list
69 # There is also a single form shortcut that Perl adds for many of the commonly
70 # used properties. This happens for all binary properties, plus script,
71 # general_category, and block properties.
73 # Thus the outputs of this program are files. There are map files, mostly in
74 # the 'To' directory; and there are list files for use in regular expression
75 # matching, all in subdirectories of the 'lib' directory, with each
76 # subdirectory being named for the property that the lists in it are for.
77 # Bookkeeping, test, and documentation files are also generated.
79 my $matches_directory = 'lib'; # Where match (\p{}) files go.
80 my $map_directory = 'To'; # Where map files go.
84 # The major data structures of this program are Property, of course, but also
85 # Table. There are two kinds of tables, very similar to each other.
86 # "Match_Table" is the data structure giving the list of code points that have
87 # a particular property value, mentioned above. There is also a "Map_Table"
88 # data structure which gives the property's mapping from code point to value.
89 # There are two structures because the match tables need to be combined in
90 # various ways, such as constructing unions, intersections, complements, etc.,
91 # and the map ones don't. And there would be problems, perhaps subtle, if
92 # a map table were inadvertently operated on in some of those ways.
93 # The use of separate classes with operations defined on one but not the other
94 # prevents accidentally confusing the two.
96 # At the heart of each table's data structure is a "Range_List", which is just
97 # an ordered list of "Ranges", plus ancillary information, and methods to
98 # operate on them. A Range is a compact way to store property information.
99 # Each range has a starting code point, an ending code point, and a value that
100 # is meant to apply to all the code points between the two end points,
101 # inclusive. For a map table, this value is the property value for those
102 # code points. Two such ranges could be written like this:
103 # 0x41 .. 0x5A, 'Upper',
104 # 0x61 .. 0x7A, 'Lower'
106 # Each range also has a type used as a convenience to classify the values.
107 # Most ranges in this program will be Type 0, or normal, but there are some
108 # ranges that have a non-zero type. These are used only in map tables, and
109 # are for mappings that don't fit into the normal scheme of things. Mappings
110 # that require a hash entry to communicate with utf8.c are one example;
111 # another example is mappings for charnames.pm to use which indicate a name
112 # that is algorithmically determinable from its code point (and vice-versa).
113 # These are used to significantly compact these tables, instead of listing
114 # each one of the tens of thousands individually.
116 # In a match table, the value of a range is irrelevant (and hence the type as
117 # well, which will always be 0), and arbitrarily set to the null string.
118 # Using the example above, there would be two match tables for those two
119 # entries, one named Upper would contain the 0x41..0x5A range, and the other
120 # named Lower would contain 0x61..0x7A.
122 # Actually, there are two types of range lists, "Range_Map" is the one
123 # associated with map tables, and "Range_List" with match tables.
124 # Again, this is so that methods can be defined on one and not the other so as
125 # to prevent operating on them in incorrect ways.
127 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
128 # in the perl core. All tables could in theory be written, but some are
129 # suppressed because there is no current practical use for them. It is easy
130 # to change which get written by changing various lists that are near the top
131 # of the actual code in this file. The table data structures contain enough
132 # ancillary information to allow them to be treated as separate entities for
133 # writing, such as the path to each one's file. There is a heading in each
134 # map table that gives the format of its entries, and what the map is for all
135 # the code points missing from it. (This allows tables to be more compact.)
137 # The Property data structure contains one or more tables. All properties
138 # contain a map table (except the $perl property which is a
139 # pseudo-property containing only match tables), and any properties that
140 # are usable in regular expression matches also contain various matching
141 # tables, one for each value the property can have. A binary property can
142 # have two values, True and False (or Y and N, which are preferred by Unicode
143 # terminology). Thus each of these properties will have a map table that
144 # takes every code point and maps it to Y or N (but having ranges cuts the
145 # number of entries in that table way down), and two match tables, one
146 # which has a list of all the code points that map to Y, and one for all the
147 # code points that map to N. (For each of these, a third table is also
148 # generated for the pseudo Perl property. It contains the identical code
149 # points as the Y table, but can be written, not in the compound form, but in
150 # a "single" form like \p{IsUppercase}.) Many properties are binary, but some
151 # properties have several possible values, some have many, and properties like
152 # Name have a different value for every named code point. Those will not,
153 # unless the controlling lists are changed, have their match tables written
154 # out. But all the ones which can be used in regular expression \p{} and \P{}
155 # constructs will. Generally a property will have either its map table or its
156 # match tables written but not both. Again, what gets written is controlled
157 # by lists which can easily be changed.
159 # For information about the Unicode properties, see Unicode's UAX44 document:
161 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
163 # As stated earlier, this program will work on any release of Unicode so far.
164 # Most obvious problems in earlier data have NOT been corrected except when
165 # necessary to make Perl or this program work reasonably. For example, no
166 # folding information was given in early releases, so this program uses the
167 # substitute of lower case, just so that a regular expression with the /i
168 # option will do something that actually gives the right results in many
169 # cases. There are also a couple other corrections for version 1.1.5,
170 # commented at the point they are made. As an example of corrections that
171 # weren't made (but could be) is this statement from DerivedAge.txt: "The
172 # supplementary private use code points and the non-character code points were
173 # assigned in version 2.0, but not specifically listed in the UCD until
174 # versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
175 # More information on Unicode version glitches is further down in these
176 # introductory comments.
178 # This program works on all properties as of 5.2, though the files for some
179 # are suppressed from apparent lack of demand for them. You can change which
180 # are output by changing lists in this program.
182 # The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
183 # loose matchings rules (from Unicode TR18):
185 # The recommended names for UCD properties and property values are in
186 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
187 # [PropValue]. There are both abbreviated names and longer, more
188 # descriptive names. It is strongly recommended that both names be
189 # recognized, and that loose matching of property names be used,
190 # whereby the case distinctions, whitespace, hyphens, and underbar
192 # The program still allows Fuzzy to override its determination of if loose
193 # matching should be used, but it isn't currently used, as it is no longer
194 # needed; the calculations it makes are good enough.
196 # SUMMARY OF HOW IT WORKS:
200 # A list is constructed containing each input file that is to be processed
202 # Each file on the list is processed in a loop, using the associated handler
204 # The PropertyAliases.txt and PropValueAliases.txt files are processed
205 # first. These files name the properties and property values.
206 # Objects are created of all the property and property value names
207 # that the rest of the input should expect, including all synonyms.
208 # The other input files give mappings from properties to property
209 # values. That is, they list code points and say what the mapping
210 # is under the given property. Some files give the mappings for
211 # just one property; and some for many. This program goes through
212 # each file and populates the properties from them. Some properties
213 # are listed in more than one file, and Unicode has set up a
214 # precedence as to which has priority if there is a conflict. Thus
215 # the order of processing matters, and this program handles the
216 # conflict possibility by processing the overriding input files
217 # last, so that if necessary they replace earlier values.
218 # After this is all done, the program creates the property mappings not
219 # furnished by Unicode, but derivable from what it does give.
220 # The tables of code points that match each property value in each
221 # property that is accessible by regular expressions are created.
222 # The Perl-defined properties are created and populated. Many of these
223 # require data determined from the earlier steps
224 # Any Perl-defined synonyms are created, and name clashes between Perl
225 # and Unicode are reconciled and warned about.
226 # All the properties are written to files
227 # Any other files are written, and final warnings issued.
229 # For clarity, a number of operators have been overloaded to work on tables:
230 # ~ means invert (take all characters not in the set). The more
231 # conventional '!' is not used because of the possibility of confusing
232 # it with the actual boolean operation.
234 # - means subtraction
235 # & means intersection
236 # The precedence of these is the order listed. Parentheses should be
237 # copiously used. These are not a general scheme. The operations aren't
238 # defined for a number of things, deliberately, to avoid getting into trouble.
239 # Operations are done on references and affect the underlying structures, so
240 # that the copy constructors for them have been overloaded to not return a new
241 # clone, but the input object itself.
243 # The bool operator is deliberately not overloaded to avoid confusion with
244 # "should it mean if the object merely exists, or also is non-empty?".
246 # WHY CERTAIN DESIGN DECISIONS WERE MADE
248 # This program needs to be able to run under miniperl. Therefore, it uses a
249 # minimum of other modules, and hence implements some things itself that could
250 # be gotten from CPAN
252 # This program uses inputs published by the Unicode Consortium. These can
253 # change incompatibly between releases without the Perl maintainers realizing
254 # it. Therefore this program is now designed to try to flag these. It looks
255 # at the directories where the inputs are, and flags any unrecognized files.
256 # It keeps track of all the properties in the files it handles, and flags any
257 # that it doesn't know how to handle. It also flags any input lines that
258 # don't match the expected syntax, among other checks.
260 # It is also designed so if a new input file matches one of the known
261 # templates, one hopefully just needs to add it to a list to have it
264 # As mentioned earlier, some properties are given in more than one file. In
265 # particular, the files in the extracted directory are supposedly just
266 # reformattings of the others. But they contain information not easily
267 # derivable from the other files, including results for Unihan, which this
268 # program doesn't ordinarily look at, and for unassigned code points. They
269 # also have historically had errors or been incomplete. In an attempt to
270 # create the best possible data, this program thus processes them first to
271 # glean information missing from the other files; then processes those other
272 # files to override any errors in the extracted ones. Much of the design was
273 # driven by this need to store things and then possibly override them.
275 # It tries to keep fatal errors to a minimum, to generate something usable for
276 # testing purposes. It always looks for files that could be inputs, and will
277 # warn about any that it doesn't know how to handle (the -q option suppresses
280 # Why have files written out for binary 'N' matches?
281 # For binary properties, if you know the mapping for either Y or N; the
282 # other is trivial to construct, so could be done at Perl run-time by just
283 # complementing the result, instead of having a file for it. That is, if
284 # someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
285 # not need a file. The problem is communicating to Perl that a given
286 # property is binary. Perl can't figure it out from looking at the N (or
287 # No), as some non-binary properties have these as property values. So
288 # rather than inventing a way to communicate this info back to the core,
289 # which would have required changes there as well, it was simpler just to
290 # add the extra tables.
292 # Why is there more than one type of range?
293 # This simplified things. There are some very specialized code points that
294 # have to be handled specially for output, such as Hangul syllable names.
295 # By creating a range type (done late in the development process), it
296 # allowed this to be stored with the range, and overridden by other input.
297 # Originally these were stored in another data structure, and it became a
298 # mess trying to decide if a second file that was for the same property was
299 # overriding the earlier one or not.
301 # Why are there two kinds of tables, match and map?
302 # (And there is a base class shared by the two as well.) As stated above,
303 # they actually are for different things. Development proceeded much more
304 # smoothly when I (khw) realized the distinction. Map tables are used to
305 # give the property value for every code point (actually every code point
306 # that doesn't map to a default value). Match tables are used for regular
307 # expression matches, and are essentially the inverse mapping. Separating
308 # the two allows more specialized methods, and error checks so that one
309 # can't just take the intersection of two map tables, for example, as that
312 # There are no match tables generated for matches of the null string. These
313 # would like like qr/\p{JSN=}/ currently without modifying the regex code.
314 # Perhaps something like them could be added if necessary. The JSN does have
315 # a real code point U+110B that maps to the null string, but it is a
316 # contributory property, and therefore not output by default. And it's easily
317 # handled so far by making the null string the default where it is a
322 # This program is written so it will run under miniperl. Occasionally changes
323 # will cause an error where the backtrace doesn't work well under miniperl.
324 # To diagnose the problem, you can instead run it under regular perl, if you
327 # There is a good trace facility. To enable it, first sub DEBUG must be set
328 # to return true. Then a line like
330 # local $to_trace = 1 if main::DEBUG;
332 # can be added to enable tracing in its lexical scope or until you insert
335 # local $to_trace = 0 if main::DEBUG;
337 # then use a line like "trace $a, @b, %c, ...;
339 # Some of the more complex subroutines already have trace statements in them.
340 # Permanent trace statements should be like:
342 # trace ... if main::DEBUG && $to_trace;
344 # If there is just one or a few files that you're debugging, you can easily
345 # cause most everything else to be skipped. Change the line
347 # my $debug_skip = 0;
349 # to 1, and every file whose object is in @input_file_objects and doesn't have
350 # a, 'non_skip => 1,' in its constructor will be skipped.
354 # The program would break if Unicode were to change its names so that
355 # interior white space, underscores, or dashes differences were significant
356 # within property and property value names.
358 # It might be easier to use the xml versions of the UCD if this program ever
359 # would need heavy revision, and the ability to handle old versions was not
362 # There is the potential for name collisions, in that Perl has chosen names
363 # that Unicode could decide it also likes. There have been such collisions in
364 # the past, with mostly Perl deciding to adopt the Unicode definition of the
365 # name. However in the 5.2 Unicode beta testing, there were a number of such
366 # collisions, which were withdrawn before the final release, because of Perl's
367 # and other's protests. These all involved new properties which began with
368 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
369 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
370 # Unicode document, so they are unlikely to be used by Unicode for another
371 # purpose. However, they might try something beginning with 'In', or use any
372 # of the other Perl-defined properties. This program will warn you of name
373 # collisions, and refuse to generate tables with them, but manual intervention
374 # will be required in this event. One scheme that could be implemented, if
375 # necessary, would be to have this program generate another file, or add a
376 # field to mktables.lst that gives the date of first definition of a property.
377 # Each new release of Unicode would use that file as a basis for the next
378 # iteration. And the Perl synonym addition code could sort based on the age
379 # of the property, so older properties get priority, and newer ones that clash
380 # would be refused; hence existing code would not be impacted, and some other
381 # synonym would have to be used for the new property. This is ugly, and
382 # manual intervention would certainly be easier to do in the short run; lets
383 # hope it never comes to this.
387 # This program can generate tables from the Unihan database. But it doesn't
388 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
389 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
390 # database was split into 8 different files, all beginning with the letters
391 # 'Unihan'. This program will read those file(s) if present, but it needs to
392 # know which of the many properties in the file(s) should have tables created
393 # for them. It will create tables for any properties listed in
394 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
395 # @cjk_properties array and the @cjk_property_values array. Thus, if a
396 # property you want is not in those files of the release you are building
397 # against, you must add it to those two arrays. Starting in 4.0, the
398 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
399 # is present in the directory, a table will be generated for that property.
400 # In 5.2, several more properties were added. For your convenience, the two
401 # arrays are initialized with all the 5.2 listed properties that are also in
402 # earlier releases. But these are commented out. You can just uncomment the
403 # ones you want, or use them as a template for adding entries for other
406 # You may need to adjust the entries to suit your purposes. setup_unihan(),
407 # and filter_unihan_line() are the functions where this is done. This program
408 # already does some adjusting to make the lines look more like the rest of the
409 # Unicode DB; You can see what that is in filter_unihan_line()
411 # There is a bug in the 3.2 data file in which some values for the
412 # kPrimaryNumeric property have commas and an unexpected comment. A filter
413 # could be added for these; or for a particular installation, the Unihan.txt
414 # file could be edited to fix them.
417 # HOW TO ADD A FILE TO BE PROCESSED
419 # A new file from Unicode needs to have an object constructed for it in
420 # @input_file_objects, probably at the end or at the end of the extracted
421 # ones. The program should warn you if its name will clash with others on
422 # restrictive file systems, like DOS. If so, figure out a better name, and
423 # add lines to the README.perl file giving that. If the file is a character
424 # property, it should be in the format that Unicode has by default
425 # standardized for such files for the more recently introduced ones.
426 # If so, the Input_file constructor for @input_file_objects can just be the
427 # file name and release it first appeared in. If not, then it should be
428 # possible to construct an each_line_handler() to massage the line into the
431 # For non-character properties, more code will be needed. You can look at
432 # the existing entries for clues.
434 # UNICODE VERSIONS NOTES
436 # The Unicode UCD has had a number of errors in it over the versions. And
437 # these remain, by policy, in the standard for that version. Therefore it is
438 # risky to correct them, because code may be expecting the error. So this
439 # program doesn't generally make changes, unless the error breaks the Perl
440 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
441 # for U+1105, which causes real problems for the algorithms for Jamo
442 # calculations, so it is changed here.
444 # But it isn't so clear cut as to what to do about concepts that are
445 # introduced in a later release; should they extend back to earlier releases
446 # where the concept just didn't exist? It was easier to do this than to not,
447 # so that's what was done. For example, the default value for code points not
448 # in the files for various properties was probably undefined until changed by
449 # some version. No_Block for blocks is such an example. This program will
450 # assign No_Block even in Unicode versions that didn't have it. This has the
451 # benefit that code being written doesn't have to special case earlier
452 # versions; and the detriment that it doesn't match the Standard precisely for
453 # the affected versions.
455 # Here are some observations about some of the issues in early versions:
457 # The number of code points in \p{alpha} halve in 2.1.9. It turns out that
458 # the reason is that the CJK block starting at 4E00 was removed from PropList,
459 # and was not put back in until 3.1.0
461 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
462 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
463 # reason is that 3.2 introduced U+205F=medium math space, which was not
464 # classed as white space, but Perl figured out that it should have been. 4.0
465 # reclassified it correctly.
467 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
468 # this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
469 # was left with no code points, as all the ones that mapped to 202 stayed
470 # mapped to 202. Thus if your program used the numeric name for the class,
471 # it would not have been affected, but if it used the mnemonic, it would have
474 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
475 # points which eventually came to have this script property value, instead
476 # mapped to "Unknown". But in the next release all these code points were
477 # moved to \p{sc=common} instead.
479 # The default for missing code points for BidiClass is complicated. Starting
480 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
481 # tries to do the best it can for earlier releases. It is done in
482 # process_PropertyAliases()
484 ##############################################################################
486 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
488 my $MAX_LINE_WIDTH = 78;
490 # Debugging aid to skip most files so as to not be distracted by them when
491 # concentrating on the ones being debugged. Add
493 # to the constructor for those files you want processed when you set this.
494 # Files with a first version number of 0 are special: they are always
495 # processed regardless of the state of this flag.
498 # Set to 1 to enable tracing.
501 { # Closure for trace: debugging aid
502 my $print_caller = 1; # ? Include calling subroutine name
503 my $main_with_colon = 'main::';
504 my $main_colon_length = length($main_with_colon);
507 return unless $to_trace; # Do nothing if global flag not set
511 local $DB::trace = 0;
512 $DB::trace = 0; # Quiet 'used only once' message
516 # Loop looking up the stack to get the first non-trace caller
521 $line_number = $caller_line;
522 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
523 $caller = $main_with_colon unless defined $caller;
525 $caller_name = $caller;
528 $caller_name =~ s/.*:://;
529 if (substr($caller_name, 0, $main_colon_length)
532 $caller_name = substr($caller_name, $main_colon_length);
535 } until ($caller_name ne 'trace');
537 # If the stack was empty, we were called from the top level
538 $caller_name = 'main' if ($caller_name eq ""
539 || $caller_name eq 'trace');
542 foreach my $string (@input) {
543 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
544 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
545 $output .= simple_dumper($string);
548 $string = "$string" if ref $string;
549 $string = $UNDEF unless defined $string;
551 $string = '""' if $string eq "";
552 $output .= " " if $output ne ""
554 && substr($output, -1, 1) ne " "
555 && substr($string, 0, 1) ne " ";
560 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
561 print STDERR "$caller_name: " if $print_caller;
562 print STDERR $output, "\n";
567 # This is for a rarely used development feature that allows you to compare two
568 # versions of the Unicode standard without having to deal with changes caused
569 # by the code points introduced in the later verson. Change the 0 to a SINGLE
570 # dotted Unicode release number (e.g. 2.1). Only code points introduced in
571 # that release and earlier will be used; later ones are thrown away. You use
572 # the version number of the earliest one you want to compare; then run this
573 # program on directory structures containing each release, and compare the
574 # outputs. These outputs will therefore include only the code points common
575 # to both releases, and you can see the changes caused just by the underlying
576 # release semantic changes. For versions earlier than 3.2, you must copy a
577 # version of DAge.txt into the directory.
578 my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
579 my $compare_versions = DEBUG
580 && $string_compare_versions
581 && pack "C*", split /\./, $string_compare_versions;
584 # Returns non-duplicated input values. From "Perl Best Practices:
585 # Encapsulated Cleverness". p. 455 in first edition.
588 return grep { ! $seen{$_}++ } @_;
591 $0 = File::Spec->canonpath($0);
593 my $make_test_script = 0; # ? Should we output a test script
594 my $write_unchanged_files = 0; # ? Should we update the output files even if
595 # we don't think they have changed
596 my $use_directory = ""; # ? Should we chdir somewhere.
597 my $pod_directory; # input directory to store the pod file.
598 my $pod_file = 'perluniprops';
599 my $t_path; # Path to the .t test file
600 my $file_list = 'mktables.lst'; # File to store input and output file names.
601 # This is used to speed up the build, by not
602 # executing the main body of the program if
603 # nothing on the list has changed since the
605 my $make_list = 1; # ? Should we write $file_list. Set to always
606 # make a list so that when the pumpking is
607 # preparing a release, s/he won't have to do
609 my $glob_list = 0; # ? Should we try to include unknown .txt files
611 my $output_range_counts = 1; # ? Should we include the number of code points
612 # in ranges in the output
613 # Verbosity levels; 0 is quiet
614 my $NORMAL_VERBOSITY = 1;
618 my $verbosity = $NORMAL_VERBOSITY;
622 my $arg = shift @ARGV;
624 $verbosity = $VERBOSE;
626 elsif ($arg eq '-p') {
627 $verbosity = $PROGRESS;
628 $| = 1; # Flush buffers as we go.
630 elsif ($arg eq '-q') {
633 elsif ($arg eq '-w') {
634 $write_unchanged_files = 1; # update the files even if havent changed
636 elsif ($arg eq '-check') {
637 my $this = shift @ARGV;
638 my $ok = shift @ARGV;
640 print "Skipping as check params are not the same.\n";
644 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
645 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
647 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
649 $make_test_script = 1;
651 elsif ($arg eq '-makelist') {
654 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
655 -d $use_directory or croak "Unknown directory '$use_directory'";
657 elsif ($arg eq '-L') {
659 # Existence not tested until have chdir'd
662 elsif ($arg eq '-globlist') {
665 elsif ($arg eq '-c') {
666 $output_range_counts = ! $output_range_counts
670 $with_c .= 'out' if $output_range_counts; # Complements the state
672 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
673 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
675 -c : Output comments $with_c number of code points in ranges
676 -q : Quiet Mode: Only output serious warnings.
677 -p : Set verbosity level to normal plus show progress.
678 -v : Set Verbosity level high: Show progress and non-serious
680 -w : Write files regardless
681 -C dir : Change to this directory before proceeding. All relative paths
682 except those specified by the -P and -T options will be done
683 with respect to this directory.
684 -P dir : Output $pod_file file to directory 'dir'.
685 -T path : Create a test script as 'path'; overrides -maketest
686 -L filelist : Use alternate 'filelist' instead of standard one
687 -globlist : Take as input all non-Test *.txt files in current and sub
689 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
691 -makelist : Rewrite the file list $file_list based on current setup
692 -check A B : Executes $0 only if A and B are the same
697 # Stores the most-recently changed file. If none have changed, can skip the
699 my $youngest = -M $0; # Do this before the chdir!
701 # Change directories now, because need to read 'version' early.
702 if ($use_directory) {
703 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
704 $pod_directory = File::Spec->rel2abs($pod_directory);
706 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
707 $t_path = File::Spec->rel2abs($t_path);
709 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
710 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
711 $pod_directory = File::Spec->abs2rel($pod_directory);
713 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
714 $t_path = File::Spec->abs2rel($t_path);
718 # Get Unicode version into regular and v-string. This is done now because
719 # various tables below get populated based on it. These tables are populated
720 # here to be near the top of the file, and so easily seeable by those needing
722 open my $VERSION, "<", "version"
723 or croak "$0: can't open required file 'version': $!\n";
724 my $string_version = <$VERSION>;
726 chomp $string_version;
727 my $v_version = pack "C*", split /\./, $string_version; # v string
729 # The following are the complete names of properties with property values that
730 # are known to not match any code points in some versions of Unicode, but that
731 # may change in the future so they should be matchable, hence an empty file is
732 # generated for them.
733 my @tables_that_may_be_empty = (
734 'Joining_Type=Left_Joining',
736 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
737 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
738 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
739 if $v_version ge v4.1.0;
741 # The lists below are hashes, so the key is the item in the list, and the
742 # value is the reason why it is in the list. This makes generation of
743 # documentation easier.
745 my %why_suppressed; # No file generated for these.
747 # Files aren't generated for empty extraneous properties. This is arguable.
748 # Extraneous properties generally come about because a property is no longer
749 # used in a newer version of Unicode. If we generated a file without code
750 # points, programs that used to work on that property will still execute
751 # without errors. It just won't ever match (or will always match, with \P{}).
752 # This means that the logic is now likely wrong. I (khw) think its better to
753 # find this out by getting an error message. Just move them to the table
754 # above to change this behavior
755 my %why_suppress_if_empty_warn_if_not = (
757 # It is the only property that has ever officially been removed from the
758 # Standard. The database never contained any code points for it.
759 'Special_Case_Condition' => 'Obsolete',
761 # Apparently never official, but there were code points in some versions of
762 # old-style PropList.txt
763 'Non_Break' => 'Obsolete',
766 # These would normally go in the warn table just above, but they were changed
767 # a long time before this program was written, so warnings about them are
769 if ($v_version gt v3.2.0) {
770 push @tables_that_may_be_empty,
771 'Canonical_Combining_Class=Attached_Below_Left'
774 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
775 # unless explicitly added.
776 if ($v_version ge v5.2.0) {
777 my $unihan = 'Unihan; remove from list if using Unihan';
778 foreach my $table qw (
782 kCompatibilityVariant
796 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
800 # Properties that this program ignores.
801 my @unimplemented_properties = (
802 'Unicode_Radical_Stroke' # Remove if changing to handle this one.
805 # There are several types of obsolete properties defined by Unicode. These
806 # must be hand-edited for every new Unicode release.
807 my %why_deprecated; # Generates a deprecated warning message if used.
808 my %why_stabilized; # Documentation only
809 my %why_obsolete; # Documentation only
812 my $simple = 'Perl uses the more complete version of this property';
813 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
815 my $other_properties = 'other properties';
816 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
817 my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
820 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
821 'Jamo_Short_Name' => $contributory,
822 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
823 'Other_Alphabetic' => $contributory,
824 'Other_Default_Ignorable_Code_Point' => $contributory,
825 'Other_Grapheme_Extend' => $contributory,
826 'Other_ID_Continue' => $contributory,
827 'Other_ID_Start' => $contributory,
828 'Other_Lowercase' => $contributory,
829 'Other_Math' => $contributory,
830 'Other_Uppercase' => $contributory,
834 # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
835 # contains the same information, but without the algorithmically
836 # determinable Hangul syllables'. This file is not published, so it's
837 # existence is not noted in the comment.
838 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
840 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
841 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
843 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
844 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
845 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
846 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
848 'Name' => "Accessible via 'use charnames;'",
849 'Name_Alias' => "Accessible via 'use charnames;'",
851 # These are sort of jumping the gun; deprecation is proposed for
852 # Unicode version 6.0, but they have never been exposed by Perl, and
853 # likely are soon to be deprecated, so best not to expose them.
854 FC_NFKC_Closure => 'Use NFKC_Casefold instead',
855 Expands_On_NFC => $why_no_expand,
856 Expands_On_NFD => $why_no_expand,
857 Expands_On_NFKC => $why_no_expand,
858 Expands_On_NFKD => $why_no_expand,
861 # The following are suppressed because they were made contributory or
862 # deprecated by Unicode before Perl ever thought about supporting them.
863 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
864 $why_suppressed{$property} = $why_deprecated{$property};
867 # Customize the message for all the 'Other_' properties
868 foreach my $property (keys %why_deprecated) {
869 next if (my $main_property = $property) !~ s/^Other_//;
870 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
874 if ($v_version ge 4.0.0) {
875 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
877 if ($v_version ge 5.2.0) {
878 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
881 # Probably obsolete forever
882 if ($v_version ge v4.1.0) {
883 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"';
886 # This program can create files for enumerated-like properties, such as
887 # 'Numeric_Type'. This file would be the same format as for a string
888 # property, with a mapping from code point to its value, so you could look up,
889 # for example, the script a code point is in. But no one so far wants this
890 # mapping, or they have found another way to get it since this is a new
891 # feature. So no file is generated except if it is in this list.
892 my @output_mapped_properties = split "\n", <<END;
895 # If you are using the Unihan database, you need to add the properties that
896 # you want to extract from it to this table. For your convenience, the
897 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
898 my @cjk_properties = split "\n", <<'END';
899 #cjkAccountingNumeric; kAccountingNumeric
900 #cjkOtherNumeric; kOtherNumeric
901 #cjkPrimaryNumeric; kPrimaryNumeric
902 #cjkCompatibilityVariant; kCompatibilityVariant
904 #cjkIRG_GSource; kIRG_GSource
905 #cjkIRG_HSource; kIRG_HSource
906 #cjkIRG_JSource; kIRG_JSource
907 #cjkIRG_KPSource; kIRG_KPSource
908 #cjkIRG_KSource; kIRG_KSource
909 #cjkIRG_TSource; kIRG_TSource
910 #cjkIRG_USource; kIRG_USource
911 #cjkIRG_VSource; kIRG_VSource
912 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
915 # Similarly for the property values. For your convenience, the lines in the
916 # 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
918 my @cjk_property_values = split "\n", <<'END';
919 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
920 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
921 ## @missing: 0000..10FFFF; cjkIICore; <none>
922 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
923 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
924 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
925 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
926 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
927 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
928 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
929 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
930 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
931 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
932 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
935 # The input files don't list every code point. Those not listed are to be
936 # defaulted to some value. Below are hard-coded what those values are for
937 # non-binary properties as of 5.1. Starting in 5.0, there are
938 # machine-parsable comment lines in the files the give the defaults; so this
939 # list shouldn't have to be extended. The claim is that all missing entries
940 # for binary properties will default to 'N'. Unicode tried to change that in
941 # 5.2, but the beta period produced enough protest that they backed off.
943 # The defaults for the fields that appear in UnicodeData.txt in this hash must
944 # be in the form that it expects. The others may be synonyms.
945 my $CODE_POINT = '<code point>';
946 my %default_mapping = (
948 # Bidi_Class => Complicated; set in code
949 Bidi_Mirroring_Glyph => "",
951 Canonical_Combining_Class => 0,
952 Case_Folding => $CODE_POINT,
953 Decomposition_Mapping => $CODE_POINT,
954 Decomposition_Type => 'None',
955 East_Asian_Width => "Neutral",
956 FC_NFKC_Closure => $CODE_POINT,
957 General_Category => 'Cn',
958 Grapheme_Cluster_Break => 'Other',
959 Hangul_Syllable_Type => 'NA',
961 Jamo_Short_Name => "",
962 Joining_Group => "No_Joining_Group",
963 # Joining_Type => Complicated; set in code
964 kIICore => 'N', # Is converted to binary
965 #Line_Break => Complicated; set in code
966 Lowercase_Mapping => $CODE_POINT,
973 Numeric_Type => 'None',
974 Numeric_Value => 'NaN',
975 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
976 Sentence_Break => 'Other',
977 Simple_Case_Folding => $CODE_POINT,
978 Simple_Lowercase_Mapping => $CODE_POINT,
979 Simple_Titlecase_Mapping => $CODE_POINT,
980 Simple_Uppercase_Mapping => $CODE_POINT,
981 Titlecase_Mapping => $CODE_POINT,
982 Unicode_1_Name => "",
983 Unicode_Radical_Stroke => "",
984 Uppercase_Mapping => $CODE_POINT,
985 Word_Break => 'Other',
988 # Below are files that Unicode furnishes, but this program ignores, and why
989 my %ignored_files = (
990 'CJKRadicals.txt' => 'Unihan data',
991 'Index.txt' => 'An index, not actual data',
992 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
993 'NamesList.txt' => 'Just adds commentary',
994 'NormalizationCorrections.txt' => 'Data is already in other files.',
995 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
996 'ReadMe.txt' => 'Just comments',
997 'README.TXT' => 'Just comments',
998 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
1001 ### End of externally interesting definitions, except for @input_file_objects
1004 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1005 # This file is machine-generated by $0 from the Unicode
1006 # database, Version $string_version. Any changes made here will be lost!
1009 my $INTERNAL_ONLY=<<"EOF";
1011 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1012 # This file is for internal use by the Perl program only. The format and even
1013 # the name or existence of this file are subject to change without notice.
1014 # Don't use it directly.
1017 my $DEVELOPMENT_ONLY=<<"EOF";
1018 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1019 # This file contains information artificially constrained to code points
1020 # present in Unicode release $string_compare_versions.
1021 # IT CANNOT BE RELIED ON. It is for use during development only and should
1022 # not be used for production.
1026 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1027 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1028 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1030 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1031 # two must be 10; if there are 5, the first must not be a 0. Written this way
1032 # to decrease backtracking
1034 qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1036 # This matches the beginning of the line in the Unicode db files that give the
1037 # defaults for code points not listed (i.e., missing) in the file. The code
1038 # depends on this ending with a semi-colon, so it can assume it is a valid
1039 # field when the line is split() by semi-colons
1040 my $missing_defaults_prefix =
1041 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1043 # Property types. Unicode has more types, but these are sufficient for our
1045 my $UNKNOWN = -1; # initialized to illegal value
1046 my $NON_STRING = 1; # Either binary or enum
1048 my $ENUM = 3; # Include catalog
1049 my $STRING = 4; # Anything else: string or misc
1051 # Some input files have lines that give default values for code points not
1052 # contained in the file. Sometimes these should be ignored.
1053 my $NO_DEFAULTS = 0; # Must evaluate to false
1054 my $NOT_IGNORED = 1;
1057 # Range types. Each range has a type. Most ranges are type 0, for normal,
1058 # and will appear in the main body of the tables in the output files, but
1059 # there are other types of ranges as well, listed below, that are specially
1060 # handled. There are pseudo-types as well that will never be stored as a
1061 # type, but will affect the calculation of the type.
1063 # 0 is for normal, non-specials
1064 my $MULTI_CP = 1; # Sequence of more than code point
1065 my $HANGUL_SYLLABLE = 2;
1066 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1067 my $NULL = 4; # The map is to the null string; utf8.c can't
1068 # handle these, nor is there an accepted syntax
1069 # for them in \p{} constructs
1070 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1071 # otherwise be $MULTI_CP type are instead type 0
1073 # process_generic_property_file() can accept certain overrides in its input.
1074 # Each of these must begin AND end with $CMD_DELIM.
1075 my $CMD_DELIM = "\a";
1076 my $REPLACE_CMD = 'replace'; # Override the Replace
1077 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1082 # Values for the Replace argument to add_range.
1083 # $NO # Don't replace; add only the code points not
1085 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1086 # the comments at the subroutine definition.
1087 my $UNCONDITIONALLY = 2; # Replace without conditions.
1088 my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1091 # Flags to give property statuses. The phrases are to remind maintainers that
1092 # if the flag is changed, the indefinite article referring to it in the
1093 # documentation may need to be as well.
1095 my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1097 my $PLACEHOLDER = 'P'; # Implies no pod entry generated
1098 my $DEPRECATED = 'D';
1099 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1100 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1101 my $DISCOURAGED = 'X';
1102 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1103 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1105 my $a_bold_stricter = "a 'B<$STRICTER>'";
1106 my $A_bold_stricter = "A 'B<$STRICTER>'";
1107 my $STABILIZED = 'S';
1108 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1109 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1111 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1112 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1114 my %status_past_participles = (
1115 $DISCOURAGED => 'discouraged',
1116 $SUPPRESSED => 'should never be generated',
1117 $STABILIZED => 'stabilized',
1118 $OBSOLETE => 'obsolete',
1119 $DEPRECATED => 'deprecated',
1122 # The format of the values of the map tables:
1123 my $BINARY_FORMAT = 'b';
1124 my $DECIMAL_FORMAT = 'd';
1125 my $FLOAT_FORMAT = 'f';
1126 my $INTEGER_FORMAT = 'i';
1127 my $HEX_FORMAT = 'x';
1128 my $RATIONAL_FORMAT = 'r';
1129 my $STRING_FORMAT = 's';
1131 my %map_table_formats = (
1132 $BINARY_FORMAT => 'binary',
1133 $DECIMAL_FORMAT => 'single decimal digit',
1134 $FLOAT_FORMAT => 'floating point number',
1135 $INTEGER_FORMAT => 'integer',
1136 $HEX_FORMAT => 'positive hex whole number; a code point',
1137 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1138 $STRING_FORMAT => 'arbitrary string',
1141 # Unicode didn't put such derived files in a separate directory at first.
1142 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1143 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1144 my $AUXILIARY = 'auxiliary';
1146 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1147 my %loose_to_file_of; # loosely maps table names to their respective
1149 my %stricter_to_file_of; # same; but for stricter mapping.
1150 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1151 # their rational equivalent
1152 my %loose_property_name_of; # Loosely maps property names to standard form
1154 # These constants names and values were taken from the Unicode standard,
1155 # version 5.1, section 3.12. They are used in conjunction with Hangul
1165 my $NCount = $VCount * $TCount;
1167 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1168 # with the above published constants.
1170 my %Jamo_L; # Leading consonants
1171 my %Jamo_V; # Vowels
1172 my %Jamo_T; # Trailing consonants
1174 my @backslash_X_tests; # List of tests read in for testing \X
1175 my @unhandled_properties; # Will contain a list of properties found in
1176 # the input that we didn't process.
1177 my @match_properties; # Properties that have match tables, to be
1179 my @map_properties; # Properties that get map files written
1180 my @named_sequences; # NamedSequences.txt contents.
1181 my %potential_files; # Generated list of all .txt files in the directory
1182 # structure so we can warn if something is being
1184 my @files_actually_output; # List of files we generated.
1185 my @more_Names; # Some code point names are compound; this is used
1186 # to store the extra components of them.
1187 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1188 # the minimum before we consider it equivalent to a
1189 # candidate rational
1190 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1192 # These store references to certain commonly used property objects
1197 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1198 my $has_In_conflicts = 0;
1199 my $has_Is_conflicts = 0;
1201 sub internal_file_to_platform ($) {
1202 # Convert our file paths which have '/' separators to those of the
1206 return undef unless defined $file;
1208 return File::Spec->join(split '/', $file);
1211 sub file_exists ($) { # platform independent '-e'. This program internally
1212 # uses slash as a path separator.
1214 return 0 if ! defined $file;
1215 return -e internal_file_to_platform($file);
1219 # Returns the address of the blessed input object.
1220 # It doesn't check for blessedness because that would do a string eval
1221 # every call, and the program is structured so that this is never called
1222 # for a non-blessed object.
1224 no overloading; # If overloaded, numifying below won't work.
1226 # Numifying a ref gives its address.
1230 # Commented code below should work on Perl 5.8.
1231 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1232 ## the native perl version of it (which is what would operate under miniperl)
1233 ## is extremely slow, as it does a string eval every call.
1234 #my $has_fast_scalar_util = $
\18 !~ /miniperl/
1235 # && defined eval "require Scalar::Util";
1238 # # Returns the address of the blessed input object. Uses the XS version if
1239 # # available. It doesn't check for blessedness because that would do a
1240 # # string eval every call, and the program is structured so that this is
1241 # # never called for a non-blessed object.
1243 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1245 # # Check at least that is a ref.
1246 # my $pkg = ref($_[0]) or return undef;
1248 # # Change to a fake package to defeat any overloaded stringify
1249 # bless $_[0], 'main::Fake';
1251 # # Numifying a ref gives its address.
1252 # my $addr = 0 + $_[0];
1254 # # Return to original class
1255 # bless $_[0], $pkg;
1262 return $a if $a >= $b;
1269 return $a if $a <= $b;
1273 sub clarify_number ($) {
1274 # This returns the input number with underscores inserted every 3 digits
1275 # in large (5 digits or more) numbers. Input must be entirely digits, not
1279 my $pos = length($number) - 3;
1280 return $number if $pos <= 1;
1282 substr($number, $pos, 0) = '_';
1291 # These routines give a uniform treatment of messages in this program. They
1292 # are placed in the Carp package to cause the stack trace to not include them,
1293 # although an alternative would be to use another package and set @CARP_NOT
1296 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1298 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1299 # and overload trying to load Scalar:Util under miniperl. See
1300 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1301 undef $overload::VERSION;
1304 my $message = shift || "";
1305 my $nofold = shift || 0;
1308 $message = main::join_lines($message);
1309 $message =~ s/^$0: *//; # Remove initial program name
1310 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1311 $message = "\n$0: $message;";
1313 # Fold the message with program name, semi-colon end punctuation
1314 # (which looks good with the message that carp appends to it), and a
1315 # hanging indent for continuation lines.
1316 $message = main::simple_fold($message, "", 4) unless $nofold;
1317 $message =~ s/\n$//; # Remove the trailing nl so what carp
1318 # appends is to the same line
1321 return $message if defined wantarray; # If a caller just wants the msg
1328 # This is called when it is clear that the problem is caused by a bug in
1331 my $message = shift;
1332 $message =~ s/^$0: *//;
1333 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1338 sub carp_too_few_args {
1340 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1344 my $args_ref = shift;
1347 my_carp_bug("Need at least $count arguments to "
1349 . ". Instead got: '"
1350 . join ', ', @$args_ref
1351 . "'. No action taken.");
1355 sub carp_extra_args {
1356 my $args_ref = shift;
1357 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1359 unless (ref $args_ref) {
1360 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1363 my ($package, $file, $line) = caller;
1364 my $subroutine = (caller 1)[3];
1367 if (ref $args_ref eq 'HASH') {
1368 foreach my $key (keys %$args_ref) {
1369 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1371 $list = join ', ', each %{$args_ref};
1373 elsif (ref $args_ref eq 'ARRAY') {
1374 foreach my $arg (@$args_ref) {
1375 $arg = $UNDEF unless defined $arg;
1377 $list = join ', ', @$args_ref;
1380 my_carp_bug("Can't cope with ref "
1382 . " . argument to 'carp_extra_args'. Not checking arguments.");
1386 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1394 # This program uses the inside-out method for objects, as recommended in
1395 # "Perl Best Practices". This closure aids in generating those. There
1396 # are two routines. setup_package() is called once per package to set
1397 # things up, and then set_access() is called for each hash representing a
1398 # field in the object. These routines arrange for the object to be
1399 # properly destroyed when no longer used, and for standard accessor
1400 # functions to be generated. If you need more complex accessors, just
1401 # write your own and leave those accesses out of the call to set_access().
1402 # More details below.
1404 my %constructor_fields; # fields that are to be used in constructors; see
1407 # The values of this hash will be the package names as keys to other
1408 # hashes containing the name of each field in the package as keys, and
1409 # references to their respective hashes as values.
1413 # Sets up the package, creating standard DESTROY and dump methods
1414 # (unless already defined). The dump method is used in debugging by
1416 # The optional parameters are:
1417 # a) a reference to a hash, that gets populated by later
1418 # set_access() calls with one of the accesses being
1419 # 'constructor'. The caller can then refer to this, but it is
1420 # not otherwise used by these two routines.
1421 # b) a reference to a callback routine to call during destruction
1422 # of the object, before any fields are actually destroyed
1425 my $constructor_ref = delete $args{'Constructor_Fields'};
1426 my $destroy_callback = delete $args{'Destroy_Callback'};
1427 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1430 my $package = (caller)[0];
1432 $package_fields{$package} = \%fields;
1433 $constructor_fields{$package} = $constructor_ref;
1435 unless ($package->can('DESTROY')) {
1436 my $destroy_name = "${package}::DESTROY";
1439 # Use typeglob to give the anonymous subroutine the name we want
1440 *$destroy_name = sub {
1442 my $addr; { no overloading; $addr = 0+$self; }
1444 $self->$destroy_callback if $destroy_callback;
1445 foreach my $field (keys %{$package_fields{$package}}) {
1446 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1447 delete $package_fields{$package}{$field}{$addr};
1453 unless ($package->can('dump')) {
1454 my $dump_name = "${package}::dump";
1458 return dump_inside_out($self, $package_fields{$package}, @_);
1465 # Arrange for the input field to be garbage collected when no longer
1466 # needed. Also, creates standard accessor functions for the field
1467 # based on the optional parameters-- none if none of these parameters:
1468 # 'addable' creates an 'add_NAME()' accessor function.
1469 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1471 # 'settable' creates a 'set_NAME()' accessor function.
1472 # 'constructor' doesn't create an accessor function, but adds the
1473 # field to the hash that was previously passed to
1475 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1476 # 'add' etc. all mean 'addable'.
1477 # The read accessor function will work on both array and scalar
1478 # values. If another accessor in the parameter list is 'a', the read
1479 # access assumes an array. You can also force it to be array access
1480 # by specifying 'readable_array' instead of 'readable'
1482 # A sort-of 'protected' access can be set-up by preceding the addable,
1483 # readable or settable with some initial portion of 'protected_' (but,
1484 # the underscore is required), like 'p_a', 'pro_set', etc. The
1485 # "protection" is only by convention. All that happens is that the
1486 # accessor functions' names begin with an underscore. So instead of
1487 # calling set_foo, the call is _set_foo. (Real protection could be
1488 # accomplished by having a new subroutine, end_package called at the
1489 # end of each package, and then storing the __LINE__ ranges and
1490 # checking them on every accessor. But that is way overkill.)
1492 # We create anonymous subroutines as the accessors and then use
1493 # typeglobs to assign them to the proper package and name
1495 my $name = shift; # Name of the field
1496 my $field = shift; # Reference to the inside-out hash containing the
1499 my $package = (caller)[0];
1501 if (! exists $package_fields{$package}) {
1502 croak "$0: Must call 'setup_package' before 'set_access'";
1505 # Stash the field so DESTROY can get it.
1506 $package_fields{$package}{$name} = $field;
1508 # Remaining arguments are the accessors. For each...
1509 foreach my $access (@_) {
1510 my $access = lc $access;
1514 # Match the input as far as it goes.
1515 if ($access =~ /^(p[^_]*)_/) {
1517 if (substr('protected_', 0, length $protected)
1521 # Add 1 for the underscore not included in $protected
1522 $access = substr($access, length($protected) + 1);
1530 if (substr('addable', 0, length $access) eq $access) {
1531 my $subname = "${package}::${protected}add_$name";
1534 # add_ accessor. Don't add if already there, which we
1535 # determine using 'eq' for scalars and '==' otherwise.
1538 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1541 my $addr; { no overloading; $addr = 0+$self; }
1542 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1544 return if grep { $value == $_ } @{$field->{$addr}};
1547 return if grep { $value eq $_ } @{$field->{$addr}};
1549 push @{$field->{$addr}}, $value;
1553 elsif (substr('constructor', 0, length $access) eq $access) {
1555 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1558 $constructor_fields{$package}{$name} = $field;
1561 elsif (substr('readable_array', 0, length $access) eq $access) {
1563 # Here has read access. If one of the other parameters for
1564 # access is array, or this one specifies array (by being more
1565 # than just 'readable_'), then create a subroutine that
1566 # assumes the data is an array. Otherwise just a scalar
1567 my $subname = "${package}::${protected}$name";
1568 if (grep { /^a/i } @_
1569 or length($access) > length('readable_'))
1574 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1575 my $addr; { no overloading; $addr = 0+$_[0]; }
1576 if (ref $field->{$addr} ne 'ARRAY') {
1577 my $type = ref $field->{$addr};
1578 $type = 'scalar' unless $type;
1579 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1582 return scalar @{$field->{$addr}} unless wantarray;
1584 # Make a copy; had problems with caller modifying the
1585 # original otherwise
1586 my @return = @{$field->{$addr}};
1592 # Here not an array value, a simpler function.
1596 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1598 return $field->{0+$_[0]};
1602 elsif (substr('settable', 0, length $access) eq $access) {
1603 my $subname = "${package}::${protected}set_$name";
1608 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1609 Carp::carp_extra_args(\@_) if @_ > 2;
1611 # $self is $_[0]; $value is $_[1]
1613 $field->{0+$_[0]} = $_[1];
1618 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1627 # All input files use this object, which stores various attributes about them,
1628 # and provides for convenient, uniform handling. The run method wraps the
1629 # processing. It handles all the bookkeeping of opening, reading, and closing
1630 # the file, returning only significant input lines.
1632 # Each object gets a handler which processes the body of the file, and is
1633 # called by run(). Most should use the generic, default handler, which has
1634 # code scrubbed to handle things you might not expect. A handler should
1635 # basically be a while(next_line()) {...} loop.
1637 # You can also set up handlers to
1638 # 1) call before the first line is read for pre processing
1639 # 2) call to adjust each line of the input before the main handler gets them
1640 # 3) call upon EOF before the main handler exits its loop
1641 # 4) call at the end for post processing
1643 # $_ is used to store the input line, and is to be filtered by the
1644 # each_line_handler()s. So, if the format of the line is not in the desired
1645 # format for the main handler, these are used to do that adjusting. They can
1646 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1647 # so the $_ output of one is used as the input to the next. None of the other
1648 # handlers are stackable, but could easily be changed to be so.
1650 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1651 # which insert the parameters as lines to be processed before the next input
1652 # file line is read. This allows the EOF handler to flush buffers, for
1653 # example. The difference between the two routines is that the lines inserted
1654 # by insert_lines() are subjected to the each_line_handler()s. (So if you
1655 # called it from such a handler, you would get infinite recursion.) Lines
1656 # inserted by insert_adjusted_lines() go directly to the main handler without
1657 # any adjustments. If the post-processing handler calls any of these, there
1658 # will be no effect. Some error checking for these conditions could be added,
1659 # but it hasn't been done.
1661 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1662 # to prevent further processing of the line. This routine will output the
1663 # message as a warning once, and then keep a count of the lines that have the
1664 # same message, and output that count at the end of the file's processing.
1665 # This keeps the number of messages down to a manageable amount.
1667 # get_missings() should be called to retrieve any @missing input lines.
1668 # Messages will be raised if this isn't done if the options aren't to ignore
1671 sub trace { return main::trace(@_); }
1674 # Keep track of fields that are to be put into the constructor.
1675 my %constructor_fields;
1677 main::setup_package(Constructor_Fields => \%constructor_fields);
1679 my %file; # Input file name, required
1680 main::set_access('file', \%file, qw{ c r });
1682 my %first_released; # Unicode version file was first released in, required
1683 main::set_access('first_released', \%first_released, qw{ c r });
1685 my %handler; # Subroutine to process the input file, defaults to
1686 # 'process_generic_property_file'
1687 main::set_access('handler', \%handler, qw{ c });
1690 # name of property this file is for. defaults to none, meaning not
1691 # applicable, or is otherwise determinable, for example, from each line.
1692 main::set_access('property', \%property, qw{ c });
1695 # If this is true, the file is optional. If not present, no warning is
1696 # output. If it is present, the string given by this parameter is
1697 # evaluated, and if false the file is not processed.
1698 main::set_access('optional', \%optional, 'c', 'r');
1701 # This is used for debugging, to skip processing of all but a few input
1702 # files. Add 'non_skip => 1' to the constructor for those files you want
1703 # processed when you set the $debug_skip global.
1704 main::set_access('non_skip', \%non_skip, 'c');
1707 # This is used to skip processing of this input file semi-permanently.
1708 # It is used for files that we aren't planning to process anytime soon,
1709 # but want to allow to be in the directory and not raise a message that we
1710 # are not handling. Mostly for test files. This is in contrast to the
1711 # non_skip element, which is supposed to be used very temporarily for
1712 # debugging. Sets 'optional' to 1
1713 main::set_access('skip', \%skip, 'c');
1715 my %each_line_handler;
1716 # list of subroutines to look at and filter each non-comment line in the
1717 # file. defaults to none. The subroutines are called in order, each is
1718 # to adjust $_ for the next one, and the final one adjusts it for
1720 main::set_access('each_line_handler', \%each_line_handler, 'c');
1722 my %has_missings_defaults;
1723 # ? Are there lines in the file giving default values for code points
1724 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1725 # the norm, but IGNORED means it has such lines, but the handler doesn't
1726 # use them. Having these three states allows us to catch changes to the
1727 # UCD that this program should track
1728 main::set_access('has_missings_defaults',
1729 \%has_missings_defaults, qw{ c r });
1732 # Subroutine to call before doing anything else in the file. If undef, no
1733 # such handler is called.
1734 main::set_access('pre_handler', \%pre_handler, qw{ c });
1737 # Subroutine to call upon getting an EOF on the input file, but before
1738 # that is returned to the main handler. This is to allow buffers to be
1739 # flushed. The handler is expected to call insert_lines() or
1740 # insert_adjusted() with the buffered material
1741 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1744 # Subroutine to call after all the lines of the file are read in and
1745 # processed. If undef, no such handler is called.
1746 main::set_access('post_handler', \%post_handler, qw{ c });
1748 my %progress_message;
1749 # Message to print to display progress in lieu of the standard one
1750 main::set_access('progress_message', \%progress_message, qw{ c });
1753 # cache open file handle, internal. Is undef if file hasn't been
1754 # processed at all, empty if has;
1755 main::set_access('handle', \%handle);
1758 # cache of lines added virtually to the file, internal
1759 main::set_access('added_lines', \%added_lines);
1762 # cache of errors found, internal
1763 main::set_access('errors', \%errors);
1766 # storage of '@missing' defaults lines
1767 main::set_access('missings', \%missings);
1772 my $self = bless \do{ my $anonymous_scalar }, $class;
1773 my $addr; { no overloading; $addr = 0+$self; }
1776 $handler{$addr} = \&main::process_generic_property_file;
1777 $non_skip{$addr} = 0;
1779 $has_missings_defaults{$addr} = $NO_DEFAULTS;
1780 $handle{$addr} = undef;
1781 $added_lines{$addr} = [ ];
1782 $each_line_handler{$addr} = [ ];
1783 $errors{$addr} = { };
1784 $missings{$addr} = [ ];
1786 # Two positional parameters.
1787 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1788 $file{$addr} = main::internal_file_to_platform(shift);
1789 $first_released{$addr} = shift;
1791 # The rest of the arguments are key => value pairs
1792 # %constructor_fields has been set up earlier to list all possible
1793 # ones. Either set or push, depending on how the default has been set
1796 foreach my $key (keys %args) {
1797 my $argument = $args{$key};
1799 # Note that the fields are the lower case of the constructor keys
1800 my $hash = $constructor_fields{lc $key};
1801 if (! defined $hash) {
1802 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
1805 if (ref $hash->{$addr} eq 'ARRAY') {
1806 if (ref $argument eq 'ARRAY') {
1807 foreach my $argument (@{$argument}) {
1808 next if ! defined $argument;
1809 push @{$hash->{$addr}}, $argument;
1813 push @{$hash->{$addr}}, $argument if defined $argument;
1817 $hash->{$addr} = $argument;
1822 # If the file has a property for it, it means that the property is not
1823 # listed in the file's entries. So add a handler to the list of line
1824 # handlers to insert the property name into the lines, to provide a
1825 # uniform interface to the final processing subroutine.
1826 # the final code doesn't have to worry about that.
1827 if ($property{$addr}) {
1828 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1831 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1832 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
1835 $optional{$addr} = 1 if $skip{$addr};
1843 qw("") => "_operator_stringify",
1844 "." => \&main::_operator_dot,
1847 sub _operator_stringify {
1850 return __PACKAGE__ . " object for " . $self->file;
1853 # flag to make sure extracted files are processed early
1854 my $seen_non_extracted_non_age = 0;
1857 # Process the input object $self. This opens and closes the file and
1858 # calls all the handlers for it. Currently, this can only be called
1859 # once per file, as it destroy's the EOF handler
1862 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1864 my $addr; { no overloading; $addr = 0+$self; }
1866 my $file = $file{$addr};
1868 # Don't process if not expecting this file (because released later
1869 # than this Unicode version), and isn't there. This means if someone
1870 # copies it into an earlier version's directory, we will go ahead and
1872 return if $first_released{$addr} gt $v_version && ! -e $file;
1874 # If in debugging mode and this file doesn't have the non-skip
1875 # flag set, and isn't one of the critical files, skip it.
1877 && $first_released{$addr} ne v0
1878 && ! $non_skip{$addr})
1880 print "Skipping $file in debugging\n" if $verbosity;
1884 # File could be optional
1885 if ($optional{$addr}) {
1886 return unless -e $file;
1887 my $result = eval $optional{$addr};
1888 if (! defined $result) {
1889 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
1894 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1900 if (! defined $file || ! -e $file) {
1902 # If the file doesn't exist, see if have internal data for it
1903 # (based on first_released being 0).
1904 if ($first_released{$addr} eq v0) {
1905 $handle{$addr} = 'pretend_is_open';
1908 if (! $optional{$addr} # File could be optional
1909 && $v_version ge $first_released{$addr})
1911 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1918 # Here, the file exists. Some platforms may change the case of
1920 if ($seen_non_extracted_non_age) {
1921 if ($file =~ /$EXTRACTED/i) {
1922 Carp::my_carp_bug(join_lines(<<END
1923 $file should be processed just after the 'Prop...Alias' files, and before
1924 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
1925 have subtle problems
1930 elsif ($EXTRACTED_DIR
1931 && $first_released{$addr} ne v0
1932 && $file !~ /$EXTRACTED/i
1933 && lc($file) ne 'dage.txt')
1935 # We don't set this (by the 'if' above) if we have no
1936 # extracted directory, so if running on an early version,
1937 # this test won't work. Not worth worrying about.
1938 $seen_non_extracted_non_age = 1;
1941 # And mark the file as having being processed, and warn if it
1942 # isn't a file we are expecting. As we process the files,
1943 # they are deleted from the hash, so any that remain at the
1944 # end of the program are files that we didn't process.
1945 my $fkey = File::Spec->rel2abs($file);
1946 my $expecting = delete $potential_files{$fkey};
1947 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
1948 Carp::my_carp("Was not expecting '$file'.") if
1950 && ! defined $handle{$addr};
1952 # Having deleted from expected files, we can quit if not to do
1953 # anything. Don't print progress unless really want verbosity
1955 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
1959 # Open the file, converting the slashes used in this program
1960 # into the proper form for the OS
1962 if (not open $file_handle, "<", $file) {
1963 Carp::my_carp("Can't open $file. Skipping: $!");
1966 $handle{$addr} = $file_handle; # Cache the open file handle
1969 if ($verbosity >= $PROGRESS) {
1970 if ($progress_message{$addr}) {
1971 print "$progress_message{$addr}\n";
1974 # If using a virtual file, say so.
1975 print "Processing ", (-e $file)
1977 : "substitute $file",
1983 # Call any special handler for before the file.
1984 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
1986 # Then the main handler
1987 &{$handler{$addr}}($self);
1989 # Then any special post-file handler.
1990 &{$post_handler{$addr}}($self) if $post_handler{$addr};
1992 # If any errors have been accumulated, output the counts (as the first
1993 # error message in each class was output when it was encountered).
1994 if ($errors{$addr}) {
1997 foreach my $error (keys %{$errors{$addr}}) {
1998 $total += $errors{$addr}->{$error};
1999 delete $errors{$addr}->{$error};
2004 = "A total of $total lines had errors in $file. ";
2006 $message .= ($types == 1)
2007 ? '(Only the first one was displayed.)'
2008 : '(Only the first of each type was displayed.)';
2009 Carp::my_carp($message);
2013 if (@{$missings{$addr}}) {
2014 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2017 # If a real file handle, close it.
2018 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2020 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2021 # the file, as opposed to undef
2026 # Sets $_ to be the next logical input line, if any. Returns non-zero
2027 # if such a line exists. 'logical' means that any lines that have
2028 # been added via insert_lines() will be returned in $_ before the file
2032 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2034 my $addr; { no overloading; $addr = 0+$self; }
2036 # Here the file is open (or if the handle is not a ref, is an open
2037 # 'virtual' file). Get the next line; any inserted lines get priority
2038 # over the file itself.
2042 while (1) { # Loop until find non-comment, non-empty line
2043 #local $to_trace = 1 if main::DEBUG;
2044 my $inserted_ref = shift @{$added_lines{$addr}};
2045 if (defined $inserted_ref) {
2046 ($adjusted, $_) = @{$inserted_ref};
2047 trace $adjusted, $_ if main::DEBUG && $to_trace;
2048 return 1 if $adjusted;
2051 last if ! ref $handle{$addr}; # Don't read unless is real file
2052 last if ! defined ($_ = readline $handle{$addr});
2055 trace $_ if main::DEBUG && $to_trace;
2057 # See if this line is the comment line that defines what property
2058 # value that code points that are not listed in the file should
2059 # have. The format or existence of these lines is not guaranteed
2060 # by Unicode since they are comments, but the documentation says
2061 # that this was added for machine-readability, so probably won't
2062 # change. This works starting in Unicode Version 5.0. They look
2065 # @missing: 0000..10FFFF; Not_Reordered
2066 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2067 # @missing: 0000..10FFFF; ; NaN
2069 # Save the line for a later get_missings() call.
2070 if (/$missing_defaults_prefix/) {
2071 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2072 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2074 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2075 my @defaults = split /\s* ; \s*/x, $_;
2077 # The first field is the @missing, which ends in a
2078 # semi-colon, so can safely shift.
2081 # Some of these lines may have empty field placeholders
2082 # which get in the way. An example is:
2083 # @missing: 0000..10FFFF; ; NaN
2084 # Remove them. Process starting from the top so the
2085 # splice doesn't affect things still to be looked at.
2086 for (my $i = @defaults - 1; $i >= 0; $i--) {
2087 next if $defaults[$i] ne "";
2088 splice @defaults, $i, 1;
2091 # What's left should be just the property (maybe) and the
2092 # default. Having only one element means it doesn't have
2096 if (@defaults >= 1) {
2097 if (@defaults == 1) {
2098 $default = $defaults[0];
2101 $property = $defaults[0];
2102 $default = $defaults[1];
2108 || ($default =~ /^</
2109 && $default !~ /^<code *point>$/i
2110 && $default !~ /^<none>$/i))
2112 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2116 # If the property is missing from the line, it should
2117 # be the one for the whole file
2118 $property = $property{$addr} if ! defined $property;
2120 # Change <none> to the null string, which is what it
2121 # really means. If the default is the code point
2122 # itself, set it to <code point>, which is what
2123 # Unicode uses (but sometimes they've forgotten the
2125 if ($default =~ /^<none>$/i) {
2128 elsif ($default =~ /^<code *point>$/i) {
2129 $default = $CODE_POINT;
2132 # Store them as a sub-arrays with both components.
2133 push @{$missings{$addr}}, [ $default, $property ];
2137 # There is nothing for the caller to process on this comment
2142 # Remove comments and trailing space, and skip this line if the
2148 # Call any handlers for this line, and skip further processing of
2149 # the line if the handler sets the line to null.
2150 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2155 # Here the line is ok. return success.
2157 } # End of looping through lines.
2159 # If there is an EOF handler, call it (only once) and if it generates
2160 # more lines to process go back in the loop to handle them.
2161 if ($eof_handler{$addr}) {
2162 &{$eof_handler{$addr}}($self);
2163 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2164 goto LINE if $added_lines{$addr};
2167 # Return failure -- no more lines.
2172 # Not currently used, not fully tested.
2174 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2175 # # record. Not callable from an each_line_handler(), nor does it call
2176 # # an each_line_handler() on the line.
2179 # my $addr; { no overloading; $addr = 0+$self; }
2181 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2182 # my ($adjusted, $line) = @{$inserted_ref};
2183 # next if $adjusted;
2185 # # Remove comments and trailing space, and return a non-empty
2188 # $line =~ s/\s+$//;
2189 # return $line if $line ne "";
2192 # return if ! ref $handle{$addr}; # Don't read unless is real file
2193 # while (1) { # Loop until find non-comment, non-empty line
2194 # local $to_trace = 1 if main::DEBUG;
2195 # trace $_ if main::DEBUG && $to_trace;
2196 # return if ! defined (my $line = readline $handle{$addr});
2198 # push @{$added_lines{$addr}}, [ 0, $line ];
2201 # $line =~ s/\s+$//;
2202 # return $line if $line ne "";
2210 # Lines can be inserted so that it looks like they were in the input
2211 # file at the place it was when this routine is called. See also
2212 # insert_adjusted_lines(). Lines inserted via this routine go through
2213 # any each_line_handler()
2217 # Each inserted line is an array, with the first element being 0 to
2218 # indicate that this line hasn't been adjusted, and needs to be
2221 push @{$added_lines{0+$self}}, map { [ 0, $_ ] } @_;
2225 sub insert_adjusted_lines {
2226 # Lines can be inserted so that it looks like they were in the input
2227 # file at the place it was when this routine is called. See also
2228 # insert_lines(). Lines inserted via this routine are already fully
2229 # adjusted, ready to be processed; each_line_handler()s handlers will
2230 # not be called. This means this is not a completely general
2231 # facility, as only the last each_line_handler on the stack should
2232 # call this. It could be made more general, by passing to each of the
2233 # line_handlers their position on the stack, which they would pass on
2234 # to this routine, and that would replace the boolean first element in
2235 # the anonymous array pushed here, so that the next_line routine could
2236 # use that to call only those handlers whose index is after it on the
2237 # stack. But this is overkill for what is needed now.
2240 trace $_[0] if main::DEBUG && $to_trace;
2242 # Each inserted line is an array, with the first element being 1 to
2243 # indicate that this line has been adjusted
2245 push @{$added_lines{0+$self}}, map { [ 1, $_ ] } @_;
2250 # Returns the stored up @missings lines' values, and clears the list.
2251 # The values are in an array, consisting of the default in the first
2252 # element, and the property in the 2nd. However, since these lines
2253 # can be stacked up, the return is an array of all these arrays.
2256 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2258 my $addr; { no overloading; $addr = 0+$self; }
2260 # If not accepting a list return, just return the first one.
2261 return shift @{$missings{$addr}} unless wantarray;
2263 my @return = @{$missings{$addr}};
2264 undef @{$missings{$addr}};
2268 sub _insert_property_into_line {
2269 # Add a property field to $_, if this file requires it.
2272 my $addr; { no overloading; $addr = 0+$self; }
2273 my $property = $property{$addr};
2274 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2276 $_ =~ s/(;|$)/; $property$1/;
2281 # Output consistent error messages, using either a generic one, or the
2282 # one given by the optional parameter. To avoid gazillions of the
2283 # same message in case the syntax of a file is way off, this routine
2284 # only outputs the first instance of each message, incrementing a
2285 # count so the totals can be output at the end of the file.
2288 my $message = shift;
2289 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2291 my $addr; { no overloading; $addr = 0+$self; }
2293 $message = 'Unexpected line' unless $message;
2295 # No trailing punctuation so as to fit with our addenda.
2296 $message =~ s/[.:;,]$//;
2298 # If haven't seen this exact message before, output it now. Otherwise
2299 # increment the count of how many times it has occurred
2300 unless ($errors{$addr}->{$message}) {
2301 Carp::my_carp("$message in '$_' in "
2303 . " at line $.. Skipping this line;");
2304 $errors{$addr}->{$message} = 1;
2307 $errors{$addr}->{$message}++;
2310 # Clear the line to prevent any further (meaningful) processing of it.
2317 package Multi_Default;
2319 # Certain properties in early versions of Unicode had more than one possible
2320 # default for code points missing from the files. In these cases, one
2321 # default applies to everything left over after all the others are applied,
2322 # and for each of the others, there is a description of which class of code
2323 # points applies to it. This object helps implement this by storing the
2324 # defaults, and for all but that final default, an eval string that generates
2325 # the class that it applies to.
2330 main::setup_package();
2333 # The defaults structure for the classes
2334 main::set_access('class_defaults', \%class_defaults);
2337 # The default that applies to everything left over.
2338 main::set_access('other_default', \%other_default, 'r');
2342 # The constructor is called with default => eval pairs, terminated by
2343 # the left-over default. e.g.
2344 # Multi_Default->new(
2345 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2347 # 'R' => 'some other expression that evaluates to code points',
2355 my $self = bless \do{my $anonymous_scalar}, $class;
2356 my $addr; { no overloading; $addr = 0+$self; }
2359 my $default = shift;
2361 $class_defaults{$addr}->{$default} = $eval;
2364 $other_default{$addr} = shift;
2369 sub get_next_defaults {
2370 # Iterates and returns the next class of defaults.
2372 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2374 my $addr; { no overloading; $addr = 0+$self; }
2376 return each %{$class_defaults{$addr}};
2382 # An alias is one of the names that a table goes by. This class defines them
2383 # including some attributes. Everything is currently setup in the
2389 main::setup_package();
2392 main::set_access('name', \%name, 'r');
2395 # Determined by the constructor code if this name should match loosely or
2396 # not. The constructor parameters can override this, but it isn't fully
2397 # implemented, as should have ability to override Unicode one's via
2398 # something like a set_loose_match()
2399 main::set_access('loose_match', \%loose_match, 'r');
2402 # Some aliases should not get their own entries because they are covered
2403 # by a wild-card, and some we want to discourage use of. Binary
2404 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2407 # Aliases have a status, like deprecated, or even suppressed (which means
2408 # they don't appear in documentation). Enum
2409 main::set_access('status', \%status, 'r');
2412 # Similarly, some aliases should not be considered as usable ones for
2413 # external use, such as file names, or we don't want documentation to
2414 # recommend them. Boolean
2415 main::set_access('externally_ok', \%externally_ok, 'r');
2420 my $self = bless \do { my $anonymous_scalar }, $class;
2421 my $addr; { no overloading; $addr = 0+$self; }
2423 $name{$addr} = shift;
2424 $loose_match{$addr} = shift;
2425 $make_pod_entry{$addr} = shift;
2426 $externally_ok{$addr} = shift;
2427 $status{$addr} = shift;
2429 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2431 # Null names are never ok externally
2432 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2440 # A range is the basic unit for storing code points, and is described in the
2441 # comments at the beginning of the program. Each range has a starting code
2442 # point; an ending code point (not less than the starting one); a value
2443 # that applies to every code point in between the two end-points, inclusive;
2444 # and an enum type that applies to the value. The type is for the user's
2445 # convenience, and has no meaning here, except that a non-zero type is
2446 # considered to not obey the normal Unicode rules for having standard forms.
2448 # The same structure is used for both map and match tables, even though in the
2449 # latter, the value (and hence type) is irrelevant and could be used as a
2450 # comment. In map tables, the value is what all the code points in the range
2451 # map to. Type 0 values have the standardized version of the value stored as
2452 # well, so as to not have to recalculate it a lot.
2454 sub trace { return main::trace(@_); }
2458 main::setup_package();
2461 main::set_access('start', \%start, 'r', 's');
2464 main::set_access('end', \%end, 'r', 's');
2467 main::set_access('value', \%value, 'r');
2470 main::set_access('type', \%type, 'r');
2473 # The value in internal standard form. Defined only if the type is 0.
2474 main::set_access('standard_form', \%standard_form);
2476 # Note that if these fields change, the dump() method should as well
2479 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2482 my $self = bless \do { my $anonymous_scalar }, $class;
2483 my $addr; { no overloading; $addr = 0+$self; }
2485 $start{$addr} = shift;
2486 $end{$addr} = shift;
2490 my $value = delete $args{'Value'}; # Can be 0
2491 $value = "" unless defined $value;
2492 $value{$addr} = $value;
2494 $type{$addr} = delete $args{'Type'} || 0;
2496 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2498 if (! $type{$addr}) {
2499 $standard_form{$addr} = main::standardize($value);
2507 qw("") => "_operator_stringify",
2508 "." => \&main::_operator_dot,
2511 sub _operator_stringify {
2513 my $addr; { no overloading; $addr = 0+$self; }
2515 # Output it like '0041..0065 (value)'
2516 my $return = sprintf("%04X", $start{$addr})
2518 . sprintf("%04X", $end{$addr});
2519 my $value = $value{$addr};
2520 my $type = $type{$addr};
2522 $return .= "$value";
2523 $return .= ", Type=$type" if $type != 0;
2530 # The standard form is the value itself if the standard form is
2531 # undefined (that is if the value is special)
2534 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2536 my $addr; { no overloading; $addr = 0+$self; }
2538 return $standard_form{$addr} if defined $standard_form{$addr};
2539 return $value{$addr};
2543 # Human, not machine readable. For machine readable, comment out this
2544 # entire routine and let the standard one take effect.
2547 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2549 my $addr; { no overloading; $addr = 0+$self; }
2551 my $return = $indent
2552 . sprintf("%04X", $start{$addr})
2554 . sprintf("%04X", $end{$addr})
2555 . " '$value{$addr}';";
2556 if (! defined $standard_form{$addr}) {
2557 $return .= "(type=$type{$addr})";
2559 elsif ($standard_form{$addr} ne $value{$addr}) {
2560 $return .= "(standard '$standard_form{$addr}')";
2566 package _Range_List_Base;
2568 # Base class for range lists. A range list is simply an ordered list of
2569 # ranges, so that the ranges with the lowest starting numbers are first in it.
2571 # When a new range is added that is adjacent to an existing range that has the
2572 # same value and type, it merges with it to form a larger range.
2574 # Ranges generally do not overlap, except that there can be multiple entries
2575 # of single code point ranges. This is because of NameAliases.txt.
2577 # In this program, there is a standard value such that if two different
2578 # values, have the same standard value, they are considered equivalent. This
2579 # value was chosen so that it gives correct results on Unicode data
2581 # There are a number of methods to manipulate range lists, and some operators
2582 # are overloaded to handle them.
2584 sub trace { return main::trace(@_); }
2590 main::setup_package();
2593 # The list of ranges
2594 main::set_access('ranges', \%ranges, 'readable_array');
2597 # The highest code point in the list. This was originally a method, but
2598 # actual measurements said it was used a lot.
2599 main::set_access('max', \%max, 'r');
2601 my %each_range_iterator;
2602 # Iterator position for each_range()
2603 main::set_access('each_range_iterator', \%each_range_iterator);
2606 # Name of parent this is attached to, if any. Solely for better error
2608 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2610 my %_search_ranges_cache;
2611 # A cache of the previous result from _search_ranges(), for better
2613 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2619 # Optional initialization data for the range list.
2620 my $initialize = delete $args{'Initialize'};
2624 # Use _union() to initialize. _union() returns an object of this
2625 # class, which means that it will call this constructor recursively.
2626 # But it won't have this $initialize parameter so that it won't
2627 # infinitely loop on this.
2628 return _union($class, $initialize, %args) if defined $initialize;
2630 $self = bless \do { my $anonymous_scalar }, $class;
2631 my $addr; { no overloading; $addr = 0+$self; }
2633 # Optional parent object, only for debug info.
2634 $owner_name_of{$addr} = delete $args{'Owner'};
2635 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2637 # Stringify, in case it is an object.
2638 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2640 # This is used only for error messages, and so a colon is added
2641 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2643 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2645 # Max is initialized to a negative value that isn't adjacent to 0,
2649 $_search_ranges_cache{$addr} = 0;
2650 $ranges{$addr} = [];
2657 qw("") => "_operator_stringify",
2658 "." => \&main::_operator_dot,
2661 sub _operator_stringify {
2663 my $addr; { no overloading; $addr = 0+$self; }
2665 return "Range_List attached to '$owner_name_of{$addr}'"
2666 if $owner_name_of{$addr};
2667 return "anonymous Range_List " . \$self;
2671 # Returns the union of the input code points. It can be called as
2672 # either a constructor or a method. If called as a method, the result
2673 # will be a new() instance of the calling object, containing the union
2674 # of that object with the other parameter's code points; if called as
2675 # a constructor, the first parameter gives the class the new object
2676 # should be, and the second parameter gives the code points to go into
2678 # In either case, there are two parameters looked at by this routine;
2679 # any additional parameters are passed to the new() constructor.
2681 # The code points can come in the form of some object that contains
2682 # ranges, and has a conventionally named method to access them; or
2683 # they can be an array of individual code points (as integers); or
2684 # just a single code point.
2686 # If they are ranges, this routine doesn't make any effort to preserve
2687 # the range values of one input over the other. Therefore this base
2688 # class should not allow _union to be called from other than
2689 # initialization code, so as to prevent two tables from being added
2690 # together where the range values matter. The general form of this
2691 # routine therefore belongs in a derived class, but it was moved here
2692 # to avoid duplication of code. The failure to overload this in this
2693 # class keeps it safe.
2697 my @args; # Arguments to pass to the constructor
2701 # If a method call, will start the union with the object itself, and
2702 # the class of the new object will be the same as self.
2709 # Add the other required parameter.
2711 # Rest of parameters are passed on to the constructor
2713 # Accumulate all records from both lists.
2715 for my $arg (@args) {
2716 #local $to_trace = 0 if main::DEBUG;
2717 trace "argument = $arg" if main::DEBUG && $to_trace;
2718 if (! defined $arg) {
2720 if (defined $self) {
2722 $message .= $owner_name_of{0+$self};
2724 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2727 $arg = [ $arg ] if ! ref $arg;
2728 my $type = ref $arg;
2729 if ($type eq 'ARRAY') {
2730 foreach my $element (@$arg) {
2731 push @records, Range->new($element, $element);
2734 elsif ($arg->isa('Range')) {
2735 push @records, $arg;
2737 elsif ($arg->can('ranges')) {
2738 push @records, $arg->ranges;
2742 if (defined $self) {
2744 $message .= $owner_name_of{0+$self};
2746 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2751 # Sort with the range containing the lowest ordinal first, but if
2752 # two ranges start at the same code point, sort with the bigger range
2753 # of the two first, because it takes fewer cycles.
2754 @records = sort { ($a->start <=> $b->start)
2756 # if b is shorter than a, b->end will be
2757 # less than a->end, and we want to select
2758 # a, so want to return -1
2759 ($b->end <=> $a->end)
2762 my $new = $class->new(@_);
2764 # Fold in records so long as they add new information.
2765 for my $set (@records) {
2766 my $start = $set->start;
2767 my $end = $set->end;
2768 my $value = $set->value;
2769 if ($start > $new->max) {
2770 $new->_add_delete('+', $start, $end, $value);
2772 elsif ($end > $new->max) {
2773 $new->_add_delete('+', $new->max +1, $end, $value);
2780 sub range_count { # Return the number of ranges in the range list
2782 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2785 return scalar @{$ranges{0+$self}};
2789 # Returns the minimum code point currently in the range list, or if
2790 # the range list is empty, 2 beyond the max possible. This is a
2791 # method because used so rarely, that not worth saving between calls,
2792 # and having to worry about changing it as ranges are added and
2796 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2798 my $addr; { no overloading; $addr = 0+$self; }
2800 # If the range list is empty, return a large value that isn't adjacent
2801 # to any that could be in the range list, for simpler tests
2802 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2803 return $ranges{$addr}->[0]->start;
2807 # Boolean: Is argument in the range list? If so returns $i such that:
2808 # range[$i]->end < $codepoint <= range[$i+1]->end
2809 # which is one beyond what you want; this is so that the 0th range
2810 # doesn't return false
2812 my $codepoint = shift;
2813 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2815 my $i = $self->_search_ranges($codepoint);
2816 return 0 unless defined $i;
2818 # The search returns $i, such that
2819 # range[$i-1]->end < $codepoint <= range[$i]->end
2820 # So is in the table if and only iff it is at least the start position
2823 return 0 if $ranges{0+$self}->[$i]->start > $codepoint;
2828 # Returns the value associated with the code point, undef if none
2831 my $codepoint = shift;
2832 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2834 my $i = $self->contains($codepoint);
2837 # contains() returns 1 beyond where we should look
2839 return $ranges{0+$self}->[$i-1]->value;
2842 sub _search_ranges {
2843 # Find the range in the list which contains a code point, or where it
2844 # should go if were to add it. That is, it returns $i, such that:
2845 # range[$i-1]->end < $codepoint <= range[$i]->end
2846 # Returns undef if no such $i is possible (e.g. at end of table), or
2847 # if there is an error.
2850 my $code_point = shift;
2851 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2853 my $addr; { no overloading; $addr = 0+$self; }
2855 return if $code_point > $max{$addr};
2856 my $r = $ranges{$addr}; # The current list of ranges
2857 my $range_list_size = scalar @$r;
2860 use integer; # want integer division
2862 # Use the cached result as the starting guess for this one, because,
2863 # an experiment on 5.1 showed that 90% of the time the cache was the
2864 # same as the result on the next call (and 7% it was one less).
2865 $i = $_search_ranges_cache{$addr};
2866 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
2867 # from an intervening deletion
2868 #local $to_trace = 1 if main::DEBUG;
2869 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2870 return $i if $code_point <= $r->[$i]->end
2871 && ($i == 0 || $r->[$i-1]->end < $code_point);
2873 # Here the cache doesn't yield the correct $i. Try adding 1.
2874 if ($i < $range_list_size - 1
2875 && $r->[$i]->end < $code_point &&
2876 $code_point <= $r->[$i+1]->end)
2879 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2880 $_search_ranges_cache{$addr} = $i;
2884 # Here, adding 1 also didn't work. We do a binary search to
2885 # find the correct position, starting with current $i
2887 my $upper = $range_list_size - 1;
2889 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2891 if ($code_point <= $r->[$i]->end) {
2893 # Here we have met the upper constraint. We can quit if we
2894 # also meet the lower one.
2895 last if $i == 0 || $r->[$i-1]->end < $code_point;
2897 $upper = $i; # Still too high.
2902 # Here, $r[$i]->end < $code_point, so look higher up.
2906 # Split search domain in half to try again.
2907 my $temp = ($upper + $lower) / 2;
2909 # No point in continuing unless $i changes for next time
2913 # We can't reach the highest element because of the averaging.
2914 # So if one below the upper edge, force it there and try one
2916 if ($i == $range_list_size - 2) {
2918 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2919 $i = $range_list_size - 1;
2921 # Change $lower as well so if fails next time through,
2922 # taking the average will yield the same $i, and we will
2923 # quit with the error message just below.
2927 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
2931 } # End of while loop
2933 if (main::DEBUG && $to_trace) {
2934 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2935 trace "i= [ $i ]", $r->[$i];
2936 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2939 # Here we have found the offset. Cache it as a starting point for the
2941 $_search_ranges_cache{$addr} = $i;
2946 # Add, replace or delete ranges to or from a list. The $type
2947 # parameter gives which:
2948 # '+' => insert or replace a range, returning a list of any changed
2950 # '-' => delete a range, returning a list of any deleted ranges.
2952 # The next three parameters give respectively the start, end, and
2953 # value associated with the range. 'value' should be null unless the
2956 # The range list is kept sorted so that the range with the lowest
2957 # starting position is first in the list, and generally, adjacent
2958 # ranges with the same values are merged into single larger one (see
2959 # exceptions below).
2961 # There are more parameters, all are key => value pairs:
2962 # Type gives the type of the value. It is only valid for '+'.
2963 # All ranges have types; if this parameter is omitted, 0 is
2964 # assumed. Ranges with type 0 are assumed to obey the
2965 # Unicode rules for casing, etc; ranges with other types are
2966 # not. Otherwise, the type is arbitrary, for the caller's
2967 # convenience, and looked at only by this routine to keep
2968 # adjacent ranges of different types from being merged into
2969 # a single larger range, and when Replace =>
2970 # $IF_NOT_EQUIVALENT is specified (see just below).
2971 # Replace determines what to do if the range list already contains
2972 # ranges which coincide with all or portions of the input
2973 # range. It is only valid for '+':
2974 # => $NO means that the new value is not to replace
2975 # any existing ones, but any empty gaps of the
2976 # range list coinciding with the input range
2977 # will be filled in with the new value.
2978 # => $UNCONDITIONALLY means to replace the existing values with
2979 # this one unconditionally. However, if the
2980 # new and old values are identical, the
2981 # replacement is skipped to save cycles
2982 # => $IF_NOT_EQUIVALENT means to replace the existing values
2983 # with this one if they are not equivalent.
2984 # Ranges are equivalent if their types are the
2985 # same, and they are the same string, or if
2986 # both are type 0 ranges, if their Unicode
2987 # standard forms are identical. In this last
2988 # case, the routine chooses the more "modern"
2989 # one to use. This is because some of the
2990 # older files are formatted with values that
2991 # are, for example, ALL CAPs, whereas the
2992 # derived files have a more modern style,
2993 # which looks better. By looking for this
2994 # style when the pre-existing and replacement
2995 # standard forms are the same, we can move to
2997 # => $MULTIPLE means that if this range duplicates an
2998 # existing one, but has a different value,
2999 # don't replace the existing one, but insert
3000 # this, one so that the same range can occur
3002 # => anything else is the same as => $IF_NOT_EQUIVALENT
3004 # "same value" means identical for type-0 ranges, and it means having
3005 # the same standard forms for non-type-0 ranges.
3007 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3010 my $operation = shift; # '+' for add/replace; '-' for delete;
3017 $value = "" if not defined $value; # warning: $value can be "0"
3019 my $replace = delete $args{'Replace'};
3020 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3022 my $type = delete $args{'Type'};
3023 $type = 0 unless defined $type;
3025 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3027 my $addr; { no overloading; $addr = 0+$self; }
3029 if ($operation ne '+' && $operation ne '-') {
3030 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3033 unless (defined $start && defined $end) {
3034 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3037 unless ($end >= $start) {
3038 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3041 #local $to_trace = 1 if main::DEBUG;
3043 if ($operation eq '-') {
3044 if ($replace != $IF_NOT_EQUIVALENT) {
3045 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3046 $replace = $IF_NOT_EQUIVALENT;
3049 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3053 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3058 my $r = $ranges{$addr}; # The current list of ranges
3059 my $range_list_size = scalar @$r; # And its size
3060 my $max = $max{$addr}; # The current high code point in
3061 # the list of ranges
3063 # Do a special case requiring fewer machine cycles when the new range
3064 # starts after the current highest point. The Unicode input data is
3065 # structured so this is common.
3066 if ($start > $max) {
3068 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3069 return if $operation eq '-'; # Deleting a non-existing range is a
3072 # If the new range doesn't logically extend the current final one
3073 # in the range list, create a new range at the end of the range
3074 # list. (max cleverly is initialized to a negative number not
3075 # adjacent to 0 if the range list is empty, so even adding a range
3076 # to an empty range list starting at 0 will have this 'if'
3078 if ($start > $max + 1 # non-adjacent means can't extend.
3079 || @{$r}[-1]->value ne $value # values differ, can't extend.
3080 || @{$r}[-1]->type != $type # types differ, can't extend.
3082 push @$r, Range->new($start, $end,
3088 # Here, the new range starts just after the current highest in
3089 # the range list, and they have the same type and value.
3090 # Extend the current range to incorporate the new one.
3091 @{$r}[-1]->set_end($end);
3094 # This becomes the new maximum.
3099 #local $to_trace = 0 if main::DEBUG;
3101 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3103 # Here, the input range isn't after the whole rest of the range list.
3104 # Most likely 'splice' will be needed. The rest of the routine finds
3105 # the needed splice parameters, and if necessary, does the splice.
3106 # First, find the offset parameter needed by the splice function for
3107 # the input range. Note that the input range may span multiple
3108 # existing ones, but we'll worry about that later. For now, just find
3109 # the beginning. If the input range is to be inserted starting in a
3110 # position not currently in the range list, it must (obviously) come
3111 # just after the range below it, and just before the range above it.
3112 # Slightly less obviously, it will occupy the position currently
3113 # occupied by the range that is to come after it. More formally, we
3114 # are looking for the position, $i, in the array of ranges, such that:
3116 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3118 # (The ordered relationships within existing ranges are also shown in
3119 # the equation above). However, if the start of the input range is
3120 # within an existing range, the splice offset should point to that
3121 # existing range's position in the list; that is $i satisfies a
3122 # somewhat different equation, namely:
3124 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3126 # More briefly, $start can come before or after r[$i]->start, and at
3127 # this point, we don't know which it will be. However, these
3128 # two equations share these constraints:
3130 # r[$i-1]->end < $start <= r[$i]->end
3132 # And that is good enough to find $i.
3134 my $i = $self->_search_ranges($start);
3136 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3140 # The search function returns $i such that:
3142 # r[$i-1]->end < $start <= r[$i]->end
3144 # That means that $i points to the first range in the range list
3145 # that could possibly be affected by this operation. We still don't
3146 # know if the start of the input range is within r[$i], or if it
3147 # points to empty space between r[$i-1] and r[$i].
3148 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3150 # Special case the insertion of data that is not to replace any
3152 if ($replace == $NO) { # If $NO, has to be operation '+'
3153 #local $to_trace = 1 if main::DEBUG;
3154 trace "Doesn't replace" if main::DEBUG && $to_trace;
3156 # Here, the new range is to take effect only on those code points
3157 # that aren't already in an existing range. This can be done by
3158 # looking through the existing range list and finding the gaps in
3159 # the ranges that this new range affects, and then calling this
3160 # function recursively on each of those gaps, leaving untouched
3161 # anything already in the list. Gather up a list of the changed
3162 # gaps first so that changes to the internal state as new ranges
3163 # are added won't be a problem.
3166 # First, if the starting point of the input range is outside an
3167 # existing one, there is a gap from there to the beginning of the
3168 # existing range -- add a span to fill the part that this new
3170 if ($start < $r->[$i]->start) {
3171 push @gap_list, Range->new($start,
3173 $r->[$i]->start - 1),
3175 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3178 # Then look through the range list for other gaps until we reach
3179 # the highest range affected by the input one.
3181 for ($j = $i+1; $j < $range_list_size; $j++) {
3182 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3183 last if $end < $r->[$j]->start;
3185 # If there is a gap between when this range starts and the
3186 # previous one ends, add a span to fill it. Note that just
3187 # because there are two ranges doesn't mean there is a
3188 # non-zero gap between them. It could be that they have
3189 # different values or types
3190 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3192 Range->new($r->[$j-1]->end + 1,
3193 $r->[$j]->start - 1,
3195 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3199 # Here, we have either found an existing range in the range list,
3200 # beyond the area affected by the input one, or we fell off the
3201 # end of the loop because the input range affects the whole rest
3202 # of the range list. In either case, $j is 1 higher than the
3203 # highest affected range. If $j == $i, it means that there are no
3204 # affected ranges, that the entire insertion is in the gap between
3205 # r[$i-1], and r[$i], which we already have taken care of before
3207 # On the other hand, if there are affected ranges, it might be
3208 # that there is a gap that needs filling after the final such
3209 # range to the end of the input range
3210 if ($r->[$j-1]->end < $end) {
3211 push @gap_list, Range->new(main::max($start,
3212 $r->[$j-1]->end + 1),
3215 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3218 # Call recursively to fill in all the gaps.
3219 foreach my $gap (@gap_list) {
3220 $self->_add_delete($operation,
3230 # Here, we have taken care of the case where $replace is $NO, which
3231 # means that whatever action we now take is done unconditionally. It
3232 # still could be that this call will result in a no-op, if duplicates
3233 # aren't allowed, and we are inserting a range that merely duplicates
3234 # data already in the range list; or also if deleting a non-existent
3236 # $i still points to the first potential affected range. Now find the
3237 # highest range affected, which will determine the length parameter to
3238 # splice. (The input range can span multiple existing ones.) While
3239 # we are looking through the range list, see also if this is an
3240 # insertion that will change the values of at least one of the
3241 # affected ranges. We don't need to do this check unless this is an
3242 # insertion of non-multiples, and also since this is a boolean, we
3243 # don't need to do it if have already determined that it will make a
3244 # change; just unconditionally change them. $cdm is created to be 1
3245 # if either of these is true. (The 'c' in the name comes from below)
3246 my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3247 my $j; # This will point to the highest affected range
3249 # For non-zero types, the standard form is the value itself;
3250 my $standard_form = ($type) ? $value : main::standardize($value);
3252 for ($j = $i; $j < $range_list_size; $j++) {
3253 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3255 # If find a range that it doesn't overlap into, we can stop
3257 last if $end < $r->[$j]->start;
3259 # Here, overlaps the range at $j. If the value's don't match,
3260 # and this is supposedly an insertion, it becomes a change
3261 # instead. This is what the 'c' stands for in $cdm.
3263 if ($r->[$j]->standard_form ne $standard_form) {
3268 # Here, the two values are essentially the same. If the
3269 # two are actually identical, replacing wouldn't change
3270 # anything so skip it.
3271 my $pre_existing = $r->[$j]->value;
3272 if ($pre_existing ne $value) {
3274 # Here the new and old standardized values are the
3275 # same, but the non-standardized values aren't. If
3276 # replacing unconditionally, then replace
3277 if( $replace == $UNCONDITIONALLY) {
3282 # Here, are replacing conditionally. Decide to
3283 # replace or not based on which appears to look
3284 # the "nicest". If one is mixed case and the
3285 # other isn't, choose the mixed case one.
3286 my $new_mixed = $value =~ /[A-Z]/
3287 && $value =~ /[a-z]/;
3288 my $old_mixed = $pre_existing =~ /[A-Z]/
3289 && $pre_existing =~ /[a-z]/;
3291 if ($old_mixed != $new_mixed) {
3292 $cdm = 1 if $new_mixed;
3293 if (main::DEBUG && $to_trace) {
3295 trace "Replacing $pre_existing with $value";
3298 trace "Retaining $pre_existing over $value";
3304 # Here casing wasn't different between the two.
3305 # If one has hyphens or underscores and the
3306 # other doesn't, choose the one with the
3308 my $new_punct = $value =~ /[-_]/;
3309 my $old_punct = $pre_existing =~ /[-_]/;
3311 if ($old_punct != $new_punct) {
3312 $cdm = 1 if $new_punct;
3313 if (main::DEBUG && $to_trace) {
3315 trace "Replacing $pre_existing with $value";
3318 trace "Retaining $pre_existing over $value";
3321 } # else existing one is just as "good";
3322 # retain it to save cycles.
3328 } # End of loop looking for highest affected range.
3330 # Here, $j points to one beyond the highest range that this insertion
3331 # affects (hence to beyond the range list if that range is the final
3332 # one in the range list).
3334 # The splice length is all the affected ranges. Get it before
3335 # subtracting, for efficiency, so we don't have to later add 1.
3336 my $length = $j - $i;
3338 $j--; # $j now points to the highest affected range.
3339 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3341 # If inserting a multiple record, this is where it goes, after all the
3342 # existing ones for this range. This implies an insertion, and no
3343 # change to any existing ranges. Note that $j can be -1 if this new
3344 # range doesn't actually duplicate any existing, and comes at the
3345 # beginning of the list, in which case we can handle it like any other
3346 # insertion, and is easier to do so.
3347 if ($replace == $MULTIPLE && $j >= 0) {
3349 # This restriction could be remedied with a little extra work, but
3350 # it won't hopefully ever be necessary
3351 if ($r->[$j]->start != $r->[$j]->end) {
3352 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
3356 # Don't add an exact duplicate, as it isn't really a multiple
3357 return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3359 trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3360 my @return = splice @$r,
3367 if (main::DEBUG && $to_trace) {
3368 trace "After splice:";
3369 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3370 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3371 trace "j =[", $j, "]", $r->[$j] if $j >= 0;
3372 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3373 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3374 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3379 # Here, have taken care of $NO and $MULTIPLE replaces.
3380 # $j points to the highest affected range. But it can be < $i or even
3381 # -1. These happen only if the insertion is entirely in the gap
3382 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3383 # above exited first time through with $end < $r->[$i]->start. (And
3384 # then we subtracted one from j) This implies also that $start <
3385 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3386 # $start, so the entire input range is in the gap.
3389 # Here the entire input range is in the gap before $i.
3391 if (main::DEBUG && $to_trace) {
3393 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3396 trace "Entire range is before $r->[$i]";
3399 return if $operation ne '+'; # Deletion of a non-existent range is
3404 # Here the entire input range is not in the gap before $i. There
3405 # is an affected one, and $j points to the highest such one.
3407 # At this point, here is the situation:
3408 # This is not an insertion of a multiple, nor of tentative ($NO)
3410 # $i points to the first element in the current range list that
3411 # may be affected by this operation. In fact, we know
3412 # that the range at $i is affected because we are in
3413 # the else branch of this 'if'
3414 # $j points to the highest affected range.
3416 # r[$i-1]->end < $start <= r[$i]->end
3418 # r[$i-1]->end < $start <= $end <= r[$j]->end
3421 # $cdm is a boolean which is set true if and only if this is a
3422 # change or deletion (multiple was handled above). In
3423 # other words, it could be renamed to be just $cd.
3425 # We now have enough information to decide if this call is a no-op
3426 # or not. It is a no-op if it is a deletion of a non-existent
3427 # range, or an insertion of already existing data.
3429 if (main::DEBUG && $to_trace && ! $cdm
3431 && $start >= $r->[$i]->start)
3435 return if ! $cdm # change or delete => not no-op
3436 && $i == $j # more than one affected range => not no-op
3438 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3439 # Further, $start and/or $end is >= r[$i]->start
3440 # The test below hence guarantees that
3441 # r[$i]->start < $start <= $end <= r[$i]->end
3442 # This means the input range is contained entirely in
3443 # the one at $i, so is a no-op
3444 && $start >= $r->[$i]->start;
3447 # Here, we know that some action will have to be taken. We have
3448 # calculated the offset and length (though adjustments may be needed)
3449 # for the splice. Now start constructing the replacement list.
3451 my $splice_start = $i;
3456 # See if should extend any adjacent ranges.
3457 if ($operation eq '-') { # Don't extend deletions
3458 $extends_below = $extends_above = 0;
3460 else { # Here, should extend any adjacent ranges. See if there are
3462 $extends_below = ($i > 0
3463 # can't extend unless adjacent
3464 && $r->[$i-1]->end == $start -1
3465 # can't extend unless are same standard value
3466 && $r->[$i-1]->standard_form eq $standard_form
3467 # can't extend unless share type
3468 && $r->[$i-1]->type == $type);
3469 $extends_above = ($j+1 < $range_list_size
3470 && $r->[$j+1]->start == $end +1
3471 && $r->[$j+1]->standard_form eq $standard_form
3472 && $r->[$j-1]->type == $type);
3474 if ($extends_below && $extends_above) { # Adds to both
3475 $splice_start--; # start replace at element below
3476 $length += 2; # will replace on both sides
3477 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3479 # The result will fill in any gap, replacing both sides, and
3480 # create one large range.
3481 @replacement = Range->new($r->[$i-1]->start,
3488 # Here we know that the result won't just be the conglomeration of
3489 # a new range with both its adjacent neighbors. But it could
3490 # extend one of them.
3492 if ($extends_below) {
3494 # Here the new element adds to the one below, but not to the
3495 # one above. If inserting, and only to that one range, can
3496 # just change its ending to include the new one.
3497 if ($length == 0 && ! $cdm) {
3498 $r->[$i-1]->set_end($end);
3499 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3503 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3504 $splice_start--; # start replace at element below
3505 $length++; # will replace the element below
3506 $start = $r->[$i-1]->start;
3509 elsif ($extends_above) {
3511 # Here the new element adds to the one above, but not below.
3512 # Mirror the code above
3513 if ($length == 0 && ! $cdm) {
3514 $r->[$j+1]->set_start($start);
3515 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3519 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3520 $length++; # will replace the element above
3521 $end = $r->[$j+1]->end;
3525 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3527 # Finally, here we know there will have to be a splice.
3528 # If the change or delete affects only the highest portion of the
3529 # first affected range, the range will have to be split. The
3530 # splice will remove the whole range, but will replace it by a new
3531 # range containing just the unaffected part. So, in this case,
3532 # add to the replacement list just this unaffected portion.
3533 if (! $extends_below
3534 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3537 Range->new($r->[$i]->start,
3539 Value => $r->[$i]->value,
3540 Type => $r->[$i]->type);
3543 # In the case of an insert or change, but not a delete, we have to
3544 # put in the new stuff; this comes next.
3545 if ($operation eq '+') {
3546 push @replacement, Range->new($start,
3552 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3553 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3555 # And finally, if we're changing or deleting only a portion of the
3556 # highest affected range, it must be split, as the lowest one was.
3557 if (! $extends_above
3558 && $j >= 0 # Remember that j can be -1 if before first
3560 && $end >= $r->[$j]->start
3561 && $end < $r->[$j]->end)
3564 Range->new($end + 1,
3566 Value => $r->[$j]->value,
3567 Type => $r->[$j]->type);
3571 # And do the splice, as calculated above
3572 if (main::DEBUG && $to_trace) {
3573 trace "replacing $length element(s) at $i with ";
3574 foreach my $replacement (@replacement) {
3575 trace " $replacement";
3577 trace "Before splice:";
3578 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3579 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3580 trace "i =[", $i, "]", $r->[$i];
3581 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3582 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3585 my @return = splice @$r, $splice_start, $length, @replacement;
3587 if (main::DEBUG && $to_trace) {
3588 trace "After splice:";
3589 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3590 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3591 trace "i =[", $i, "]", $r->[$i];
3592 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3593 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3594 trace "removed @return";
3597 # An actual deletion could have changed the maximum in the list.
3598 # There was no deletion if the splice didn't return something, but
3599 # otherwise recalculate it. This is done too rarely to worry about
3601 if ($operation eq '-' && @return) {
3602 $max{$addr} = $r->[-1]->end;
3607 sub reset_each_range { # reset the iterator for each_range();
3609 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3612 undef $each_range_iterator{0+$self};
3617 # Iterate over each range in a range list. Results are undefined if
3618 # the range list is changed during the iteration.
3621 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3623 my $addr; { no overloading; $addr = 0+$self; }
3625 return if $self->is_empty;
3627 $each_range_iterator{$addr} = -1
3628 if ! defined $each_range_iterator{$addr};
3629 $each_range_iterator{$addr}++;
3630 return $ranges{$addr}->[$each_range_iterator{$addr}]
3631 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3632 undef $each_range_iterator{$addr};
3636 sub count { # Returns count of code points in range list
3638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3640 my $addr; { no overloading; $addr = 0+$self; }
3643 foreach my $range (@{$ranges{$addr}}) {
3644 $count += $range->end - $range->start + 1;
3649 sub delete_range { # Delete a range
3654 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3656 return $self->_add_delete('-', $start, $end, "");
3659 sub is_empty { # Returns boolean as to if a range list is empty
3661 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3664 return scalar @{$ranges{0+$self}} == 0;
3668 # Quickly returns a scalar suitable for separating tables into
3669 # buckets, i.e. it is a hash function of the contents of a table, so
3670 # there are relatively few conflicts.
3673 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3675 my $addr; { no overloading; $addr = 0+$self; }
3677 # These are quickly computable. Return looks like 'min..max;count'
3678 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3680 } # End closure for _Range_List_Base
3683 use base '_Range_List_Base';
3685 # A Range_List is a range list for match tables; i.e. the range values are
3686 # not significant. Thus a number of operations can be safely added to it,
3687 # such as inversion, intersection. Note that union is also an unsafe
3688 # operation when range values are cared about, and that method is in the base
3689 # class, not here. But things are set up so that that method is callable only
3690 # during initialization. Only in this derived class, is there an operation
3691 # that combines two tables. A Range_Map can thus be used to initialize a
3692 # Range_List, and its mappings will be in the list, but are not significant to
3695 sub trace { return main::trace(@_); }
3701 '+' => sub { my $self = shift;
3704 return $self->_union($other)
3706 '&' => sub { my $self = shift;
3709 return $self->_intersect($other, 0);
3716 # Returns a new Range_List that gives all code points not in $self.
3720 my $new = Range_List->new;
3722 # Go through each range in the table, finding the gaps between them
3723 my $max = -1; # Set so no gap before range beginning at 0
3724 for my $range ($self->ranges) {
3725 my $start = $range->start;
3726 my $end = $range->end;
3728 # If there is a gap before this range, the inverse will contain
3730 if ($start > $max + 1) {
3731 $new->add_range($max + 1, $start - 1);
3736 # And finally, add the gap from the end of the table to the max
3737 # possible code point
3738 if ($max < $LAST_UNICODE_CODEPOINT) {
3739 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3745 # Returns a new Range_List with the argument deleted from it. The
3746 # argument can be a single code point, a range, or something that has
3747 # a range, with the _range_list() method on it returning them
3751 my $reversed = shift;
3752 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3755 Carp::my_carp_bug("Can't cope with a "
3757 . " being the second parameter in a '-'. Subtraction ignored.");
3761 my $new = Range_List->new(Initialize => $self);
3763 if (! ref $other) { # Single code point
3764 $new->delete_range($other, $other);
3766 elsif ($other->isa('Range')) {
3767 $new->delete_range($other->start, $other->end);
3769 elsif ($other->can('_range_list')) {
3770 foreach my $range ($other->_range_list->ranges) {
3771 $new->delete_range($range->start, $range->end);
3775 Carp::my_carp_bug("Can't cope with a "
3777 . " argument to '-'. Subtraction ignored."
3786 # Returns either a boolean giving whether the two inputs' range lists
3787 # intersect (overlap), or a new Range_List containing the intersection
3788 # of the two lists. The optional final parameter being true indicates
3789 # to do the check instead of the intersection.
3791 my $a_object = shift;
3792 my $b_object = shift;
3793 my $check_if_overlapping = shift;
3794 $check_if_overlapping = 0 unless defined $check_if_overlapping;
3795 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3797 if (! defined $b_object) {
3799 $message .= $a_object->_owner_name_of if defined $a_object;
3800 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
3804 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3805 # Thus the intersection could be much more simply be written:
3806 # return ~(~$a_object + ~$b_object);
3807 # But, this is slower, and when taking the inverse of a large
3808 # range_size_1 table, back when such tables were always stored that
3809 # way, it became prohibitively slow, hence the code was changed to the
3812 if ($b_object->isa('Range')) {
3813 $b_object = Range_List->new(Initialize => $b_object,
3814 Owner => $a_object->_owner_name_of);
3816 $b_object = $b_object->_range_list if $b_object->can('_range_list');
3818 my @a_ranges = $a_object->ranges;
3819 my @b_ranges = $b_object->ranges;
3821 #local $to_trace = 1 if main::DEBUG;
3822 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3824 # Start with the first range in each list
3826 my $range_a = $a_ranges[$a_i];
3828 my $range_b = $b_ranges[$b_i];
3830 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3831 if ! $check_if_overlapping;
3833 # If either list is empty, there is no intersection and no overlap
3834 if (! defined $range_a || ! defined $range_b) {
3835 return $check_if_overlapping ? 0 : $new;
3837 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3839 # Otherwise, must calculate the intersection/overlap. Start with the
3840 # very first code point in each list
3841 my $a = $range_a->start;
3842 my $b = $range_b->start;
3844 # Loop through all the ranges of each list; in each iteration, $a and
3845 # $b are the current code points in their respective lists
3848 # If $a and $b are the same code point, ...
3851 # it means the lists overlap. If just checking for overlap
3852 # know the answer now,
3853 return 1 if $check_if_overlapping;
3855 # The intersection includes this code point plus anything else
3856 # common to both current ranges.
3858 my $end = main::min($range_a->end, $range_b->end);
3859 if (! $check_if_overlapping) {
3860 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3861 $new->add_range($start, $end);
3864 # Skip ahead to the end of the current intersect
3867 # If the current intersect ends at the end of either range (as
3868 # it must for at least one of them), the next possible one
3869 # will be the beginning code point in it's list's next range.
3870 if ($a == $range_a->end) {
3871 $range_a = $a_ranges[++$a_i];
3872 last unless defined $range_a;
3873 $a = $range_a->start;
3875 if ($b == $range_b->end) {
3876 $range_b = $b_ranges[++$b_i];
3877 last unless defined $range_b;
3878 $b = $range_b->start;
3881 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3885 # Not equal, but if the range containing $a encompasses $b,
3886 # change $a to be the middle of the range where it does equal
3887 # $b, so the next iteration will get the intersection
3888 if ($range_a->end >= $b) {
3893 # Here, the current range containing $a is entirely below
3894 # $b. Go try to find a range that could contain $b.
3895 $a_i = $a_object->_search_ranges($b);
3897 # If no range found, quit.
3898 last unless defined $a_i;
3900 # The search returns $a_i, such that
3901 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3902 # Set $a to the beginning of this new range, and repeat.
3903 $range_a = $a_ranges[$a_i];
3904 $a = $range_a->start;
3907 else { # Here, $b < $a.
3909 # Mirror image code to the leg just above
3910 if ($range_b->end >= $a) {
3914 $b_i = $b_object->_search_ranges($a);
3915 last unless defined $b_i;
3916 $range_b = $b_ranges[$b_i];
3917 $b = $range_b->start;
3920 } # End of looping through ranges.
3922 # Intersection fully computed, or now know that there is no overlap
3923 return $check_if_overlapping ? 0 : $new;
3927 # Returns boolean giving whether the two arguments overlap somewhere
3931 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3933 return $self->_intersect($other, 1);
3937 # Add a range to the list.
3942 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3944 return $self->_add_delete('+', $start, $end, "");
3947 my $non_ASCII = (ord('A') != 65); # Assumes test on same platform
3949 sub is_code_point_usable {
3950 # This used only for making the test script. See if the input
3951 # proposed trial code point is one that Perl will handle. If second
3952 # parameter is 0, it won't select some code points for various
3953 # reasons, noted below.
3956 my $try_hard = shift;
3957 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3959 return 0 if $code < 0; # Never use a negative
3961 # For non-ASCII, we shun the characters that don't have Perl encoding-
3962 # independent symbols for them. 'A' is such a symbol, so is "\n".
3963 return $try_hard if $non_ASCII
3966 || ($code >= 0x0E && $code <= 0x1F)
3967 || ($code >= 0x01 && $code <= 0x06)
3970 # shun null. I'm (khw) not sure why this was done, but NULL would be
3971 # the character very frequently used.
3972 return $try_hard if $code == 0x0000;
3974 return 0 if $try_hard; # XXX Temporary until fix utf8.c
3976 # shun non-character code points.
3977 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3978 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3980 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
3981 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3986 sub get_valid_code_point {
3987 # Return a code point that's part of the range list. Returns nothing
3988 # if the table is empty or we can't find a suitable code point. This
3989 # used only for making the test script.
3992 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3994 my $addr; { no overloading; $addr = 0+$self; }
3996 # On first pass, don't choose less desirable code points; if no good
3997 # one is found, repeat, allowing a less desirable one to be selected.
3998 for my $try_hard (0, 1) {
4000 # Look through all the ranges for a usable code point.
4001 for my $set ($self->ranges) {
4003 # Try the edge cases first, starting with the end point of the
4005 my $end = $set->end;
4006 return $end if is_code_point_usable($end, $try_hard);
4008 # End point didn't, work. Start at the beginning and try
4009 # every one until find one that does work.
4010 for my $trial ($set->start .. $end - 1) {
4011 return $trial if is_code_point_usable($trial, $try_hard);
4015 return (); # If none found, give up.
4018 sub get_invalid_code_point {
4019 # Return a code point that's not part of the table. Returns nothing
4020 # if the table covers all code points or a suitable code point can't
4021 # be found. This used only for making the test script.
4024 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4026 # Just find a valid code point of the inverse, if any.
4027 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4029 } # end closure for Range_List
4032 use base '_Range_List_Base';
4034 # A Range_Map is a range list in which the range values (called maps) are
4035 # significant, and hence shouldn't be manipulated by our other code, which
4036 # could be ambiguous or lose things. For example, in taking the union of two
4037 # lists, which share code points, but which have differing values, which one
4038 # has precedence in the union?
4039 # It turns out that these operations aren't really necessary for map tables,
4040 # and so this class was created to make sure they aren't accidentally
4046 # Add a range containing a mapping value to the list
4049 # Rest of parameters passed on
4051 return $self->_add_delete('+', @_);
4055 # Adds entry to a range list which can duplicate an existing entry
4058 my $code_point = shift;
4060 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4062 return $self->add_map($code_point, $code_point,
4063 $value, Replace => $MULTIPLE);
4065 } # End of closure for package Range_Map
4067 package _Base_Table;
4069 # A table is the basic data structure that gets written out into a file for
4070 # use by the Perl core. This is the abstract base class implementing the
4071 # common elements from the derived ones. A list of the methods to be
4072 # furnished by an implementing class is just after the constructor.
4074 sub standardize { return main::standardize($_[0]); }
4075 sub trace { return main::trace(@_); }
4079 main::setup_package();
4082 # Object containing the ranges of the table.
4083 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4086 # The full table name.
4087 main::set_access('full_name', \%full_name, 'r');
4090 # The table name, almost always shorter
4091 main::set_access('name', \%name, 'r');
4094 # The shortest of all the aliases for this table, with underscores removed
4095 main::set_access('short_name', \%short_name);
4097 my %nominal_short_name_length;
4098 # The length of short_name before removing underscores
4099 main::set_access('nominal_short_name_length',
4100 \%nominal_short_name_length);
4103 # The complete name, including property.
4104 main::set_access('complete_name', \%complete_name, 'r');
4107 # Parent property this table is attached to.
4108 main::set_access('property', \%property, 'r');
4111 # Ordered list of aliases of the table's name. The first ones in the list
4112 # are output first in comments
4113 main::set_access('aliases', \%aliases, 'readable_array');
4116 # A comment associated with the table for human readers of the files
4117 main::set_access('comment', \%comment, 's');
4120 # A comment giving a short description of the table's meaning for human
4121 # readers of the files.
4122 main::set_access('description', \%description, 'readable_array');
4125 # A comment giving a short note about the table for human readers of the
4127 main::set_access('note', \%note, 'readable_array');
4130 # Boolean; if set means any file that contains this table is marked as for
4131 # internal-only use.
4132 main::set_access('internal_only', \%internal_only);
4134 my %find_table_from_alias;
4135 # The parent property passes this pointer to a hash which this class adds
4136 # all its aliases to, so that the parent can quickly take an alias and
4138 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4141 # After this table is made equivalent to another one; we shouldn't go
4142 # changing the contents because that could mean it's no longer equivalent
4143 main::set_access('locked', \%locked, 'r');
4146 # This gives the final path to the file containing the table. Each
4147 # directory in the path is an element in the array
4148 main::set_access('file_path', \%file_path, 'readable_array');
4151 # What is the table's status, normal, $OBSOLETE, etc. Enum
4152 main::set_access('status', \%status, 'r');
4155 # A comment about its being obsolete, or whatever non normal status it has
4156 main::set_access('status_info', \%status_info, 'r');
4159 # Is the table to be output with each range only a single code point?
4160 # This is done to avoid breaking existing code that may have come to rely
4161 # on this behavior in previous versions of this program.)
4162 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4165 # A boolean set iff this table is a Perl extension to the Unicode
4167 main::set_access('perl_extension', \%perl_extension, 'r');
4169 my %output_range_counts;
4170 # A boolean set iff this table is to have comments written in the
4171 # output file that contain the number of code points in the range.
4172 # The constructor can override the global flag of the same name.
4173 main::set_access('output_range_counts', \%output_range_counts, 'r');
4176 # All arguments are key => value pairs, which you can see below, most
4177 # of which match fields documented above. Otherwise: Pod_Entry,
4178 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4179 # documented in the Alias package
4181 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4185 my $self = bless \do { my $anonymous_scalar }, $class;
4186 my $addr; { no overloading; $addr = 0+$self; }
4190 $name{$addr} = delete $args{'Name'};
4191 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4192 $full_name{$addr} = delete $args{'Full_Name'};
4193 my $complete_name = $complete_name{$addr}
4194 = delete $args{'Complete_Name'};
4195 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4196 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4197 $property{$addr} = delete $args{'_Property'};
4198 $range_list{$addr} = delete $args{'_Range_List'};
4199 $status{$addr} = delete $args{'Status'} || $NORMAL;
4200 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4201 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4203 my $description = delete $args{'Description'};
4204 my $externally_ok = delete $args{'Externally_Ok'};
4205 my $loose_match = delete $args{'Fuzzy'};
4206 my $note = delete $args{'Note'};
4207 my $make_pod_entry = delete $args{'Pod_Entry'};
4208 my $perl_extension = delete $args{'Perl_Extension'};
4210 # Shouldn't have any left over
4211 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4213 # Can't use || above because conceivably the name could be 0, and
4214 # can't use // operator in case this program gets used in Perl 5.8
4215 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4216 $output_range_counts{$addr} = $output_range_counts if
4217 ! defined $output_range_counts{$addr};
4219 $aliases{$addr} = [ ];
4220 $comment{$addr} = [ ];
4221 $description{$addr} = [ ];
4223 $file_path{$addr} = [ ];
4224 $locked{$addr} = "";
4226 push @{$description{$addr}}, $description if $description;
4227 push @{$note{$addr}}, $note if $note;
4229 if ($status{$addr} eq $PLACEHOLDER) {
4231 # A placeholder table doesn't get documented, is a perl extension,
4232 # and quite likely will be empty
4233 $make_pod_entry = 0 if ! defined $make_pod_entry;
4234 $perl_extension = 1 if ! defined $perl_extension;
4235 push @tables_that_may_be_empty, $complete_name{$addr};
4237 elsif (! $status{$addr}) {
4239 # If hasn't set its status already, see if it is on one of the
4240 # lists of properties or tables that have particular statuses; if
4241 # not, is normal. The lists are prioritized so the most serious
4242 # ones are checked first
4243 if (exists $why_suppressed{$complete_name}) {
4244 $status{$addr} = $SUPPRESSED;
4246 elsif (exists $why_deprecated{$complete_name}) {
4247 $status{$addr} = $DEPRECATED;
4249 elsif (exists $why_stabilized{$complete_name}) {
4250 $status{$addr} = $STABILIZED;
4252 elsif (exists $why_obsolete{$complete_name}) {
4253 $status{$addr} = $OBSOLETE;
4256 # Existence above doesn't necessarily mean there is a message
4257 # associated with it. Use the most serious message.
4258 if ($status{$addr}) {
4259 if ($why_suppressed{$complete_name}) {
4261 = $why_suppressed{$complete_name};
4263 elsif ($why_deprecated{$complete_name}) {
4265 = $why_deprecated{$complete_name};
4267 elsif ($why_stabilized{$complete_name}) {
4269 = $why_stabilized{$complete_name};
4271 elsif ($why_obsolete{$complete_name}) {
4273 = $why_obsolete{$complete_name};
4278 $perl_extension{$addr} = $perl_extension || 0;
4280 # By convention what typically gets printed only or first is what's
4281 # first in the list, so put the full name there for good output
4282 # clarity. Other routines rely on the full name being first on the
4284 $self->add_alias($full_name{$addr},
4285 Externally_Ok => $externally_ok,
4286 Fuzzy => $loose_match,
4287 Pod_Entry => $make_pod_entry,
4288 Status => $status{$addr},
4291 # Then comes the other name, if meaningfully different.
4292 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4293 $self->add_alias($name{$addr},
4294 Externally_Ok => $externally_ok,
4295 Fuzzy => $loose_match,
4296 Pod_Entry => $make_pod_entry,
4297 Status => $status{$addr},
4304 # Here are the methods that are required to be defined by any derived
4310 # append_to_body and pre_body are called in the write() method
4311 # to add stuff after the main body of the table, but before
4312 # its close; and to prepend stuff before the beginning of the
4317 Carp::my_carp_bug( __LINE__
4318 . ": Must create method '$sub()' for "
4326 "." => \&main::_operator_dot,
4327 '!=' => \&main::_operator_not_equal,
4328 '==' => \&main::_operator_equal,
4332 # Returns the array of ranges associated with this table.
4335 return $range_list{0+shift}->ranges;
4339 # Add a synonym for this table.
4341 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4344 my $name = shift; # The name to add.
4345 my $pointer = shift; # What the alias hash should point to. For
4346 # map tables, this is the parent property;
4347 # for match tables, it is the table itself.
4350 my $loose_match = delete $args{'Fuzzy'};
4352 my $make_pod_entry = delete $args{'Pod_Entry'};
4353 $make_pod_entry = $YES unless defined $make_pod_entry;
4355 my $externally_ok = delete $args{'Externally_Ok'};
4356 $externally_ok = 1 unless defined $externally_ok;
4358 my $status = delete $args{'Status'};
4359 $status = $NORMAL unless defined $status;
4361 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4363 # Capitalize the first letter of the alias unless it is one of the CJK
4364 # ones which specifically begins with a lower 'k'. Do this because
4365 # Unicode has varied whether they capitalize first letters or not, and
4366 # have later changed their minds and capitalized them, but not the
4367 # other way around. So do it always and avoid changes from release to
4369 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4371 my $addr; { no overloading; $addr = 0+$self; }
4373 # Figure out if should be loosely matched if not already specified.
4374 if (! defined $loose_match) {
4376 # Is a loose_match if isn't null, and doesn't begin with an
4377 # underscore and isn't just a number
4379 && substr($name, 0, 1) ne '_'
4380 && $name !~ qr{^[0-9_.+-/]+$})
4389 # If this alias has already been defined, do nothing.
4390 return if defined $find_table_from_alias{$addr}->{$name};
4392 # That includes if it is standardly equivalent to an existing alias,
4393 # in which case, add this name to the list, so won't have to search
4395 my $standard_name = main::standardize($name);
4396 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4397 $find_table_from_alias{$addr}->{$name}
4398 = $find_table_from_alias{$addr}->{$standard_name};
4402 # Set the index hash for this alias for future quick reference.
4403 $find_table_from_alias{$addr}->{$name} = $pointer;
4404 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4405 local $to_trace = 0 if main::DEBUG;
4406 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4407 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4410 # Put the new alias at the end of the list of aliases unless the final
4411 # element begins with an underscore (meaning it is for internal perl
4412 # use) or is all numeric, in which case, put the new one before that
4413 # one. This floats any all-numeric or underscore-beginning aliases to
4414 # the end. This is done so that they are listed last in output lists,
4415 # to encourage the user to use a better name (either more descriptive
4416 # or not an internal-only one) instead. This ordering is relied on
4417 # implicitly elsewhere in this program, like in short_name()
4418 my $list = $aliases{$addr};
4419 my $insert_position = (@$list == 0
4420 || (substr($list->[-1]->name, 0, 1) ne '_'
4421 && $list->[-1]->name =~ /\D/))
4427 Alias->new($name, $loose_match, $make_pod_entry,
4428 $externally_ok, $status);
4430 # This name may be shorter than any existing ones, so clear the cache
4431 # of the shortest, so will have to be recalculated.
4433 undef $short_name{0+$self};
4438 # Returns a name suitable for use as the base part of a file name.
4439 # That is, shorter wins. It can return undef if there is no suitable
4440 # name. The name has all non-essential underscores removed.
4442 # The optional second parameter is a reference to a scalar in which
4443 # this routine will store the length the returned name had before the
4444 # underscores were removed, or undef if the return is undef.
4446 # The shortest name can change if new aliases are added. So using
4447 # this should be deferred until after all these are added. The code
4448 # that does that should clear this one's cache.
4449 # Any name with alphabetics is preferred over an all numeric one, even
4453 my $nominal_length_ptr = shift;
4454 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4456 my $addr; { no overloading; $addr = 0+$self; }
4458 # For efficiency, don't recalculate, but this means that adding new
4459 # aliases could change what the shortest is, so the code that does
4460 # that needs to undef this.
4461 if (defined $short_name{$addr}) {
4462 if ($nominal_length_ptr) {
4463 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4465 return $short_name{$addr};
4468 # Look at each alias
4469 foreach my $alias ($self->aliases()) {
4471 # Don't use an alias that isn't ok to use for an external name.
4472 next if ! $alias->externally_ok;
4474 my $name = main::Standardize($alias->name);
4475 trace $self, $name if main::DEBUG && $to_trace;
4477 # Take the first one, or a shorter one that isn't numeric. This
4478 # relies on numeric aliases always being last in the array
4479 # returned by aliases(). Any alpha one will have precedence.
4480 if (! defined $short_name{$addr}
4482 && length($name) < length($short_name{$addr})))
4484 # Remove interior underscores.
4485 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4487 $nominal_short_name_length{$addr} = length $name;
4491 # If no suitable external name return undef
4492 if (! defined $short_name{$addr}) {
4493 $$nominal_length_ptr = undef if $nominal_length_ptr;
4497 # Don't allow a null external name.
4498 if ($short_name{$addr} eq "") {
4499 $short_name{$addr} = '_';
4500 $nominal_short_name_length{$addr} = 1;
4503 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4505 if ($nominal_length_ptr) {
4506 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4508 return $short_name{$addr};
4512 # Returns the external name that this table should be known by. This
4513 # is usually the short_name, but not if the short_name is undefined.
4516 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4518 my $short = $self->short_name;
4519 return $short if defined $short;
4524 sub add_description { # Adds the parameter as a short description.
4527 my $description = shift;
4529 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4532 push @{$description{0+$self}}, $description;
4537 sub add_note { # Adds the parameter as a short note.
4542 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4545 push @{$note{0+$self}}, $note;
4550 sub add_comment { # Adds the parameter as a comment.
4553 my $comment = shift;
4554 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4559 push @{$comment{0+$self}}, $comment;
4565 # Return the current comment for this table. If called in list
4566 # context, returns the array of comments. In scalar, returns a string
4567 # of each element joined together with a period ending each.
4570 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4572 my $addr; { no overloading; $addr = 0+$self; }
4573 my @list = @{$comment{$addr}};
4574 return @list if wantarray;
4576 foreach my $sentence (@list) {
4577 $return .= '. ' if $return;
4578 $return .= $sentence;
4581 $return .= '.' if $return;
4586 # Initialize the table with the argument which is any valid
4587 # initialization for range lists.
4590 my $addr; { no overloading; $addr = 0+$self; }
4591 my $initialization = shift;
4592 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4594 # Replace the current range list with a new one of the same exact
4596 my $class = ref $range_list{$addr};
4597 $range_list{$addr} = $class->new(Owner => $self,
4598 Initialize => $initialization);
4604 # The header that is output for the table in the file it is written
4608 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4611 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4614 $return .= $INTERNAL_ONLY if $internal_only{0+$self};
4619 # Write a representation of the table to its file.
4622 my $tab_stops = shift; # The number of tab stops over to put any
4624 my $suppress_value = shift; # Optional, if the value associated with
4625 # a range equals this one, don't write
4627 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4629 my $addr; { no overloading; $addr = 0+$self; }
4631 # Start with the header
4632 my @OUT = $self->header;
4635 push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4638 # Then any pre-body stuff.
4639 my $pre_body = $self->pre_body;
4640 push @OUT, $pre_body, "\n" if $pre_body;
4642 # The main body looks like a 'here' document
4643 push @OUT, "return <<'END';\n";
4645 if ($range_list{$addr}->is_empty) {
4647 # This is a kludge for empty tables to silence a warning in
4648 # utf8.c, which can't really deal with empty tables, but it can
4649 # deal with a table that matches nothing, as the inverse of 'Any'
4651 push @OUT, "!utf8::IsAny\n";
4654 my $range_size_1 = $range_size_1{$addr};
4656 # Output each range as part of the here document.
4657 for my $set ($range_list{$addr}->ranges) {
4658 my $start = $set->start;
4659 my $end = $set->end;
4660 my $value = $set->value;
4662 # Don't output ranges whose value is the one to suppress
4663 next if defined $suppress_value && $value eq $suppress_value;
4665 # If has or wants a single point range output
4666 if ($start == $end || $range_size_1) {
4667 for my $i ($start .. $end) {
4668 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4672 push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4674 # Add a comment with the size of the range, if requested.
4675 # Expand Tabs to make sure they all start in the same
4676 # column, and then unexpand to use mostly tabs.
4677 if (! $output_range_counts{$addr}) {
4681 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4682 my $count = main::clarify_number($end - $start + 1);
4685 my $width = $tab_stops * 8 - 1;
4686 $OUT[-1] = sprintf("%-*s # [%s]\n",
4690 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4693 } # End of loop through all the table's ranges
4696 # Add anything that goes after the main body, but within the here
4698 my $append_to_body = $self->append_to_body;
4699 push @OUT, $append_to_body if $append_to_body;
4701 # And finish the here document.
4704 # All these files have a .pl suffix
4705 $file_path{$addr}->[-1] .= '.pl';
4707 main::write($file_path{$addr}, \@OUT);
4711 sub set_status { # Set the table's status
4713 my $status = shift; # The status enum value
4714 my $info = shift; # Any message associated with it.
4715 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4717 my $addr; { no overloading; $addr = 0+$self; }
4719 $status{$addr} = $status;
4720 $status_info{$addr} = $info;
4725 # Don't allow changes to the table from now on. This stores a stack
4726 # trace of where it was called, so that later attempts to modify it
4727 # can immediately show where it got locked.
4730 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4732 my $addr; { no overloading; $addr = 0+$self; }
4734 $locked{$addr} = "";
4736 my $line = (caller(0))[2];
4739 # Accumulate the stack trace
4741 my ($pkg, $file, $caller_line, $caller) = caller $i++;
4743 last unless defined $caller;
4745 $locked{$addr} .= " called from $caller() at line $line\n";
4746 $line = $caller_line;
4748 $locked{$addr} .= " called from main at line $line\n";
4753 sub carp_if_locked {
4754 # Return whether a table is locked or not, and, by the way, complain
4758 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4760 my $addr; { no overloading; $addr = 0+$self; }
4762 return 0 if ! $locked{$addr};
4763 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4767 sub set_file_path { # Set the final directory path for this table
4769 # Rest of parameters passed on
4772 @{$file_path{0+$self}} = @_;
4776 # Accessors for the range list stored in this table. First for
4796 return $range_list{0+$self}->$sub(@_);
4800 # Then for ones that should fail if locked
4810 return if $self->carp_if_locked;
4812 return $range_list{0+$self}->$sub(@_);
4819 use base '_Base_Table';
4821 # A Map Table is a table that contains the mappings from code points to
4822 # values. There are two weird cases:
4823 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4824 # are written in the table's file at the end of the table nonetheless. It
4825 # requires specially constructed code to handle these; utf8.c can not read
4826 # these in, so they should not go in $map_directory. As of this writing,
4827 # the only case that these happen is for named sequences used in
4828 # charnames.pm. But this code doesn't enforce any syntax on these, so
4829 # something else could come along that uses it.
4830 # 2) Specials are anything that doesn't fit syntactically into the body of the
4831 # table. The ranges for these have a map type of non-zero. The code below
4832 # knows about and handles each possible type. In most cases, these are
4833 # written as part of the header.
4835 # A map table deliberately can't be manipulated at will unlike match tables.
4836 # This is because of the ambiguities having to do with what to do with
4837 # overlapping code points. And there just isn't a need for those things;
4838 # what one wants to do is just query, add, replace, or delete mappings, plus
4839 # write the final result.
4840 # However, there is a method to get the list of possible ranges that aren't in
4841 # this table to use for defaulting missing code point mappings. And,
4842 # map_add_or_replace_non_nulls() does allow one to add another table to this
4843 # one, but it is clearly very specialized, and defined that the other's
4844 # non-null values replace this one's if there is any overlap.
4846 sub trace { return main::trace(@_); }
4850 main::setup_package();
4853 # Many input files omit some entries; this gives what the mapping for the
4854 # missing entries should be
4855 main::set_access('default_map', \%default_map, 'r');
4857 my %anomalous_entries;
4858 # Things that go in the body of the table which don't fit the normal
4859 # scheme of things, like having a range. Not much can be done with these
4860 # once there except to output them. This was created to handle named
4862 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4863 main::set_access('anomalous_entries', # Append singular, read plural
4864 \%anomalous_entries,
4868 # The format of the entries of the table. This is calculated from the
4869 # data in the table (or passed in the constructor). This is an enum e.g.,
4871 main::set_access('format', \%format);
4874 # This is a string, solely for documentation, indicating how one can get
4875 # access to this property via the Perl core.
4876 main::set_access('core_access', \%core_access, 'r', 's');
4879 # Boolean set when non-zero map-type ranges are added to this table,
4880 # which happens in only a few tables. This is purely for performance, to
4881 # avoid having to search through every table upon output, so if all the
4882 # non-zero maps got deleted before output, this would remain set, and the
4883 # only penalty would be performance. Currently, most map tables that get
4884 # output have specials in them, so this doesn't help that much anyway.
4885 main::set_access('has_specials', \%has_specials);
4888 # Boolean as to whether or not to write out this map table
4889 main::set_access('to_output_map', \%to_output_map, 's');
4898 # Optional initialization data for the table.
4899 my $initialize = delete $args{'Initialize'};
4901 my $core_access = delete $args{'Core_Access'};
4902 my $default_map = delete $args{'Default_Map'};
4903 my $format = delete $args{'Format'};
4904 my $property = delete $args{'_Property'};
4905 my $full_name = delete $args{'Full_Name'};
4906 # Rest of parameters passed on
4908 my $range_list = Range_Map->new(Owner => $property);
4910 my $self = $class->SUPER::new(
4912 Complete_Name => $full_name,
4913 Full_Name => $full_name,
4914 _Property => $property,
4915 _Range_List => $range_list,
4918 my $addr; { no overloading; $addr = 0+$self; }
4920 $anomalous_entries{$addr} = [];
4921 $core_access{$addr} = $core_access;
4922 $default_map{$addr} = $default_map;
4923 $format{$addr} = $format;
4925 $self->initialize($initialize) if defined $initialize;
4932 qw("") => "_operator_stringify",
4935 sub _operator_stringify {
4938 my $name = $self->property->full_name;
4939 $name = '""' if $name eq "";
4940 return "Map table for Property '$name'";
4944 # Add a synonym for this table (which means the property itself)
4947 # Rest of parameters passed on.
4949 $self->SUPER::add_alias($name, $self->property, @_);
4954 # Add a range of code points to the list of specially-handled code
4955 # points. $MULTI_CP is assumed if the type of special is not passed
4964 my $type = delete $args{'Type'} || 0;
4965 # Rest of parameters passed on
4967 # Can't change the table if locked.
4968 return if $self->carp_if_locked;
4970 my $addr; { no overloading; $addr = 0+$self; }
4972 $has_specials{$addr} = 1 if $type;
4974 $self->_range_list->add_map($lower, $upper,
4981 sub append_to_body {
4982 # Adds to the written HERE document of the table's body any anomalous
4983 # entries in the table..
4986 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4988 my $addr; { no overloading; $addr = 0+$self; }
4990 return "" unless @{$anomalous_entries{$addr}};
4991 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
4994 sub map_add_or_replace_non_nulls {
4995 # This adds the mappings in the table $other to $self. Non-null
4996 # mappings from $other override those in $self. It essentially merges
4997 # the two tables, with the second having priority except for null
5002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5004 return if $self->carp_if_locked;
5006 if (! $other->isa(__PACKAGE__)) {
5007 Carp::my_carp_bug("$other should be a "
5015 my $addr; { no overloading; $addr = 0+$self; }
5016 my $other_addr; { no overloading; $other_addr = 0+$other; }
5018 local $to_trace = 0 if main::DEBUG;
5020 my $self_range_list = $self->_range_list;
5021 my $other_range_list = $other->_range_list;
5022 foreach my $range ($other_range_list->ranges) {
5023 my $value = $range->value;
5024 next if $value eq "";
5025 $self_range_list->_add_delete('+',
5029 Type => $range->type,
5030 Replace => $UNCONDITIONALLY);
5033 # Copy the specials information from the other table to $self
5034 if ($has_specials{$other_addr}) {
5035 $has_specials{$addr} = 1;
5041 sub set_default_map {
5042 # Define what code points that are missing from the input files should
5047 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5049 my $addr; { no overloading; $addr = 0+$self; }
5051 # Convert the input to the standard equivalent, if any (won't have any
5052 # for $STRING properties)
5053 my $standard = $self->_find_table_from_alias->{$map};
5054 $map = $standard->name if defined $standard;
5056 # Warn if there already is a non-equivalent default map for this
5057 # property. Note that a default map can be a ref, which means that
5058 # what it actually means is delayed until later in the program, and it
5059 # IS permissible to override it here without a message.
5060 my $default_map = $default_map{$addr};
5061 if (defined $default_map
5062 && ! ref($default_map)
5063 && $default_map ne $map
5064 && main::Standardize($map) ne $default_map)
5066 my $property = $self->property;
5067 my $map_table = $property->table($map);
5068 my $default_table = $property->table($default_map);
5069 if (defined $map_table
5070 && defined $default_table
5071 && $map_table != $default_table)
5073 Carp::my_carp("Changing the default mapping for "
5075 . " from $default_map to $map'");
5079 $default_map{$addr} = $map;
5081 # Don't also create any missing table for this map at this point,
5082 # because if we did, it could get done before the main table add is
5083 # done for PropValueAliases.txt; instead the caller will have to make
5084 # sure it exists, if desired.
5089 # Returns boolean: should we write this map table?
5092 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5094 my $addr; { no overloading; $addr = 0+$self; }
5096 # If overridden, use that
5097 return $to_output_map{$addr} if defined $to_output_map{$addr};
5099 my $full_name = $self->full_name;
5101 # If table says to output, do so; if says to suppress it, do do.
5102 return 1 if grep { $_ eq $full_name } @output_mapped_properties;
5103 return 0 if $self->status eq $SUPPRESSED;
5105 my $type = $self->property->type;
5107 # Don't want to output binary map tables even for debugging.
5108 return 0 if $type == $BINARY;
5110 # But do want to output string ones.
5111 return 1 if $type == $STRING;
5113 # Otherwise is an $ENUM, don't output it
5118 # Returns a Range_List that is gaps of the current table. That is,
5122 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5124 my $current = Range_List->new(Initialize => $self->_range_list,
5125 Owner => $self->property);
5129 sub set_final_comment {
5130 # Just before output, create the comment that heads the file
5131 # containing this table.
5134 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5136 # No sense generating a comment if aren't going to write it out.
5137 return if ! $self->to_output_map;
5139 my $addr; { no overloading; $addr = 0+$self; }
5141 my $property = $self->property;
5143 # Get all the possible names for this property. Don't use any that
5144 # aren't ok for use in a file name, etc. This is perhaps causing that
5145 # flag to do double duty, and may have to be changed in the future to
5146 # have our own flag for just this purpose; but it works now to exclude
5147 # Perl generated synonyms from the lists for properties, where the
5148 # name is always the proper Unicode one.
5149 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5151 my $count = $self->count;
5152 my $default_map = $default_map{$addr};
5154 # The ranges that map to the default aren't output, so subtract that
5155 # to get those actually output. A property with matching tables
5156 # already has the information calculated.
5157 if ($property->type != $STRING) {
5158 $count -= $property->table($default_map)->count;
5160 elsif (defined $default_map) {
5162 # But for $STRING properties, must calculate now. Subtract the
5163 # count from each range that maps to the default.
5164 foreach my $range ($self->_range_list->ranges) {
5165 if ($range->value eq $default_map) {
5166 $count -= $range->end +1 - $range->start;
5172 # Get a string version of $count with underscores in large numbers,
5174 my $string_count = main::clarify_number($count);
5176 my $code_points = ($count == 1)
5177 ? 'single code point'
5178 : "$string_count code points";
5183 if (@property_aliases <= 1) {
5184 $mapping = 'mapping';
5185 $these_mappings = 'this mapping';
5189 $mapping = 'synonymous mappings';
5190 $these_mappings = 'these mappings';
5194 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5195 $cp = "any code point in Unicode Version $string_version";
5199 if ($default_map eq "") {
5200 $map_to = 'the null string';
5202 elsif ($default_map eq $CODE_POINT) {
5206 $map_to = "'$default_map'";
5209 $cp = "the single code point";
5212 $cp = "one of the $code_points";
5214 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5219 my $status = $self->status;
5221 my $warn = uc $status_past_participles{$status};
5224 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5225 All property or property=value combinations contained in this file are $warn.
5226 See $unicode_reference_url for what this means.
5230 $comment .= "This file returns the $mapping:\n";
5232 for my $i (0 .. @property_aliases - 1) {
5233 $comment .= sprintf("%-8s%s\n",
5235 $property_aliases[$i]->name . '(cp)'
5239 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5241 my $access = $core_access{$addr};
5243 $comment .= "accessible through the Perl core via $access.";
5246 $comment .= "not accessible through the Perl core directly.";
5249 # And append any commentary already set from the actual property.
5250 $comment .= "\n\n" . $self->comment if $self->comment;
5251 if ($self->description) {
5252 $comment .= "\n\n" . join " ", $self->description;
5255 $comment .= "\n\n" . join " ", $self->note;
5259 if (! $self->perl_extension) {
5262 For information about what this property really means, see:
5263 $unicode_reference_url
5267 if ($count) { # Format differs for empty table
5268 $comment.= "\nThe format of the ";
5269 if ($self->range_size_1) {
5271 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5272 is in hex; MAPPING is what CODE_POINT maps to.
5277 # There are tables which end up only having one element per
5278 # range, but it is not worth keeping track of for making just
5279 # this comment a little better.
5281 non-comment portions of the main body of lines of this file is:
5282 START\\tSTOP\\tMAPPING where START is the starting code point of the
5283 range, in hex; STOP is the ending point, or if omitted, the range has just one
5284 code point; MAPPING is what each code point between START and STOP maps to.
5286 if ($self->output_range_counts) {
5288 Numbers in comments in [brackets] indicate how many code points are in the
5289 range (omitted when the range is a single code point or if the mapping is to
5295 $self->set_comment(main::join_lines($comment));
5299 my %swash_keys; # Makes sure don't duplicate swash names.
5302 # Returns the string that should be output in the file before the main
5303 # body of this table. This includes some hash entries identifying the
5304 # format of the body, and what the single value should be for all
5305 # ranges missing from it. It also includes any code points which have
5306 # map_types that don't go in the main table.
5309 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5311 my $addr; { no overloading; $addr = 0+$self; }
5313 my $name = $self->property->swash_name;
5315 if (defined $swash_keys{$name}) {
5316 Carp::my_carp(join_lines(<<END
5317 Already created a swash name '$name' for $swash_keys{$name}. This means that
5318 the same name desired for $self shouldn't be used. Bad News. This must be
5319 fixed before production use, but proceeding anyway
5323 $swash_keys{$name} = "$self";
5325 my $default_map = $default_map{$addr};
5328 if ($has_specials{$addr}) {
5330 # Here, some maps with non-zero type have been added to the table.
5331 # Go through the table and handle each of them. None will appear
5332 # in the body of the table, so delete each one as we go. The
5333 # code point count has already been calculated, so ok to delete
5336 my @multi_code_point_maps;
5337 my $has_hangul_syllables = 0;
5339 # The key is the base name of the code point, and the value is an
5340 # array giving all the ranges that use this base name. Each range
5341 # is actually a hash giving the 'low' and 'high' values of it.
5342 my %names_ending_in_code_point;
5344 # Inverse mapping. The list of ranges that have these kinds of
5345 # names. Each element contains the low, high, and base names in a
5347 my @code_points_ending_in_code_point;
5349 my $range_map = $self->_range_list;
5350 foreach my $range ($range_map->ranges) {
5351 next unless $range->type != 0;
5352 my $low = $range->start;
5353 my $high = $range->end;
5354 my $map = $range->value;
5355 my $type = $range->type;
5357 # No need to output the range if it maps to the default. And
5358 # the write method won't output it either, so no need to
5359 # delete it to keep it from being output, and is faster to
5360 # skip than to delete anyway.
5361 next if $map eq $default_map;
5363 # Delete the range to keep write() from trying to output it
5364 $range_map->delete_range($low, $high);
5366 # Switch based on the map type...
5367 if ($type == $HANGUL_SYLLABLE) {
5369 # These are entirely algorithmically determinable based on
5370 # some constants furnished by Unicode; for now, just set a
5371 # flag to indicate that have them. Below we will output
5372 # the code that does the algorithm.
5373 $has_hangul_syllables = 1;
5375 elsif ($type == $CP_IN_NAME) {
5377 # If the name ends in the code point it represents, are
5378 # also algorithmically determinable, but need information
5379 # about the map to do so. Both the map and its inverse
5380 # are stored in data structures output in the file.
5381 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5382 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5384 push @code_points_ending_in_code_point, { low => $low,
5389 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5391 # Multi-code point maps and null string maps have an entry
5392 # for each code point in the range. They use the same
5394 for my $code_point ($low .. $high) {
5396 # The pack() below can't cope with surrogates.
5397 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5398 Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
5402 # Generate the hash entries for these in the form that
5403 # utf8.c understands.
5405 foreach my $to (split " ", $map) {
5406 if ($to !~ /^$code_point_re$/) {
5407 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5410 $tostr .= sprintf "\\x{%s}", $to;
5413 # I (khw) have never waded through this line to
5414 # understand it well enough to comment it.
5415 my $utf8 = sprintf(qq["%s" => "$tostr",],
5416 join("", map { sprintf "\\x%02X", $_ }
5417 unpack("U0C*", pack("U", $code_point))));
5419 # Add a comment so that a human reader can more easily
5420 # see what's going on.
5421 push @multi_code_point_maps,
5422 sprintf("%-45s # U+%04X => %s", $utf8,
5428 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead");
5429 $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5431 } # End of loop through all ranges
5433 # Here have gone through the whole file. If actually generated
5434 # anything for each map type, add its respective header and
5436 if (@multi_code_point_maps) {
5439 # Some code points require special handling because their mappings are each to
5440 # multiple code points. These do not appear in the main body, but are defined
5441 # in the hash below.
5443 # The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
5444 %utf8::ToSpec$name = (
5446 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5449 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5451 # Convert these structures to output format.
5452 my $code_points_ending_in_code_point =
5453 main::simple_dumper(\@code_points_ending_in_code_point,
5455 my $names = main::simple_dumper(\%names_ending_in_code_point,
5458 # Do the same with the Hangul names,
5464 if ($has_hangul_syllables) {
5466 # Construct a regular expression of all the possible
5467 # combinations of the Hangul syllables.
5468 my @L_re; # Leading consonants
5469 for my $i ($LBase .. $LBase + $LCount - 1) {
5470 push @L_re, $Jamo{$i}
5472 my @V_re; # Middle vowels
5473 for my $i ($VBase .. $VBase + $VCount - 1) {
5474 push @V_re, $Jamo{$i}
5476 my @T_re; # Trailing consonants
5477 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5478 push @T_re, $Jamo{$i}
5481 # The whole re is made up of the L V T combination.
5483 . join ('|', sort @L_re)
5485 . join ('|', sort @V_re)
5487 . join ('|', sort @T_re)
5490 # These hashes needed by the algorithm were generated
5491 # during reading of the Jamo.txt file
5492 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5493 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5494 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5495 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5500 # To achieve significant memory savings when this file is read in,
5501 # algorithmically derivable code points are omitted from the main body below.
5502 # Instead, the following routines can be used to translate between name and
5503 # code point and vice versa
5507 # Matches legal code point. 4-6 hex numbers, If there are 6, the
5508 # first two must be '10'; if there are 5, the first must not be a '0'.
5509 my \$code_point_re = qr/$code_point_re/;
5511 # In the following hash, the keys are the bases of names which includes
5512 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
5513 # of each key is another hash which is used to get the low and high ends
5514 # for each range of code points that apply to the name
5515 my %names_ending_in_code_point = (
5519 # And the following array gives the inverse mapping from code points to
5520 # names. Lowest code points are first
5521 my \@code_points_ending_in_code_point = (
5522 $code_points_ending_in_code_point
5525 # Earlier releases didn't have Jamos. No sense outputting
5526 # them unless will be used.
5527 if ($has_hangul_syllables) {
5530 # Convert from code point to Jamo short name for use in composing Hangul
5536 # Leading consonant (can be null)
5546 # Optional trailing consonant
5551 # Computed re that splits up a Hangul name into LVT or LV syllables
5552 my \$syllable_re = qr/$jamo_re/;
5554 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5555 my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5557 # These constants names and values were taken from the Unicode standard,
5558 # version 5.1, section 3.12. They are used in conjunction with Hangul
5560 my \$SBase = 0xAC00;
5561 my \$LBase = 0x1100;
5562 my \$VBase = 0x1161;
5563 my \$TBase = 0x11A7;
5564 my \$SCount = 11172;
5568 my \$NCount = \$VCount * \$TCount;
5570 } # End of has Jamos
5572 $pre_body .= << 'END';
5574 sub name_to_code_point_special {
5577 # Returns undef if not one of the specially handled names; otherwise
5578 # returns the code point equivalent to the input name
5580 if ($has_hangul_syllables) {
5581 $pre_body .= << 'END';
5583 if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5584 $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5585 return if $name !~ qr/^$syllable_re$/;
5586 my $L = $Jamo_L{$1};
5587 my $V = $Jamo_V{$2};
5588 my $T = (defined $3) ? $Jamo_T{$3} : 0;
5589 return ($L * $VCount + $V) * $TCount + $T + $SBase;
5593 $pre_body .= << 'END';
5595 # Name must end in '-code_point' for this to handle.
5596 if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5601 my $code_point = CORE::hex $2;
5603 # Name must be one of the ones which has the code point in it.
5604 return if ! $names_ending_in_code_point{$base};
5606 # Look through the list of ranges that apply to this name to see if
5607 # the code point is in one of them.
5608 for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5609 return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5610 next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5612 # Here, the code point is in the range.
5616 # Here, looked like the name had a code point number in it, but
5617 # did not match one of the valid ones.
5621 sub code_point_to_name_special {
5622 my $code_point = shift;
5624 # Returns the name of a code point if algorithmically determinable;
5627 if ($has_hangul_syllables) {
5628 $pre_body .= << 'END';
5630 # If in the Hangul range, calculate the name based on Unicode's
5632 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5634 my $SIndex = $code_point - $SBase;
5635 my $L = $LBase + $SIndex / $NCount;
5636 my $V = $VBase + ($SIndex % $NCount) / $TCount;
5637 my $T = $TBase + $SIndex % $TCount;
5638 $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
5639 $name .= $Jamo{$T} if $T != $TBase;
5644 $pre_body .= << 'END';
5646 # Look through list of these code points for one in range.
5647 foreach my $hash (@code_points_ending_in_code_point) {
5648 return if $code_point < $hash->{'low'};
5649 if ($code_point <= $hash->{'high'}) {
5650 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5653 return; # None found
5658 } # End of has hangul or code point in name maps.
5659 } # End of has specials
5661 # Calculate the format of the table if not already done.
5662 my $format = $format{$addr};
5663 my $property = $self->property;
5664 my $type = $property->type;
5665 if (! defined $format) {
5666 if ($type == $BINARY) {
5668 # Don't bother checking the values, because we elsewhere
5669 # verify that a binary table has only 2 values.
5670 $format = $BINARY_FORMAT;
5673 my @ranges = $self->_range_list->ranges;
5675 # default an empty table based on its type and default map
5678 # But it turns out that the only one we can say is a
5679 # non-string (besides binary, handled above) is when the
5680 # table is a string and the default map is to a code point
5681 if ($type == $STRING && $default_map eq $CODE_POINT) {
5682 $format = $HEX_FORMAT;
5685 $format = $STRING_FORMAT;
5690 # Start with the most restrictive format, and as we find
5691 # something that doesn't fit with that, change to the next
5692 # most restrictive, and so on.
5693 $format = $DECIMAL_FORMAT;
5694 foreach my $range (@ranges) {
5695 my $map = $range->value;
5696 if ($map ne $default_map) {
5697 last if $format eq $STRING_FORMAT; # already at
5700 $format = $INTEGER_FORMAT
5701 if $format eq $DECIMAL_FORMAT
5702 && $map !~ / ^ [0-9] $ /x;
5703 $format = $FLOAT_FORMAT
5704 if $format eq $INTEGER_FORMAT
5705 && $map !~ / ^ -? [0-9]+ $ /x;
5706 $format = $RATIONAL_FORMAT
5707 if $format eq $FLOAT_FORMAT
5708 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5709 $format = $HEX_FORMAT
5710 if $format eq $RATIONAL_FORMAT
5711 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5712 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5713 && $map =~ /[^0-9A-F]/;
5718 } # end of calculating format
5721 # The name this swash is to be known by, with the format of the mappings in
5722 # the main body of the table, and what all code points missing from this file
5724 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5726 my $missing = $default_map;
5727 if ($missing eq $CODE_POINT
5728 && $format ne $HEX_FORMAT
5729 && ! defined $format{$addr}) # Is expected if was manually set
5731 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5733 $format{$addr} = $format;
5734 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5735 if ($missing eq $CODE_POINT) {
5736 $return .= ' # code point maps to itself';
5738 elsif ($missing eq "") {
5739 $return .= ' # code point maps to the null string';
5743 $return .= $pre_body;
5749 # Write the table to the file.
5752 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5754 my $addr; { no overloading; $addr = 0+$self; }
5756 return $self->SUPER::write(
5757 ($self->property == $block)
5758 ? 7 # block file needs more tab stops
5760 $default_map{$addr}); # don't write defaulteds
5763 # Accessors for the underlying list that should fail if locked.
5773 return if $self->carp_if_locked;
5774 return $self->_range_list->$sub(@_);
5777 } # End closure for Map_Table
5779 package Match_Table;
5780 use base '_Base_Table';
5782 # A Match table is one which is a list of all the code points that have
5783 # the same property and property value, for use in \p{property=value}
5784 # constructs in regular expressions. It adds very little data to the base
5785 # structure, but many methods, as these lists can be combined in many ways to
5787 # There are only a few concepts added:
5788 # 1) Equivalents and Relatedness.
5789 # Two tables can match the identical code points, but have different names.
5790 # This always happens when there is a perl single form extension
5791 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
5792 # tables are set to be related, with the Perl extension being a child, and
5793 # the Unicode property being the parent.
5795 # It may be that two tables match the identical code points and we don't
5796 # know if they are related or not. This happens most frequently when the
5797 # Block and Script properties have the exact range. But note that a
5798 # revision to Unicode could add new code points to the script, which would
5799 # now have to be in a different block (as the block was filled, or there
5800 # would have been 'Unknown' script code points in it and they wouldn't have
5801 # been identical). So we can't rely on any two properties from Unicode
5802 # always matching the same code points from release to release, and thus
5803 # these tables are considered coincidentally equivalent--not related. When
5804 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
5805 # 'leader', and the others are 'equivalents'. This concept is useful
5806 # to minimize the number of tables written out. Only one file is used for
5807 # any identical set of code points, with entries in Heavy.pl mapping all
5808 # the involved tables to it.
5810 # Related tables will always be identical; we set them up to be so. Thus
5811 # if the Unicode one is deprecated, the Perl one will be too. Not so for
5812 # unrelated tables. Relatedness makes generating the documentation easier.
5814 # 2) Conflicting. It may be that there will eventually be name clashes, with
5815 # the same name meaning different things. For a while, there actually were
5816 # conflicts, but they have so far been resolved by changing Perl's or
5817 # Unicode's definitions to match the other, but when this code was written,
5818 # it wasn't clear that that was what was going to happen. (Unicode changed
5819 # because of protests during their beta period.) Name clashes are warned
5820 # about during compilation, and the documentation. The generated tables
5821 # are sane, free of name clashes, because the code suppresses the Perl
5822 # version. But manual intervention to decide what the actual behavior
5823 # should be may be required should this happen. The introductory comments
5824 # have more to say about this.
5826 sub standardize { return main::standardize($_[0]); }
5827 sub trace { return main::trace(@_); }
5832 main::setup_package();
5835 # The leader table of this one; initially $self.
5836 main::set_access('leader', \%leader, 'r');
5839 # An array of any tables that have this one as their leader
5840 main::set_access('equivalents', \%equivalents, 'readable_array');
5843 # The parent table to this one, initially $self. This allows us to
5844 # distinguish between equivalent tables that are related, and those which
5845 # may not be, but share the same output file because they match the exact
5846 # same set of code points in the current Unicode release.
5847 main::set_access('parent', \%parent, 'r');
5850 # An array of any tables that have this one as their parent
5851 main::set_access('children', \%children, 'readable_array');
5854 # Array of any tables that would have the same name as this one with
5855 # a different meaning. This is used for the generated documentation.
5856 main::set_access('conflicting', \%conflicting, 'readable_array');
5859 # Set in the constructor for tables that are expected to match all code
5861 main::set_access('matches_all', \%matches_all, 'r');
5868 # The property for which this table is a listing of property values.
5869 my $property = delete $args{'_Property'};
5871 my $name = delete $args{'Name'};
5872 my $full_name = delete $args{'Full_Name'};
5873 $full_name = $name if ! defined $full_name;
5876 my $initialize = delete $args{'Initialize'};
5877 my $matches_all = delete $args{'Matches_All'} || 0;
5878 # Rest of parameters passed on.
5880 my $range_list = Range_List->new(Initialize => $initialize,
5881 Owner => $property);
5883 my $complete = $full_name;
5884 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
5885 # but this helps debug if it
5887 # The complete name for a match table includes it's property in a
5888 # compound form 'property=table', except if the property is the
5889 # pseudo-property, perl, in which case it is just the single form,
5890 # 'table' (If you change the '=' must also change the ':' in lots of
5891 # places in this program that assume an equal sign)
5892 $complete = $property->full_name . "=$complete" if $property != $perl;
5894 my $self = $class->SUPER::new(%args,
5896 Complete_Name => $complete,
5897 Full_Name => $full_name,
5898 _Property => $property,
5899 _Range_List => $range_list,
5901 my $addr; { no overloading; $addr = 0+$self; }
5903 $conflicting{$addr} = [ ];
5904 $equivalents{$addr} = [ ];
5905 $children{$addr} = [ ];
5906 $matches_all{$addr} = $matches_all;
5907 $leader{$addr} = $self;
5908 $parent{$addr} = $self;
5913 # See this program's beginning comment block about overloading these.
5916 qw("") => "_operator_stringify",
5920 return if $self->carp_if_locked;
5928 return $self->_range_list + $other;
5934 return $self->_range_list & $other;
5940 return if $self->carp_if_locked;
5942 my $addr; { no overloading; $addr = 0+$self; }
5946 # Change the range list of this table to be the
5948 $self->_set_range_list($self->_range_list
5951 else { # $other is just a simple value
5952 $self->add_range($other, $other);
5956 '-' => sub { my $self = shift;
5958 my $reversed = shift;
5961 Carp::my_carp_bug("Can't cope with a "
5963 . " being the first parameter in a '-'. Subtraction ignored.");
5967 return $self->_range_list - $other;
5969 '~' => sub { my $self = shift;
5970 return ~ $self->_range_list;
5974 sub _operator_stringify {
5977 my $name = $self->complete_name;
5978 return "Table '$name'";
5982 # Add a synonym for this table. See the comments in the base class
5986 # Rest of parameters passed on.
5988 $self->SUPER::add_alias($name, $self, @_);
5992 sub add_conflicting {
5993 # Add the name of some other object to the list of ones that name
5994 # clash with this match table.
5997 my $conflicting_name = shift; # The name of the conflicting object
5998 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
5999 my $conflicting_object = shift; # Optional, the conflicting object
6000 # itself. This is used to
6001 # disambiguate the text if the input
6002 # name is identical to any of the
6003 # aliases $self is known by.
6004 # Sometimes the conflicting object is
6005 # merely hypothetical, so this has to
6006 # be an optional parameter.
6007 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6009 my $addr; { no overloading; $addr = 0+$self; }
6011 # Check if the conflicting name is exactly the same as any existing
6012 # alias in this table (as long as there is a real object there to
6013 # disambiguate with).
6014 if (defined $conflicting_object) {
6015 foreach my $alias ($self->aliases) {
6016 if ($alias->name eq $conflicting_name) {
6018 # Here, there is an exact match. This results in
6019 # ambiguous comments, so disambiguate by changing the
6020 # conflicting name to its object's complete equivalent.
6021 $conflicting_name = $conflicting_object->complete_name;
6027 # Convert to the \p{...} final name
6028 $conflicting_name = "\\$p" . "{$conflicting_name}";
6031 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6033 push @{$conflicting{$addr}}, $conflicting_name;
6038 sub is_equivalent_to {
6039 # Return boolean of whether or not the other object is a table of this
6040 # type and has been marked equivalent to this one.
6044 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6046 return 0 if ! defined $other; # Can happen for incomplete early
6048 unless ($other->isa(__PACKAGE__)) {
6049 my $ref_other = ref $other;
6050 my $ref_self = ref $self;
6051 Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6055 # Two tables are equivalent if they have the same leader.
6057 return $leader{0+$self} == $leader{0+$other};
6061 sub matches_identically_to {
6062 # Return a boolean as to whether or not two tables match identical
6063 # sets of code points.
6067 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6069 unless ($other->isa(__PACKAGE__)) {
6070 my $ref_other = ref $other;
6071 my $ref_self = ref $self;
6072 Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6076 # These are ordered in increasing real time to figure out (at least
6077 # until a patch changes that and doesn't change this)
6078 return 0 if $self->max != $other->max;
6079 return 0 if $self->min != $other->min;
6080 return 0 if $self->range_count != $other->range_count;
6081 return 0 if $self->count != $other->count;
6083 # Here they could be identical because all the tests above passed.
6084 # The loop below is somewhat simpler since we know they have the same
6085 # number of elements. Compare range by range, until reach the end or
6086 # find something that differs.
6087 my @a_ranges = $self->_range_list->ranges;
6088 my @b_ranges = $other->_range_list->ranges;
6089 for my $i (0 .. @a_ranges - 1) {
6090 my $a = $a_ranges[$i];
6091 my $b = $b_ranges[$i];
6092 trace "self $a; other $b" if main::DEBUG && $to_trace;
6093 return 0 if $a->start != $b->start || $a->end != $b->end;
6098 sub set_equivalent_to {
6099 # Set $self equivalent to the parameter table.
6100 # The required Related => 'x' parameter is a boolean indicating
6101 # whether these tables are related or not. If related, $other becomes
6102 # the 'parent' of $self; if unrelated it becomes the 'leader'
6104 # Related tables share all characteristics except names; equivalents
6105 # not quite so many.
6106 # If they are related, one must be a perl extension. This is because
6107 # we can't guarantee that Unicode won't change one or the other in a
6108 # later release even if they are idential now.
6114 my $related = delete $args{'Related'};
6116 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6118 return if ! defined $other; # Keep on going; happens in some early
6121 if (! defined $related) {
6122 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6126 # If already are equivalent, no need to re-do it; if subroutine
6127 # returns null, it found an error, also do nothing
6128 my $are_equivalent = $self->is_equivalent_to($other);
6129 return if ! defined $are_equivalent || $are_equivalent;
6131 my $addr; { no overloading; $addr = 0+$self; }
6132 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6135 ! $other->perl_extension
6136 && ! $current_leader->perl_extension)
6138 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6142 my $leader; { no overloading; $leader = 0+$current_leader; }
6143 my $other_addr; { no overloading; $other_addr = 0+$other; }
6145 # Any tables that are equivalent to or children of this table must now
6146 # instead be equivalent to or (children) to the new leader (parent),
6147 # still equivalent. The equivalency includes their matches_all info,
6148 # and for related tables, their status
6149 # All related tables are of necessity equivalent, but the converse
6150 # isn't necessarily true
6151 my $status = $other->status;
6152 my $status_info = $other->status_info;
6153 my $matches_all = $matches_all{other_addr};
6154 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6155 next if $table == $other;
6156 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6158 my $table_addr; { no overloading; $table_addr = 0+$table; }
6159 $leader{$table_addr} = $other;
6160 $matches_all{$table_addr} = $matches_all;
6161 $self->_set_range_list($other->_range_list);
6162 push @{$equivalents{$other_addr}}, $table;
6164 $parent{$table_addr} = $other;
6165 push @{$children{$other_addr}}, $table;
6166 $table->set_status($status, $status_info);
6170 # Now that we've declared these to be equivalent, any changes to one
6171 # of the tables would invalidate that equivalency.
6177 sub add_range { # Add a range to the list for this table.
6179 # Rest of parameters passed on
6181 return if $self->carp_if_locked;
6182 return $self->_range_list->add_range(@_);
6185 sub pre_body { # Does nothing for match tables.
6189 sub append_to_body { # Does nothing for match tables.
6195 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6197 return $self->SUPER::write(2); # 2 tab stops
6200 sub set_final_comment {
6201 # This creates a comment for the file that is to hold the match table
6202 # $self. It is somewhat convoluted to make the English read nicely,
6203 # but, heh, it's just a comment.
6204 # This should be called only with the leader match table of all the
6205 # ones that share the same file. It lists all such tables, ordered so
6206 # that related ones are together.
6208 my $leader = shift; # Should only be called on the leader table of
6209 # an equivalent group
6210 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6212 my $addr; { no overloading; $addr = 0+$leader; }
6214 if ($leader{$addr} != $leader) {
6215 Carp::my_carp_bug(<<END
6216 set_final_comment() must be called on a leader table, which $leader is not.
6217 It is equivalent to $leader{$addr}. No comment created
6223 # Get the number of code points matched by each of the tables in this
6224 # file, and add underscores for clarity.
6225 my $count = $leader->count;
6226 my $string_count = main::clarify_number($count);
6228 my $loose_count = 0; # how many aliases loosely matched
6229 my $compound_name = ""; # ? Are any names compound?, and if so, an
6231 my $properties_with_compound_names = 0; # count of these
6234 my %flags; # The status flags used in the file
6235 my $total_entries = 0; # number of entries written in the comment
6236 my $matches_comment = ""; # The portion of the comment about the
6238 my @global_comments; # List of all the tables' comments that are
6239 # there before this routine was called.
6241 # Get list of all the parent tables that are equivalent to this one
6242 # (including itself).
6243 my @parents = grep { $parent{main::objaddr $_} == $_ }
6244 main::uniques($leader, @{$equivalents{$addr}});
6245 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6248 for my $parent (@parents) {
6250 my $property = $parent->property;
6252 # Special case 'N' tables in properties with two match tables when
6253 # the other is a 'Y' one. These are likely to be binary tables,
6254 # but not necessarily. In either case, \P{} will match the
6255 # complement of \p{}, and so if something is a synonym of \p, the
6256 # complement of that something will be the synonym of \P. This
6257 # would be true of any property with just two match tables, not
6258 # just those whose values are Y and N; but that would require a
6259 # little extra work, and there are none such so far in Unicode.
6260 my $perl_p = 'p'; # which is it? \p{} or \P{}
6261 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6263 if (scalar $property->tables == 2
6264 && $parent == $property->table('N')
6265 && defined (my $yes = $property->table('Y')))
6267 my $yes_addr; { no overloading; $yes_addr = 0+$yes; }
6269 = grep { $_->property == $perl }
6272 $parent{$yes_addr}->children);
6274 # But these synonyms are \P{} ,not \p{}
6278 my @description; # Will hold the table description
6279 my @note; # Will hold the table notes.
6280 my @conflicting; # Will hold the table conflicts.
6282 # Look at the parent, any yes synonyms, and all the children
6283 my $parent_addr; { no overloading; $parent_addr = 0+$parent; }
6284 for my $table ($parent,
6286 @{$children{$parent_addr}})
6288 my $table_addr; { no overloading; $table_addr = 0+$table; }
6289 my $table_property = $table->property;
6291 # Tables are separated by a blank line to create a grouping.
6292 $matches_comment .= "\n" if $matches_comment;
6294 # The table is named based on the property and value
6295 # combination it is for, like script=greek. But there may be
6296 # a number of synonyms for each side, like 'sc' for 'script',
6297 # and 'grek' for 'greek'. Any combination of these is a valid
6298 # name for this table. In this case, there are three more,
6299 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6300 # listing all possible combinations in the comment, we make
6301 # sure that each synonym occurs at least once, and add
6302 # commentary that the other combinations are possible.
6303 my @property_aliases = $table_property->aliases;
6304 my @table_aliases = $table->aliases;
6306 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6308 # The alias lists above are already ordered in the order we
6309 # want to output them. To ensure that each synonym is listed,
6310 # we must use the max of the two numbers.
6311 my $listed_combos = main::max(scalar @table_aliases,
6312 scalar @property_aliases);
6313 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6315 my $property_had_compound_name = 0;
6317 for my $i (0 .. $listed_combos - 1) {
6320 # The current alias for the property is the next one on
6321 # the list, or if beyond the end, start over. Similarly
6322 # for the table (\p{prop=table})
6323 my $property_alias = $property_aliases
6324 [$i % @property_aliases]->name;
6325 my $table_alias_object = $table_aliases
6326 [$i % @table_aliases];
6327 my $table_alias = $table_alias_object->name;
6328 my $loose_match = $table_alias_object->loose_match;
6330 if ($table_alias !~ /\D/) { # Clarify large numbers.
6331 $table_alias = main::clarify_number($table_alias)
6334 # Add a comment for this alias combination
6335 my $current_match_comment;
6336 if ($table_property == $perl) {
6337 $current_match_comment = "\\$perl_p"
6341 $current_match_comment
6342 = "\\p{$property_alias=$table_alias}";
6343 $property_had_compound_name = 1;
6346 # Flag any abnormal status for this table.
6347 my $flag = $property->status
6349 || $table_alias_object->status;
6351 if ($flag ne $PLACEHOLDER) {
6352 $flags{$flag} = $status_past_participles{$flag};
6354 $flags{$flag} = <<END;
6355 a placeholder because it is not in Version $string_version of Unicode, but is
6356 needed by the Perl core to work gracefully. Because it is not in this version
6357 of Unicode, it will not be listed in $pod_file.pod
6364 # Pretty up the comment. Note the \b; it says don't make
6365 # this line a continuation.
6366 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6369 $current_match_comment);
6370 } # End of generating the entries for this table.
6372 # Save these for output after this group of related tables.
6373 push @description, $table->description;
6374 push @note, $table->note;
6375 push @conflicting, $table->conflicting;
6377 # And this for output after all the tables.
6378 push @global_comments, $table->comment;
6380 # Compute an alternate compound name using the final property
6381 # synonym and the first table synonym with a colon instead of
6382 # the equal sign used elsewhere.
6383 if ($property_had_compound_name) {
6384 $properties_with_compound_names ++;
6385 if (! $compound_name || @property_aliases > 1) {
6386 $compound_name = $property_aliases[-1]->name
6388 . $table_aliases[0]->name;
6391 } # End of looping through all children of this table
6393 # Here have assembled in $matches_comment all the related tables
6394 # to the current parent (preceded by the same info for all the
6395 # previous parents). Put out information that applies to all of
6396 # the current family.
6399 # But output the conflicting information now, as it applies to
6401 my $conflicting = join ", ", @conflicting;
6403 $matches_comment .= <<END;
6405 Note that contrary to what you might expect, the above is NOT the same as
6407 $matches_comment .= "any of: " if @conflicting > 1;
6408 $matches_comment .= "$conflicting\n";
6412 $matches_comment .= "\n Meaning: "
6413 . join('; ', @description)
6417 $matches_comment .= "\n Note: "
6418 . join("\n ", @note)
6421 } # End of looping through all tables
6429 $code_points = 'single code point';
6433 $code_points = "$string_count code points";
6438 if ($total_entries <= 1) {
6441 $any_of_these = 'this'
6444 $synonyms = " any of the following regular expression constructs";
6445 $entries = 'entries';
6446 $any_of_these = 'any of these'
6450 if ($has_unrelated) {
6452 This file is for tables that are not necessarily related: To conserve
6453 resources, every table that matches the identical set of code points in this
6454 version of Unicode uses this file. Each one is listed in a separate group
6455 below. It could be that the tables will match the same set of code points in
6456 other Unicode releases, or it could be purely coincidence that they happen to
6457 be the same in Unicode $string_version, and hence may not in other versions.
6463 foreach my $flag (sort keys %flags) {
6465 '$flag' below means that this form is $flags{$flag}.
6467 next if $flag eq $PLACEHOLDER;
6468 $comment .= "Consult $pod_file.pod\n";
6474 This file returns the $code_points in Unicode Version $string_version that
6478 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
6479 including if adding or subtracting white space, underscore, and hyphen
6480 characters matters or doesn't matter, and other permissible syntactic
6481 variants. Upper/lower case distinctions never matter.
6484 if ($compound_name) {
6487 A colon can be substituted for the equals sign, and
6489 if ($properties_with_compound_names > 1) {
6491 within each group above,
6494 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6496 # Note the \b below, it says don't make that line a continuation.
6498 anything to the left of the equals (or colon) can be combined with anything to
6499 the right. Thus, for example,
6505 # And append any comment(s) from the actual tables. They are all
6506 # gathered here, so may not read all that well.
6507 if (@global_comments) {
6508 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6511 if ($count) { # The format differs if no code points, and needs no
6512 # explanation in that case
6515 The format of the lines of this file is:
6518 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6519 STOP is the ending point, or if omitted, the range has just one code point.
6521 if ($leader->output_range_counts) {
6523 Numbers in comments in [brackets] indicate how many code points are in the
6529 $leader->set_comment(main::join_lines($comment));
6533 # Accessors for the underlying list
6535 get_valid_code_point
6536 get_invalid_code_point
6544 return $self->_range_list->$sub(@_);
6547 } # End closure for Match_Table
6551 # The Property class represents a Unicode property, or the $perl
6552 # pseudo-property. It contains a map table initialized empty at construction
6553 # time, and for properties accessible through regular expressions, various
6554 # match tables, created through the add_match_table() method, and referenced
6555 # by the table('NAME') or tables() methods, the latter returning a list of all
6556 # of the match tables. Otherwise table operations implicitly are for the map
6559 # Most of the data in the property is actually about its map table, so it
6560 # mostly just uses that table's accessors for most methods. The two could
6561 # have been combined into one object, but for clarity because of their
6562 # differing semantics, they have been kept separate. It could be argued that
6563 # the 'file' and 'directory' fields should be kept with the map table.
6565 # Each property has a type. This can be set in the constructor, or in the
6566 # set_type accessor, but mostly it is figured out by the data. Every property
6567 # starts with unknown type, overridden by a parameter to the constructor, or
6568 # as match tables are added, or ranges added to the map table, the data is
6569 # inspected, and the type changed. After the table is mostly or entirely
6570 # filled, compute_type() should be called to finalize they analysis.
6572 # There are very few operations defined. One can safely remove a range from
6573 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
6574 # table to this one, replacing any in the intersection of the two.
6576 sub standardize { return main::standardize($_[0]); }
6577 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6581 # This hash will contain as keys, all the aliases of all properties, and
6582 # as values, pointers to their respective property objects. This allows
6583 # quick look-up of a property from any of its names.
6584 my %alias_to_property_of;
6586 sub dump_alias_to_property_of {
6589 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6594 # This is a package subroutine, not called as a method.
6595 # If the single parameter is a literal '*' it returns a list of all
6596 # defined properties.
6597 # Otherwise, the single parameter is a name, and it returns a pointer
6598 # to the corresponding property object, or undef if none.
6600 # Properties can have several different names. The 'standard' form of
6601 # each of them is stored in %alias_to_property_of as they are defined.
6602 # But it's possible that this subroutine will be called with some
6603 # variant, so if the initial lookup fails, it is repeated with the
6604 # standarized form of the input name. If found, besides returning the
6605 # result, the input name is added to the list so future calls won't
6606 # have to do the conversion again.
6610 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6612 if (! defined $name) {
6613 Carp::my_carp_bug("Undefined input property. No action taken.");
6617 return main::uniques(values %alias_to_property_of) if $name eq '*';
6619 # Return cached result if have it.
6620 my $result = $alias_to_property_of{$name};
6621 return $result if defined $result;
6623 # Convert the input to standard form.
6624 my $standard_name = standardize($name);
6626 $result = $alias_to_property_of{$standard_name};
6627 return unless defined $result; # Don't cache undefs
6629 # Cache the result before returning it.
6630 $alias_to_property_of{$name} = $result;
6635 main::setup_package();
6638 # A pointer to the map table object for this property
6639 main::set_access('map', \%map);
6642 # The property's full name. This is a duplicate of the copy kept in the
6643 # map table, but is needed because stringify needs it during
6644 # construction of the map table, and then would have a chicken before egg
6646 main::set_access('full_name', \%full_name, 'r');
6649 # This hash will contain as keys, all the aliases of any match tables
6650 # attached to this property, and as values, the pointers to their
6651 # respective tables. This allows quick look-up of a table from any of its
6653 main::set_access('table_ref', \%table_ref);
6656 # The type of the property, $ENUM, $BINARY, etc
6657 main::set_access('type', \%type, 'r');
6660 # The filename where the map table will go (if actually written).
6661 # Normally defaulted, but can be overridden.
6662 main::set_access('file', \%file, 'r', 's');
6665 # The directory where the map table will go (if actually written).
6666 # Normally defaulted, but can be overridden.
6667 main::set_access('directory', \%directory, 's');
6669 my %pseudo_map_type;
6670 # This is used to affect the calculation of the map types for all the
6671 # ranges in the table. It should be set to one of the values that signify
6672 # to alter the calculation.
6673 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6675 my %has_only_code_point_maps;
6676 # A boolean used to help in computing the type of data in the map table.
6677 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6680 # A list of the first few distinct mappings this property has. This is
6681 # used to disambiguate between binary and enum property types, so don't
6682 # have to keep more than three.
6683 main::set_access('unique_maps', \%unique_maps);
6686 # The only required parameter is the positionally first, name. All
6687 # other parameters are key => value pairs. See the documentation just
6688 # above for the meanings of the ones not passed directly on to the map
6689 # table constructor.
6692 my $name = shift || "";
6694 my $self = property_ref($name);
6695 if (defined $self) {
6696 my $options_string = join ", ", @_;
6697 $options_string = ". Ignoring options $options_string" if $options_string;
6698 Carp::my_carp("$self is already in use. Using existing one$options_string;");
6704 $self = bless \do { my $anonymous_scalar }, $class;
6705 my $addr; { no overloading; $addr = 0+$self; }
6707 $directory{$addr} = delete $args{'Directory'};
6708 $file{$addr} = delete $args{'File'};
6709 $full_name{$addr} = delete $args{'Full_Name'} || $name;
6710 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6711 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6712 # Rest of parameters passed on.
6714 $has_only_code_point_maps{$addr} = 1;
6715 $table_ref{$addr} = { };
6716 $unique_maps{$addr} = { };
6718 $map{$addr} = Map_Table->new($name,
6719 Full_Name => $full_name{$addr},
6720 _Alias_Hash => \%alias_to_property_of,
6726 # See this program's beginning comment block about overloading the copy
6727 # constructor. Few operations are defined on properties, but a couple are
6728 # useful. It is safe to take the inverse of a property, and to remove a
6729 # single code point from it.
6732 qw("") => "_operator_stringify",
6733 "." => \&main::_operator_dot,
6734 '==' => \&main::_operator_equal,
6735 '!=' => \&main::_operator_not_equal,
6736 '=' => sub { return shift },
6737 '-=' => "_minus_and_equal",
6740 sub _operator_stringify {
6741 return "Property '" . shift->full_name . "'";
6744 sub _minus_and_equal {
6745 # Remove a single code point from the map table of a property.
6749 my $reversed = shift;
6750 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6753 Carp::my_carp_bug("Can't cope with a "
6755 . " argument to '-='. Subtraction ignored.");
6758 elsif ($reversed) { # Shouldnt happen in a -=, but just in case
6759 Carp::my_carp_bug("Can't cope with a "
6761 . " being the first parameter in a '-='. Subtraction ignored.");
6766 $map{0+$self}->delete_range($other, $other);
6771 sub add_match_table {
6772 # Add a new match table for this property, with name given by the
6773 # parameter. It returns a pointer to the table.
6779 my $addr; { no overloading; $addr = 0+$self; }
6781 my $table = $table_ref{$addr}{$name};
6782 my $standard_name = main::standardize($name);
6784 || (defined ($table = $table_ref{$addr}{$standard_name})))
6786 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
6787 $table_ref{$addr}{$name} = $table;
6792 # See if this is a perl extension, if not passed in.
6793 my $perl_extension = delete $args{'Perl_Extension'};
6795 = $self->perl_extension if ! defined $perl_extension;
6797 $table = Match_Table->new(
6799 Perl_Extension => $perl_extension,
6800 _Alias_Hash => $table_ref{$addr},
6803 # gets property's status by default
6804 Status => $self->status,
6805 _Status_Info => $self->status_info,
6807 Internal_Only_Warning => 1); # Override any
6809 return unless defined $table;
6812 # Save the names for quick look up
6813 $table_ref{$addr}{$standard_name} = $table;
6814 $table_ref{$addr}{$name} = $table;
6816 # Perhaps we can figure out the type of this property based on the
6817 # fact of adding this match table. First, string properties don't
6818 # have match tables; second, a binary property can't have 3 match
6820 if ($type{$addr} == $UNKNOWN) {
6821 $type{$addr} = $NON_STRING;
6823 elsif ($type{$addr} == $STRING) {
6824 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
6825 $type{$addr} = $NON_STRING;
6827 elsif ($type{$addr} != $ENUM) {
6828 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6829 && $type{$addr} == $BINARY)
6831 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
6832 $type{$addr} = $ENUM;
6840 # Return a pointer to the match table (with name given by the
6841 # parameter) associated with this property; undef if none.
6845 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6847 my $addr; { no overloading; $addr = 0+$self; }
6849 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6851 # If quick look-up failed, try again using the standard form of the
6852 # input name. If that succeeds, cache the result before returning so
6853 # won't have to standardize this input name again.
6854 my $standard_name = main::standardize($name);
6855 return unless defined $table_ref{$addr}{$standard_name};
6857 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6858 return $table_ref{$addr}{$name};
6862 # Return a list of pointers to all the match tables attached to this
6866 return main::uniques(values %{$table_ref{0+shift}});
6870 # Returns the directory the map table for this property should be
6871 # output in. If a specific directory has been specified, that has
6872 # priority; 'undef' is returned if the type isn't defined;
6873 # or $map_directory for everything else.
6875 my $addr; { no overloading; $addr = 0+shift; }
6877 return $directory{$addr} if defined $directory{$addr};
6878 return undef if $type{$addr} == $UNKNOWN;
6879 return $map_directory;
6883 # Return the name that is used to both:
6884 # 1) Name the file that the map table is written to.
6885 # 2) The name of swash related stuff inside that file.
6886 # The reason for this is that the Perl core historically has used
6887 # certain names that aren't the same as the Unicode property names.
6888 # To continue using these, $file is hard-coded in this file for those,
6889 # but otherwise the standard name is used. This is different from the
6890 # external_name, so that the rest of the files, like in lib can use
6891 # the standard name always, without regard to historical precedent.
6894 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6896 my $addr; { no overloading; $addr = 0+$self; }
6898 return $file{$addr} if defined $file{$addr};
6899 return $map{$addr}->external_name;
6902 sub to_create_match_tables {
6903 # Returns a boolean as to whether or not match tables should be
6904 # created for this property.
6907 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6909 # The whole point of this pseudo property is match tables.
6910 return 1 if $self == $perl;
6912 my $addr; { no overloading; $addr = 0+$self; }
6914 # Don't generate tables of code points that match the property values
6915 # of a string property. Such a list would most likely have many
6916 # property values, each with just one or very few code points mapping
6918 return 0 if $type{$addr} == $STRING;
6920 # Don't generate anything for unimplemented properties.
6921 return 0 if grep { $self->complete_name eq $_ }
6922 @unimplemented_properties;
6927 sub property_add_or_replace_non_nulls {
6928 # This adds the mappings in the property $other to $self. Non-null
6929 # mappings from $other override those in $self. It essentially merges
6930 # the two properties, with the second having priority except for null
6935 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6937 if (! $other->isa(__PACKAGE__)) {
6938 Carp::my_carp_bug("$other should be a "
6947 return $map{0+$self}->map_add_or_replace_non_nulls($map{0+$other});
6951 # Set the type of the property. Mostly this is figured out by the
6952 # data in the table. But this is used to set it explicitly. The
6953 # reason it is not a standard accessor is that when setting a binary
6954 # property, we need to make sure that all the true/false aliases are
6955 # present, as they were omitted in early Unicode releases.
6959 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6961 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6962 Carp::my_carp("Unrecognized type '$type'. Type not set");
6966 { no overloading; $type{0+$self} = $type; }
6967 return if $type != $BINARY;
6969 my $yes = $self->table('Y');
6970 $yes = $self->table('Yes') if ! defined $yes;
6971 $yes = $self->add_match_table('Y') if ! defined $yes;
6972 $yes->add_alias('Yes');
6973 $yes->add_alias('T');
6974 $yes->add_alias('True');
6976 my $no = $self->table('N');
6977 $no = $self->table('No') if ! defined $no;
6978 $no = $self->add_match_table('N') if ! defined $no;
6979 $no->add_alias('No');
6980 $no->add_alias('F');
6981 $no->add_alias('False');
6986 # Add a map to the property's map table. This also keeps
6987 # track of the maps so that the property type can be determined from
6991 my $start = shift; # First code point in range
6992 my $end = shift; # Final code point in range
6993 my $map = shift; # What the range maps to.
6994 # Rest of parameters passed on.
6996 my $addr; { no overloading; $addr = 0+$self; }
6998 # If haven't the type of the property, gather information to figure it
7000 if ($type{$addr} == $UNKNOWN) {
7002 # If the map contains an interior blank or dash, or most other
7003 # nonword characters, it will be a string property. This
7004 # heuristic may actually miss some string properties. If so, they
7005 # may need to have explicit set_types called for them. This
7006 # happens in the Unihan properties.
7007 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7008 || $map =~ / [^\w.\/\ -] /x)
7010 $self->set_type($STRING);
7012 # $unique_maps is used for disambiguating between ENUM and
7013 # BINARY later; since we know the property is not going to be
7014 # one of those, no point in keeping the data around
7015 undef $unique_maps{$addr};
7019 # Not necessarily a string. The final decision has to be
7020 # deferred until all the data are in. We keep track of if all
7021 # the values are code points for that eventual decision.
7022 $has_only_code_point_maps{$addr} &=
7023 $map =~ / ^ $code_point_re $/x;
7025 # For the purposes of disambiguating between binary and other
7026 # enumerations at the end, we keep track of the first three
7027 # distinct property values. Once we get to three, we know
7028 # it's not going to be binary, so no need to track more.
7029 if (scalar keys %{$unique_maps{$addr}} < 3) {
7030 $unique_maps{$addr}{main::standardize($map)} = 1;
7035 # Add the mapping by calling our map table's method
7036 return $map{$addr}->add_map($start, $end, $map, @_);
7040 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7041 # should be called after the property is mostly filled with its maps.
7042 # We have been keeping track of what the property values have been,
7043 # and now have the necessary information to figure out the type.
7046 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7048 my $addr; { no overloading; $addr = 0+$self; }
7050 my $type = $type{$addr};
7052 # If already have figured these out, no need to do so again, but we do
7053 # a double check on ENUMS to make sure that a string property hasn't
7054 # improperly been classified as an ENUM, so continue on with those.
7055 return if $type == $STRING || $type == $BINARY;
7057 # If every map is to a code point, is a string property.
7058 if ($type == $UNKNOWN
7059 && ($has_only_code_point_maps{$addr}
7060 || (defined $map{$addr}->default_map
7061 && $map{$addr}->default_map eq "")))
7063 $self->set_type($STRING);
7067 # Otherwise, it is to some sort of enumeration. (The case where
7068 # it is a Unicode miscellaneous property, and treated like a
7069 # string in this program is handled in add_map()). Distinguish
7070 # between binary and some other enumeration type. Of course, if
7071 # there are more than two values, it's not binary. But more
7072 # subtle is the test that the default mapping is defined means it
7073 # isn't binary. This in fact may change in the future if Unicode
7074 # changes the way its data is structured. But so far, no binary
7075 # properties ever have @missing lines for them, so the default map
7076 # isn't defined for them. The few properties that are two-valued
7077 # and aren't considered binary have the default map defined
7078 # starting in Unicode 5.0, when the @missing lines appeared; and
7079 # this program has special code to put in a default map for them
7080 # for earlier than 5.0 releases.
7082 || scalar keys %{$unique_maps{$addr}} > 2
7083 || defined $self->default_map)
7085 my $tables = $self->tables;
7086 my $count = $self->count;
7087 if ($verbosity && $count > 500 && $tables/$count > .1) {
7088 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7090 $self->set_type($ENUM);
7093 $self->set_type($BINARY);
7096 undef $unique_maps{$addr}; # Garbage collect
7100 # Most of the accessors for a property actually apply to its map table.
7101 # Setup up accessor functions for those, referring to %map
7148 # 'property' above is for symmetry, so that one can take
7149 # the property of a property and get itself, and so don't
7150 # have to distinguish between properties and tables in
7158 return $map{0+$self}->$sub(@_);
7168 # Returns lines of the input joined together, so that they can be folded
7170 # This causes continuation lines to be joined together into one long line
7171 # for folding. A continuation line is any line that doesn't begin with a
7172 # space or "\b" (the latter is stripped from the output). This is so
7173 # lines can be be in a HERE document so as to fit nicely in the terminal
7174 # width, but be joined together in one long line, and then folded with
7175 # indents, '#' prefixes, etc, properly handled.
7176 # A blank separates the joined lines except if there is a break; an extra
7177 # blank is inserted after a period ending a line.
7179 # Intialize the return with the first line.
7180 my ($return, @lines) = split "\n", shift;
7182 # If the first line is null, it was an empty line, add the \n back in
7183 $return = "\n" if $return eq "";
7185 # Now join the remainder of the physical lines.
7186 for my $line (@lines) {
7188 # An empty line means wanted a blank line, so add two \n's to get that
7189 # effect, and go to the next line.
7190 if (length $line == 0) {
7195 # Look at the last character of what we have so far.
7196 my $previous_char = substr($return, -1, 1);
7198 # And at the next char to be output.
7199 my $next_char = substr($line, 0, 1);
7201 if ($previous_char ne "\n") {
7203 # Here didn't end wth a nl. If the next char a blank or \b, it
7204 # means that here there is a break anyway. So add a nl to the
7206 if ($next_char eq " " || $next_char eq "\b") {
7207 $previous_char = "\n";
7208 $return .= $previous_char;
7211 # Add an extra space after periods.
7212 $return .= " " if $previous_char eq '.';
7215 # Here $previous_char is still the latest character to be output. If
7216 # it isn't a nl, it means that the next line is to be a continuation
7217 # line, with a blank inserted between them.
7218 $return .= " " if $previous_char ne "\n";
7221 substr($line, 0, 1) = "" if $next_char eq "\b";
7223 # And append this next line.
7230 sub simple_fold($;$$$) {
7231 # Returns a string of the input (string or an array of strings) folded
7232 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7234 # This is tailored for the kind of text written by this program,
7235 # especially the pod file, which can have very long names with
7236 # underscores in the middle, or words like AbcDefgHij.... We allow
7237 # breaking in the middle of such constructs if the line won't fit
7238 # otherwise. The break in such cases will come either just after an
7239 # underscore, or just before one of the Capital letters.
7241 local $to_trace = 0 if main::DEBUG;
7244 my $prefix = shift; # Optional string to prepend to each output
7246 $prefix = "" unless defined $prefix;
7248 my $hanging_indent = shift; # Optional number of spaces to indent
7249 # continuation lines
7250 $hanging_indent = 0 unless $hanging_indent;
7252 my $right_margin = shift; # Optional number of spaces to narrow the
7254 $right_margin = 0 unless defined $right_margin;
7256 # Call carp with the 'nofold' option to avoid it from trying to call us
7258 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7260 # The space available doesn't include what's automatically prepended
7261 # to each line, or what's reserved on the right.
7262 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7263 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7265 if (DEBUG && $hanging_indent >= $max) {
7266 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7267 $hanging_indent = 0;
7270 # First, split into the current physical lines.
7272 if (ref $line) { # Better be an array, because not bothering to
7274 foreach my $line (@{$line}) {
7275 push @line, split /\n/, $line;
7279 @line = split /\n/, $line;
7282 #local $to_trace = 1 if main::DEBUG;
7283 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7285 # Look at each current physical line.
7286 for (my $i = 0; $i < @line; $i++) {
7287 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7288 #local $to_trace = 1 if main::DEBUG;
7289 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7291 # Remove prefix, because will be added back anyway, don't want
7293 $line[$i] =~ s/^$prefix//;
7295 # Remove trailing space
7296 $line[$i] =~ s/\s+\Z//;
7298 # If the line is too long, fold it.
7299 if (length $line[$i] > $max) {
7302 # Here needs to fold. Save the leading space in the line for
7304 $line[$i] =~ /^ ( \s* )/x;
7305 my $leading_space = $1;
7306 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7308 # If character at final permissible position is white space,
7309 # fold there, which will delete that white space
7310 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7311 $remainder = substr($line[$i], $max);
7312 $line[$i] = substr($line[$i], 0, $max - 1);
7316 # Otherwise fold at an acceptable break char closest to
7317 # the max length. Look at just the maximal initial
7318 # segment of the line
7319 my $segment = substr($line[$i], 0, $max - 1);
7321 /^ ( .{$hanging_indent} # Don't look before the
7323 \ * # Don't look in leading
7324 # blanks past the indent
7325 [^ ] .* # Find the right-most
7326 (?: # acceptable break:
7327 [ \s = ] # space or equal
7328 | - (?! [.0-9] ) # or non-unary minus.
7329 ) # $1 includes the character
7332 # Split into the initial part that fits, and remaining
7334 $remainder = substr($line[$i], length $1);
7336 trace $line[$i] if DEBUG && $to_trace;
7337 trace $remainder if DEBUG && $to_trace;
7340 # If didn't find a good breaking spot, see if there is a
7341 # not-so-good breaking spot. These are just after
7342 # underscores or where the case changes from lower to
7343 # upper. Use \a as a soft hyphen, but give up
7344 # and don't break the line if there is actually a \a
7345 # already in the input. We use an ascii character for the
7346 # soft-hyphen to avoid any attempt by miniperl to try to
7347 # access the files that this program is creating.
7348 elsif ($segment !~ /\a/
7349 && ($segment =~ s/_/_\a/g
7350 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7352 # Here were able to find at least one place to insert
7353 # our substitute soft hyphen. Find the right-most one
7354 # and replace it by a real hyphen.
7355 trace $segment if DEBUG && $to_trace;
7357 rindex($segment, "\a"),
7360 # Then remove the soft hyphen substitutes.
7361 $segment =~ s/\a//g;
7362 trace $segment if DEBUG && $to_trace;
7364 # And split into the initial part that fits, and
7365 # remainder of the line
7366 my $pos = rindex($segment, '-');
7367 $remainder = substr($line[$i], $pos);
7368 trace $remainder if DEBUG && $to_trace;
7369 $line[$i] = substr($segment, 0, $pos + 1);
7373 # Here we know if we can fold or not. If we can, $remainder
7374 # is what remains to be processed in the next iteration.
7375 if (defined $remainder) {
7376 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7378 # Insert the folded remainder of the line as a new element
7379 # of the array. (It may still be too long, but we will
7380 # deal with that next time through the loop.) Omit any
7381 # leading space in the remainder.
7382 $remainder =~ s/^\s+//;
7383 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7385 # But then indent by whichever is larger of:
7386 # 1) the leading space on the input line;
7387 # 2) the hanging indent.
7388 # This preserves indentation in the original line.
7389 my $lead = ($leading_space)
7390 ? length $leading_space
7392 $lead = max($lead, $hanging_indent);
7393 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7397 # Ready to output the line. Get rid of any trailing space
7398 # And prefix by the required $prefix passed in.
7399 $line[$i] =~ s/\s+$//;
7400 $line[$i] = "$prefix$line[$i]\n";
7401 } # End of looping through all the lines.
7403 return join "", @line;
7406 sub property_ref { # Returns a reference to a property object.
7407 return Property::property_ref(@_);
7410 sub force_unlink ($) {
7411 my $filename = shift;
7412 return unless file_exists($filename);
7413 return if CORE::unlink($filename);
7415 # We might need write permission
7416 chmod 0777, $filename;
7417 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
7422 # Given a filename and a reference to an array of lines, write the lines
7424 # Filename can be given as an arrayref of directory names
7427 my $lines_ref = shift;
7428 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7430 if (! defined $lines_ref) {
7431 Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;");
7435 # Get into a single string if an array, and get rid of, in Unix terms, any
7437 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7438 $file = File::Spec->canonpath($file);
7440 # If has directories, make sure that they all exist
7441 (undef, my $directories, undef) = File::Spec->splitpath($file);
7442 File::Path::mkpath($directories) if $directories && ! -d $directories;
7444 push @files_actually_output, $file;
7448 $text = join "", @$lines_ref;
7452 Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7455 force_unlink ($file);
7458 if (not open $OUT, ">", $file) {
7459 Carp::my_carp("can't open $file for output. Skipping this file: $!");
7462 print "$file written.\n" if $verbosity >= $VERBOSE;
7470 sub Standardize($) {
7471 # This converts the input name string into a standardized equivalent to
7475 unless (defined $name) {
7476 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
7480 # Remove any leading or trailing white space
7484 # Convert interior white space and hypens into underscores.
7485 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7487 # Capitalize the letter following an underscore, and convert a sequence of
7488 # multiple underscores to a single one
7489 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7491 # And capitalize the first letter, but not for the special cjk ones.
7492 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7496 sub standardize ($) {
7497 # Returns a lower-cased standardized name, without underscores. This form
7498 # is chosen so that it can distinguish between any real versus superficial
7499 # Unicode name differences. It relies on the fact that Unicode doesn't
7500 # have interior underscores, white space, nor dashes in any
7501 # stricter-matched name. It should not be used on Unicode code point
7502 # names (the Name property), as they mostly, but not always follow these
7505 my $name = Standardize(shift);
7506 return if !defined $name;
7508 $name =~ s/ (?<= .) _ (?= . ) //xg;
7514 my $indent_increment = " " x 2;
7517 $main::simple_dumper_nesting = 0;
7520 # Like Simple Data::Dumper. Good enough for our needs. We can't use
7521 # the real thing as we have to run under miniperl.
7523 # It is designed so that on input it is at the beginning of a line,
7524 # and the final thing output in any call is a trailing ",\n".
7528 $indent = "" if ! defined $indent;
7530 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7532 # nesting level is localized, so that as the call stack pops, it goes
7533 # back to the prior value.
7534 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7535 undef %already_output if $main::simple_dumper_nesting == 0;
7536 $main::simple_dumper_nesting++;
7537 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7539 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7541 # Determine the indent for recursive calls.
7542 my $next_indent = $indent . $indent_increment;
7547 # Dump of scalar: just output it in quotes if not a number. To do
7548 # so we must escape certain characters, and therefore need to
7549 # operate on a copy to avoid changing the original
7551 $copy = $UNDEF unless defined $copy;
7553 # Quote non-numbers (numbers also have optional leading '-' and
7555 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7557 # Escape apostrophe and backslash
7558 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7561 $output = "$indent$copy,\n";
7565 # Keep track of cycles in the input, and refuse to infinitely loop
7566 my $addr; { no overloading; $addr = 0+$item; }
7567 if (defined $already_output{$addr}) {
7568 return "${indent}ALREADY OUTPUT: $item\n";
7570 $already_output{$addr} = $item;
7572 if (ref $item eq 'ARRAY') {
7575 if ($main::simple_dumper_nesting > 1) {
7577 $using_brackets = 1;
7580 $using_brackets = 0;
7583 # If the array is empty, put the closing bracket on the same
7584 # line. Otherwise, recursively add each array element
7590 for (my $i = 0; $i < @$item; $i++) {
7592 # Indent array elements one level
7593 $output .= &simple_dumper($item->[$i], $next_indent);
7594 $output =~ s/\n$//; # Remove trailing nl so as to
7595 $output .= " # [$i]\n"; # add a comment giving the
7598 $output .= $indent; # Indent closing ']' to orig level
7600 $output .= ']' if $using_brackets;
7603 elsif (ref $item eq 'HASH') {
7608 # No surrounding braces at top level
7610 if ($main::simple_dumper_nesting > 1) {
7613 $body_indent = $next_indent;
7614 $next_indent .= $indent_increment;
7619 $body_indent = $indent;
7623 # Output hashes sorted alphabetically instead of apparently
7624 # random. Use caseless alphabetic sort
7625 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7627 if ($is_first_line) {
7631 $output .= "$body_indent";
7634 # The key must be a scalar, but this recursive call quotes
7636 $output .= &simple_dumper($key);
7638 # And change the trailing comma and nl to the hash fat
7639 # comma for clarity, and so the value can be on the same
7641 $output =~ s/,\n$/ => /;
7643 # Recursively call to get the value's dump.
7644 my $next = &simple_dumper($item->{$key}, $next_indent);
7646 # If the value is all on one line, remove its indent, so
7647 # will follow the => immediately. If it takes more than
7648 # one line, start it on a new line.
7649 if ($next !~ /\n.*\n/) {
7658 $output .= "$indent},\n" if $using_braces;
7660 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7661 $output = $indent . ref($item) . "\n";
7662 # XXX see if blessed
7664 elsif ($item->can('dump')) {
7666 # By convention in this program, objects furnish a 'dump'
7667 # method. Since not doing any output at this level, just pass
7668 # on the input indent
7669 $output = $item->dump($indent);
7672 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
7679 sub dump_inside_out {
7680 # Dump inside-out hashes in an object's state by converting them to a
7681 # regular hash and then calling simple_dumper on that.
7684 my $fields_ref = shift;
7685 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7687 my $addr; { no overloading; $addr = 0+$object; }
7690 foreach my $key (keys %$fields_ref) {
7691 $hash{$key} = $fields_ref->{$key}{$addr};
7694 return simple_dumper(\%hash, @_);
7698 # Overloaded '.' method that is common to all packages. It uses the
7699 # package's stringify method.
7703 my $reversed = shift;
7704 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7706 $other = "" unless defined $other;
7708 foreach my $which (\$self, \$other) {
7709 next unless ref $$which;
7710 if ($$which->can('_operator_stringify')) {
7711 $$which = $$which->_operator_stringify;
7714 my $ref = ref $$which;
7715 my $addr; { no overloading; $addr = 0+$$which; }
7716 $$which = "$ref ($addr)";
7724 sub _operator_equal {
7725 # Generic overloaded '==' routine. To be equal, they must be the exact
7731 return 0 unless defined $other;
7732 return 0 unless ref $other;
7734 return 0+$self == 0+$other;
7737 sub _operator_not_equal {
7741 return ! _operator_equal($self, $other);
7744 sub process_PropertyAliases($) {
7745 # This reads in the PropertyAliases.txt file, which contains almost all
7746 # the character properties in Unicode and their equivalent aliases:
7747 # scf ; Simple_Case_Folding ; sfc
7749 # Field 0 is the preferred short name for the property.
7750 # Field 1 is the full name.
7751 # Any succeeding ones are other accepted names.
7754 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7756 # This whole file was non-existent in early releases, so use our own
7758 $file->insert_lines(get_old_property_aliases())
7759 if ! -e 'PropertyAliases.txt';
7761 # Add any cjk properties that may have been defined.
7762 $file->insert_lines(@cjk_properties);
7764 while ($file->next_line) {
7766 my @data = split /\s*;\s*/;
7768 my $full = $data[1];
7770 my $this = Property->new($data[0], Full_Name => $full);
7772 # Start looking for more aliases after these two.
7773 for my $i (2 .. @data - 1) {
7774 $this->add_alias($data[$i]);
7781 sub finish_property_setup {
7782 # Finishes setting up after PropertyAliases.
7785 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7787 # This entry was missing from this file in earlier Unicode versions
7788 if (-e 'Jamo.txt') {
7789 my $jsn = property_ref('JSN');
7790 if (! defined $jsn) {
7791 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7795 # This entry is still missing as of 5.2, perhaps because no short name for
7797 if (-e 'NameAliases.txt') {
7798 my $aliases = property_ref('Name_Alias');
7799 if (! defined $aliases) {
7800 $aliases = Property->new('Name_Alias');
7804 # These are used so much, that we set globals for them.
7805 $gc = property_ref('General_Category');
7806 $block = property_ref('Block');
7808 # Perl adds this alias.
7809 $gc->add_alias('Category');
7811 # For backwards compatibility, these property files have particular names.
7812 my $upper = property_ref('Uppercase_Mapping');
7813 $upper->set_core_access('uc()');
7814 $upper->set_file('Upper'); # This is what utf8.c calls it
7816 my $lower = property_ref('Lowercase_Mapping');
7817 $lower->set_core_access('lc()');
7818 $lower->set_file('Lower');
7820 my $title = property_ref('Titlecase_Mapping');
7821 $title->set_core_access('ucfirst()');
7822 $title->set_file('Title');
7824 my $fold = property_ref('Case_Folding');
7825 $fold->set_file('Fold') if defined $fold;
7827 # utf8.c can't currently cope with non range-size-1 for these, and even if
7828 # it were changed to do so, someone else may be using them, expecting the
7830 foreach my $property (qw {
7837 property_ref($property)->set_range_size_1(1);
7840 # These two properties aren't actually used in the core, but unfortunately
7841 # the names just above that are in the core interfere with these, so
7842 # choose different names. These aren't a problem unless the map tables
7843 # for these files get written out.
7844 my $lowercase = property_ref('Lowercase');
7845 $lowercase->set_file('IsLower') if defined $lowercase;
7846 my $uppercase = property_ref('Uppercase');
7847 $uppercase->set_file('IsUpper') if defined $uppercase;
7849 # Set up the hard-coded default mappings, but only on properties defined
7851 foreach my $property (keys %default_mapping) {
7852 my $property_object = property_ref($property);
7853 next if ! defined $property_object;
7854 my $default_map = $default_mapping{$property};
7855 $property_object->set_default_map($default_map);
7857 # A map of <code point> implies the property is string.
7858 if ($property_object->type == $UNKNOWN
7859 && $default_map eq $CODE_POINT)
7861 $property_object->set_type($STRING);
7865 # The following use the Multi_Default class to create objects for
7868 # Bidi class has a complicated default, but the derived file takes care of
7869 # the complications, leaving just 'L'.
7870 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7871 property_ref('Bidi_Class')->set_default_map('L');
7876 # The derived file was introduced in 3.1.1. The values below are
7877 # taken from table 3-8, TUS 3.0
7879 'my $default = Range_List->new;
7880 $default->add_range(0x0590, 0x05FF);
7881 $default->add_range(0xFB1D, 0xFB4F);'
7884 # The defaults apply only to unassigned characters
7885 $default_R .= '$gc->table("Cn") & $default;';
7887 if ($v_version lt v3.0.0) {
7888 $default = Multi_Default->new(R => $default_R, 'L');
7892 # AL apparently not introduced until 3.0: TUS 2.x references are
7893 # not on-line to check it out
7895 'my $default = Range_List->new;
7896 $default->add_range(0x0600, 0x07BF);
7897 $default->add_range(0xFB50, 0xFDFF);
7898 $default->add_range(0xFE70, 0xFEFF);'
7901 # Non-character code points introduced in this release; aren't AL
7902 if ($v_version ge 3.1.0) {
7903 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7905 $default_AL .= '$gc->table("Cn") & $default';
7906 $default = Multi_Default->new(AL => $default_AL,
7910 property_ref('Bidi_Class')->set_default_map($default);
7913 # Joining type has a complicated default, but the derived file takes care
7914 # of the complications, leaving just 'U' (or Non_Joining), except the file
7916 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7917 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7918 property_ref('Joining_Type')->set_default_map('Non_Joining');
7922 # Otherwise, there are not one, but two possibilities for the
7923 # missing defaults: T and U.
7924 # The missing defaults that evaluate to T are given by:
7925 # T = Mn + Cf - ZWNJ - ZWJ
7926 # where Mn and Cf are the general category values. In other words,
7927 # any non-spacing mark or any format control character, except
7928 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7929 # WIDTH JOINER (joining type C).
7930 my $default = Multi_Default->new(
7931 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7933 property_ref('Joining_Type')->set_default_map($default);
7937 # Line break has a complicated default in early releases. It is 'Unknown'
7938 # for non-assigned code points; 'AL' for assigned.
7939 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7940 my $lb = property_ref('Line_Break');
7941 if ($v_version gt 3.2.0) {
7942 $lb->set_default_map('Unknown');
7945 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7947 $lb->set_default_map($default);
7950 # If has the URS property, make sure that the standard aliases are in
7951 # it, since not in the input tables in some versions.
7952 my $urs = property_ref('Unicode_Radical_Stroke');
7954 $urs->add_alias('cjkRSUnicode');
7955 $urs->add_alias('kRSUnicode');
7961 sub get_old_property_aliases() {
7962 # Returns what would be in PropertyAliases.txt if it existed in very old
7963 # versions of Unicode. It was derived from the one in 3.2, and pared
7964 # down based on the data that was actually in the older releases.
7965 # An attempt was made to use the existence of files to mean inclusion or
7966 # not of various aliases, but if this was not sufficient, using version
7967 # numbers was resorted to.
7971 # These are to be used in all versions (though some are constructed by
7972 # this program if missing)
7973 push @return, split /\n/, <<'END';
7975 Bidi_M ; Bidi_Mirrored
7977 ccc ; Canonical_Combining_Class
7978 dm ; Decomposition_Mapping
7979 dt ; Decomposition_Type
7980 gc ; General_Category
7982 lc ; Lowercase_Mapping
7984 na1 ; Unicode_1_Name
7987 sfc ; Simple_Case_Folding
7988 slc ; Simple_Lowercase_Mapping
7989 stc ; Simple_Titlecase_Mapping
7990 suc ; Simple_Uppercase_Mapping
7991 tc ; Titlecase_Mapping
7992 uc ; Uppercase_Mapping
7995 if (-e 'Blocks.txt') {
7996 push @return, "blk ; Block\n";
7998 if (-e 'ArabicShaping.txt') {
7999 push @return, split /\n/, <<'END';
8004 if (-e 'PropList.txt') {
8006 # This first set is in the original old-style proplist.
8007 push @return, split /\n/, <<'END';
8009 Bidi_C ; Bidi_Control
8017 Join_C ; Join_Control
8019 QMark ; Quotation_Mark
8020 Term ; Terminal_Punctuation
8021 WSpace ; White_Space
8023 # The next sets were added later
8024 if ($v_version ge v3.0.0) {
8025 push @return, split /\n/, <<'END';
8030 if ($v_version ge v3.0.1) {
8031 push @return, split /\n/, <<'END';
8032 NChar ; Noncharacter_Code_Point
8035 # The next sets were added in the new-style
8036 if ($v_version ge v3.1.0) {
8037 push @return, split /\n/, <<'END';
8038 OAlpha ; Other_Alphabetic
8039 OLower ; Other_Lowercase
8041 OUpper ; Other_Uppercase
8044 if ($v_version ge v3.1.1) {
8045 push @return, "AHex ; ASCII_Hex_Digit\n";
8048 if (-e 'EastAsianWidth.txt') {
8049 push @return, "ea ; East_Asian_Width\n";
8051 if (-e 'CompositionExclusions.txt') {
8052 push @return, "CE ; Composition_Exclusion\n";
8054 if (-e 'LineBreak.txt') {
8055 push @return, "lb ; Line_Break\n";
8057 if (-e 'BidiMirroring.txt') {
8058 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8060 if (-e 'Scripts.txt') {
8061 push @return, "sc ; Script\n";
8063 if (-e 'DNormalizationProps.txt') {
8064 push @return, split /\n/, <<'END';
8065 Comp_Ex ; Full_Composition_Exclusion
8066 FC_NFKC ; FC_NFKC_Closure
8067 NFC_QC ; NFC_Quick_Check
8068 NFD_QC ; NFD_Quick_Check
8069 NFKC_QC ; NFKC_Quick_Check
8070 NFKD_QC ; NFKD_Quick_Check
8071 XO_NFC ; Expands_On_NFC
8072 XO_NFD ; Expands_On_NFD
8073 XO_NFKC ; Expands_On_NFKC
8074 XO_NFKD ; Expands_On_NFKD
8077 if (-e 'DCoreProperties.txt') {
8078 push @return, split /\n/, <<'END';
8083 # These can also appear in some versions of PropList.txt
8084 push @return, "Lower ; Lowercase\n"
8085 unless grep { $_ =~ /^Lower\b/} @return;
8086 push @return, "Upper ; Uppercase\n"
8087 unless grep { $_ =~ /^Upper\b/} @return;
8090 # This flag requires the DAge.txt file to be copied into the directory.
8091 if (DEBUG && $compare_versions) {
8092 push @return, 'age ; Age';
8098 sub process_PropValueAliases {
8099 # This file contains values that properties look like:
8100 # bc ; AL ; Arabic_Letter
8101 # blk; n/a ; Greek_And_Coptic ; Greek
8103 # Field 0 is the property.
8104 # Field 1 is the short name of a property value or 'n/a' if no
8105 # short name exists;
8106 # Field 2 is the full property value name;
8107 # Any other fields are more synonyms for the property value.
8108 # Purely numeric property values are omitted from the file; as are some
8109 # others, fewer and fewer in later releases
8111 # Entries for the ccc property have an extra field before the
8113 # ccc; 0; NR ; Not_Reordered
8114 # It is the numeric value that the names are synonyms for.
8116 # There are comment entries for values missing from this file:
8117 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8118 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8121 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8123 # This whole file was non-existent in early releases, so use our own
8124 # internal one if necessary.
8125 if (! -e 'PropValueAliases.txt') {
8126 $file->insert_lines(get_old_property_value_aliases());
8129 # Add any explicit cjk values
8130 $file->insert_lines(@cjk_property_values);
8132 # This line is used only for testing the code that checks for name
8133 # conflicts. There is a script Inherited, and when this line is executed
8134 # it causes there to be a name conflict with the 'Inherited' that this
8135 # program generates for this block property value
8136 #$file->insert_lines('blk; n/a; Herited');
8139 # Process each line of the file ...
8140 while ($file->next_line) {
8142 my ($property, @data) = split /\s*;\s*/;
8144 # The full name for the ccc property value is in field 2 of the
8145 # remaining ones; field 1 for all other properties. Swap ccc fields 1
8146 # and 2. (Rightmost splice removes field 2, returning it; left splice
8147 # inserts that into field 1, thus shifting former field 1 to field 2.)
8148 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8150 # If there is no short name, use the full one in element 1
8151 $data[0] = $data[1] if $data[0] eq "n/a";
8153 # Earlier releases had the pseudo property 'qc' that should expand to
8154 # the ones that replace it below.
8155 if ($property eq 'qc') {
8156 if (lc $data[0] eq 'y') {
8157 $file->insert_lines('NFC_QC; Y ; Yes',
8163 elsif (lc $data[0] eq 'n') {
8164 $file->insert_lines('NFC_QC; N ; No',
8170 elsif (lc $data[0] eq 'm') {
8171 $file->insert_lines('NFC_QC; M ; Maybe',
8172 'NFKC_QC; M ; Maybe',
8176 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8181 # The first field is the short name, 2nd is the full one.
8182 my $property_object = property_ref($property);
8183 my $table = $property_object->add_match_table($data[0],
8184 Full_Name => $data[1]);
8186 # Start looking for more aliases after these two.
8187 for my $i (2 .. @data - 1) {
8188 $table->add_alias($data[$i]);
8190 } # End of looping through the file
8192 # As noted in the comments early in the program, it generates tables for
8193 # the default values for all releases, even those for which the concept
8194 # didn't exist at the time. Here we add those if missing.
8195 my $age = property_ref('age');
8196 if (defined $age && ! defined $age->table('Unassigned')) {
8197 $age->add_match_table('Unassigned');
8199 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8200 && ! defined $block->table('No_Block');
8203 # Now set the default mappings of the properties from the file. This is
8204 # done after the loop because a number of properties have only @missings
8205 # entries in the file, and may not show up until the end.
8206 my @defaults = $file->get_missings;
8207 foreach my $default_ref (@defaults) {
8208 my $default = $default_ref->[0];
8209 my $property = property_ref($default_ref->[1]);
8210 $property->set_default_map($default);
8215 sub get_old_property_value_aliases () {
8216 # Returns what would be in PropValueAliases.txt if it existed in very old
8217 # versions of Unicode. It was derived from the one in 3.2, and pared
8218 # down. An attempt was made to use the existence of files to mean
8219 # inclusion or not of various aliases, but if this was not sufficient,
8220 # using version numbers was resorted to.
8222 my @return = split /\n/, <<'END';
8223 bc ; AN ; Arabic_Number
8224 bc ; B ; Paragraph_Separator
8225 bc ; CS ; Common_Separator
8226 bc ; EN ; European_Number
8227 bc ; ES ; European_Separator
8228 bc ; ET ; European_Terminator
8229 bc ; L ; Left_To_Right
8230 bc ; ON ; Other_Neutral
8231 bc ; R ; Right_To_Left
8232 bc ; WS ; White_Space
8234 # The standard combining classes are very much different in v1, so only use
8235 # ones that look right (not checked thoroughly)
8236 ccc; 0; NR ; Not_Reordered
8237 ccc; 1; OV ; Overlay
8239 ccc; 8; KV ; Kana_Voicing
8241 ccc; 202; ATBL ; Attached_Below_Left
8242 ccc; 216; ATAR ; Attached_Above_Right
8243 ccc; 218; BL ; Below_Left
8245 ccc; 222; BR ; Below_Right
8247 ccc; 228; AL ; Above_Left
8249 ccc; 232; AR ; Above_Right
8250 ccc; 234; DA ; Double_Above
8252 dt ; can ; canonical
8266 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8268 gc ; Cn ; Unassigned
8269 gc ; Co ; Private_Use
8270 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8271 gc ; LC ; Cased_Letter # Ll | Lt | Lu
8272 gc ; Ll ; Lowercase_Letter
8273 gc ; Lm ; Modifier_Letter
8274 gc ; Lo ; Other_Letter
8275 gc ; Lu ; Uppercase_Letter
8276 gc ; M ; Mark # Mc | Me | Mn
8277 gc ; Mc ; Spacing_Mark
8278 gc ; Mn ; Nonspacing_Mark
8279 gc ; N ; Number # Nd | Nl | No
8280 gc ; Nd ; Decimal_Number
8281 gc ; No ; Other_Number
8282 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8283 gc ; Pd ; Dash_Punctuation
8284 gc ; Pe ; Close_Punctuation
8285 gc ; Po ; Other_Punctuation
8286 gc ; Ps ; Open_Punctuation
8287 gc ; S ; Symbol # Sc | Sk | Sm | So
8288 gc ; Sc ; Currency_Symbol
8289 gc ; Sm ; Math_Symbol
8290 gc ; So ; Other_Symbol
8291 gc ; Z ; Separator # Zl | Zp | Zs
8292 gc ; Zl ; Line_Separator
8293 gc ; Zp ; Paragraph_Separator
8294 gc ; Zs ; Space_Separator
8302 if (-e 'ArabicShaping.txt') {
8303 push @return, split /\n/, <<'END';
8310 jg ; n/a ; NO_JOINING_GROUP
8318 jt ; C ; Join_Causing
8319 jt ; D ; Dual_Joining
8320 jt ; L ; Left_Joining
8321 jt ; R ; Right_Joining
8322 jt ; U ; Non_Joining
8323 jt ; T ; Transparent
8325 if ($v_version ge v3.0.0) {
8326 push @return, split /\n/, <<'END';
8330 jg ; n/a ; DALATH_RISH
8333 jg ; n/a ; FINAL_SEMKATH
8336 jg ; n/a ; HAMZA_ON_HEH_GOAL
8343 jg ; n/a ; KNOTTED_HEH
8350 jg ; n/a ; REVERSED_PE
8354 jg ; n/a ; SWASH_KAF
8356 jg ; n/a ; TEH_MARBUTA
8359 jg ; n/a ; YEH_BARREE
8360 jg ; n/a ; YEH_WITH_TAIL
8369 if (-e 'EastAsianWidth.txt') {
8370 push @return, split /\n/, <<'END';
8380 if (-e 'LineBreak.txt') {
8381 push @return, split /\n/, <<'END';
8383 lb ; AL ; Alphabetic
8384 lb ; B2 ; Break_Both
8385 lb ; BA ; Break_After
8386 lb ; BB ; Break_Before
8387 lb ; BK ; Mandatory_Break
8388 lb ; CB ; Contingent_Break
8389 lb ; CL ; Close_Punctuation
8390 lb ; CM ; Combining_Mark
8391 lb ; CR ; Carriage_Return
8392 lb ; EX ; Exclamation
8395 lb ; ID ; Ideographic
8396 lb ; IN ; Inseperable
8397 lb ; IS ; Infix_Numeric
8399 lb ; NS ; Nonstarter
8401 lb ; OP ; Open_Punctuation
8402 lb ; PO ; Postfix_Numeric
8403 lb ; PR ; Prefix_Numeric
8405 lb ; SA ; Complex_Context
8408 lb ; SY ; Break_Symbols
8414 if (-e 'DNormalizationProps.txt') {
8415 push @return, split /\n/, <<'END';
8422 if (-e 'Scripts.txt') {
8423 push @return, split /\n/, <<'END';
8425 sc ; Armn ; Armenian
8427 sc ; Bopo ; Bopomofo
8428 sc ; Cans ; Canadian_Aboriginal
8429 sc ; Cher ; Cherokee
8430 sc ; Cyrl ; Cyrillic
8431 sc ; Deva ; Devanagari
8433 sc ; Ethi ; Ethiopic
8434 sc ; Geor ; Georgian
8437 sc ; Gujr ; Gujarati
8438 sc ; Guru ; Gurmukhi
8442 sc ; Hira ; Hiragana
8443 sc ; Ital ; Old_Italic
8444 sc ; Kana ; Katakana
8449 sc ; Mlym ; Malayalam
8450 sc ; Mong ; Mongolian
8454 sc ; Qaai ; Inherited
8468 if ($v_version ge v2.0.0) {
8469 push @return, split /\n/, <<'END';
8473 dt ; vert ; vertical
8478 gc ; Lt ; Titlecase_Letter
8479 gc ; Me ; Enclosing_Mark
8480 gc ; Nl ; Letter_Number
8481 gc ; Pc ; Connector_Punctuation
8482 gc ; Sk ; Modifier_Symbol
8485 if ($v_version ge v2.1.2) {
8486 push @return, "bc ; S ; Segment_Separator\n";
8488 if ($v_version ge v2.1.5) {
8489 push @return, split /\n/, <<'END';
8490 gc ; Pf ; Final_Punctuation
8491 gc ; Pi ; Initial_Punctuation
8494 if ($v_version ge v2.1.8) {
8495 push @return, "ccc; 240; IS ; Iota_Subscript\n";
8498 if ($v_version ge v3.0.0) {
8499 push @return, split /\n/, <<'END';
8500 bc ; AL ; Arabic_Letter
8501 bc ; BN ; Boundary_Neutral
8502 bc ; LRE ; Left_To_Right_Embedding
8503 bc ; LRO ; Left_To_Right_Override
8504 bc ; NSM ; Nonspacing_Mark
8505 bc ; PDF ; Pop_Directional_Format
8506 bc ; RLE ; Right_To_Left_Embedding
8507 bc ; RLO ; Right_To_Left_Override
8509 ccc; 233; DB ; Double_Below
8513 if ($v_version ge v3.1.0) {
8514 push @return, "ccc; 226; R ; Right\n";
8521 # This is used to store the range list of all the code points usable when
8522 # the little used $compare_versions feature is enabled.
8523 my $compare_versions_range_list;
8525 sub process_generic_property_file {
8526 # This processes a file containing property mappings and puts them
8527 # into internal map tables. It should be used to handle any property
8528 # files that have mappings from a code point or range thereof to
8529 # something else. This means almost all the UCD .txt files.
8530 # each_line_handlers() should be set to adjust the lines of these
8531 # files, if necessary, to what this routine understands:
8536 # the fields are: "codepoint range ; property; map"
8538 # meaning the codepoints in the range all have the value 'map' under
8540 # Beginning and trailing white space in each field are not signficant.
8541 # Note there is not a trailing semi-colon in the above. A trailing
8542 # semi-colon means the map is a null-string. An omitted map, as
8543 # opposed to a null-string, is assumed to be 'Y', based on Unicode
8544 # table syntax. (This could have been hidden from this routine by
8545 # doing it in the $file object, but that would require parsing of the
8546 # line there, so would have to parse it twice, or change the interface
8547 # to pass this an array. So not done.)
8549 # The map field may begin with a sequence of commands that apply to
8550 # this range. Each such command begins and ends with $CMD_DELIM.
8551 # These are used to indicate, for example, that the mapping for a
8552 # range has a non-default type.
8554 # This loops through the file, calling it's next_line() method, and
8555 # then taking the map and adding it to the property's table.
8556 # Complications arise because any number of properties can be in the
8557 # file, in any order, interspersed in any way. The first time a
8558 # property is seen, it gets information about that property and
8559 # caches it for quick retrieval later. It also normalizes the maps
8560 # so that only one of many synonym is stored. The Unicode input files
8561 # do use some multiple synonyms.
8564 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8566 my %property_info; # To keep track of what properties
8567 # have already had entries in the
8568 # current file, and info about each,
8569 # so don't have to recompute.
8570 my $property_name; # property currently being worked on
8571 my $property_type; # and its type
8572 my $previous_property_name = ""; # name from last time through loop
8573 my $property_object; # pointer to the current property's
8575 my $property_addr; # the address of that object
8576 my $default_map; # the string that code points missing
8577 # from the file map to
8578 my $default_table; # For non-string properties, a
8579 # reference to the match table that
8580 # will contain the list of code
8581 # points that map to $default_map.
8583 # Get the next real non-comment line
8585 while ($file->next_line) {
8587 # Default replacement type; means that if parts of the range have
8588 # already been stored in our tables, the new map overrides them if
8589 # they differ more than cosmetically
8590 my $replace = $IF_NOT_EQUIVALENT;
8591 my $map_type; # Default type for the map of this range
8593 #local $to_trace = 1 if main::DEBUG;
8594 trace $_ if main::DEBUG && $to_trace;
8596 # Split the line into components
8597 my ($range, $property_name, $map, @remainder)
8598 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8600 # If more or less on the line than we are expecting, warn and skip
8603 $file->carp_bad_line('Extra fields');
8606 elsif ( ! defined $property_name) {
8607 $file->carp_bad_line('Missing property');
8611 # Examine the range.
8612 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8614 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8618 my $high = (defined $2) ? hex $2 : $low;
8620 # For the very specialized case of comparing two Unicode
8622 if (DEBUG && $compare_versions) {
8623 if ($property_name eq 'Age') {
8625 # Only allow code points at least as old as the version
8627 my $age = pack "C*", split(/\./, $map); # v string
8628 next LINE if $age gt $compare_versions;
8632 # Again, we throw out code points younger than those of
8633 # the specified version. By now, the Age property is
8634 # populated. We use the intersection of each input range
8635 # with this property to find what code points in it are
8636 # valid. To do the intersection, we have to convert the
8637 # Age property map to a Range_list. We only have to do
8639 if (! defined $compare_versions_range_list) {
8640 my $age = property_ref('Age');
8641 if (! -e 'DAge.txt') {
8642 croak "Need to have 'DAge.txt' file to do version comparison";
8644 elsif ($age->count == 0) {
8645 croak "The 'Age' table is empty, but its file exists";
8647 $compare_versions_range_list
8648 = Range_List->new(Initialize => $age);
8651 # An undefined map is always 'Y'
8652 $map = 'Y' if ! defined $map;
8654 # Calculate the intersection of the input range with the
8655 # code points that are known in the specified version
8656 my @ranges = ($compare_versions_range_list
8657 & Range->new($low, $high))->ranges;
8659 # If the intersection is empty, throw away this range
8660 next LINE unless @ranges;
8662 # Only examine the first range this time through the loop.
8663 my $this_range = shift @ranges;
8665 # Put any remaining ranges in the queue to be processed
8666 # later. Note that there is unnecessary work here, as we
8667 # will do the intersection again for each of these ranges
8668 # during some future iteration of the LINE loop, but this
8669 # code is not used in production. The later intersections
8670 # are guaranteed to not splinter, so this will not become
8672 my $line = join ';', $property_name, $map;
8673 foreach my $range (@ranges) {
8674 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8680 # And process the first range, like any other.
8681 $low = $this_range->start;
8682 $high = $this_range->end;
8684 } # End of $compare_versions
8686 # If changing to a new property, get the things constant per
8688 if ($previous_property_name ne $property_name) {
8690 $property_object = property_ref($property_name);
8691 if (! defined $property_object) {
8692 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
8695 { no overloading; $property_addr = 0+($property_object); }
8697 # Defer changing names until have a line that is acceptable
8698 # (the 'next' statement above means is unacceptable)
8699 $previous_property_name = $property_name;
8701 # If not the first time for this property, retrieve info about
8703 if (defined ($property_info{$property_addr}{'type'})) {
8704 $property_type = $property_info{$property_addr}{'type'};
8705 $default_map = $property_info{$property_addr}{'default'};
8707 = $property_info{$property_addr}{'pseudo_map_type'};
8709 = $property_info{$property_addr}{'default_table'};
8713 # Here, is the first time for this property. Set up the
8715 $property_type = $property_info{$property_addr}{'type'}
8716 = $property_object->type;
8718 = $property_info{$property_addr}{'pseudo_map_type'}
8719 = $property_object->pseudo_map_type;
8721 # The Unicode files are set up so that if the map is not
8722 # defined, it is a binary property
8723 if (! defined $map && $property_type != $BINARY) {
8724 if ($property_type != $UNKNOWN
8725 && $property_type != $NON_STRING)
8727 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
8730 $property_object->set_type($BINARY);
8732 = $property_info{$property_addr}{'type'}
8737 # Get any @missings default for this property. This
8738 # should precede the first entry for the property in the
8739 # input file, and is located in a comment that has been
8740 # stored by the Input_file class until we access it here.
8741 # It's possible that there is more than one such line
8742 # waiting for us; collect them all, and parse
8743 my @missings_list = $file->get_missings
8744 if $file->has_missings_defaults;
8745 foreach my $default_ref (@missings_list) {
8746 my $default = $default_ref->[0];
8747 my $addr; { no overloading; $addr = 0+property_ref($default_ref->[1]); }
8749 # For string properties, the default is just what the
8750 # file says, but non-string properties should already
8751 # have set up a table for the default property value;
8752 # use the table for these, so can resolve synonyms
8753 # later to a single standard one.
8754 if ($property_type == $STRING
8755 || $property_type == $UNKNOWN)
8757 $property_info{$addr}{'missings'} = $default;
8760 $property_info{$addr}{'missings'}
8761 = $property_object->table($default);
8765 # Finished storing all the @missings defaults in the input
8766 # file so far. Get the one for the current property.
8767 my $missings = $property_info{$property_addr}{'missings'};
8769 # But we likely have separately stored what the default
8770 # should be. (This is to accommodate versions of the
8771 # standard where the @missings lines are absent or
8772 # incomplete.) Hopefully the two will match. But check
8774 $default_map = $property_object->default_map;
8776 # If the map is a ref, it means that the default won't be
8777 # processed until later, so undef it, so next few lines
8778 # will redefine it to something that nothing will match
8779 undef $default_map if ref $default_map;
8781 # Create a $default_map if don't have one; maybe a dummy
8782 # that won't match anything.
8783 if (! defined $default_map) {
8785 # Use any @missings line in the file.
8786 if (defined $missings) {
8787 if (ref $missings) {
8788 $default_map = $missings->full_name;
8789 $default_table = $missings;
8792 $default_map = $missings;
8795 # And store it with the property for outside use.
8796 $property_object->set_default_map($default_map);
8800 # Neither an @missings nor a default map. Create
8801 # a dummy one, so won't have to test definedness
8803 $default_map = '_Perl This will never be in a file
8808 # Here, we have $default_map defined, possibly in terms of
8809 # $missings, but maybe not, and possibly is a dummy one.
8810 if (defined $missings) {
8812 # Make sure there is no conflict between the two.
8813 # $missings has priority.
8814 if (ref $missings) {
8816 = $property_object->table($default_map);
8817 if (! defined $default_table
8818 || $default_table != $missings)
8820 if (! defined $default_table) {
8821 $default_table = $UNDEF;
8823 $file->carp_bad_line(<<END
8824 The \@missings line for $property_name in $file says that missings default to
8825 $missings, but we expect it to be $default_table. $missings used.
8828 $default_table = $missings;
8829 $default_map = $missings->full_name;
8831 $property_info{$property_addr}{'default_table'}
8834 elsif ($default_map ne $missings) {
8835 $file->carp_bad_line(<<END
8836 The \@missings line for $property_name in $file says that missings default to
8837 $missings, but we expect it to be $default_map. $missings used.
8840 $default_map = $missings;
8844 $property_info{$property_addr}{'default'}
8847 # If haven't done so already, find the table corresponding
8848 # to this map for non-string properties.
8849 if (! defined $default_table
8850 && $property_type != $STRING
8851 && $property_type != $UNKNOWN)
8853 $default_table = $property_info{$property_addr}
8855 = $property_object->table($default_map);
8857 } # End of is first time for this property
8858 } # End of switching properties.
8860 # Ready to process the line.
8861 # The Unicode files are set up so that if the map is not defined,
8862 # it is a binary property with value 'Y'
8863 if (! defined $map) {
8868 # If the map begins with a special command to us (enclosed in
8869 # delimiters), extract the command(s).
8870 if (substr($map, 0, 1) eq $CMD_DELIM) {
8871 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8873 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
8876 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
8880 $file->carp_bad_line("Unknown command line: '$1'");
8887 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8890 # Here, we have a map to a particular code point, and the
8891 # default map is to a code point itself. If the range
8892 # includes the particular code point, change that portion of
8893 # the range to the default. This makes sure that in the final
8894 # table only the non-defaults are listed.
8895 my $decimal_map = hex $map;
8896 if ($low <= $decimal_map && $decimal_map <= $high) {
8898 # If the range includes stuff before or after the map
8899 # we're changing, split it and process the split-off parts
8901 if ($low < $decimal_map) {
8902 $file->insert_adjusted_lines(
8903 sprintf("%04X..%04X; %s; %s",
8909 if ($high > $decimal_map) {
8910 $file->insert_adjusted_lines(
8911 sprintf("%04X..%04X; %s; %s",
8917 $low = $high = $decimal_map;
8922 # If we can tell that this is a synonym for the default map, use
8923 # the default one instead.
8924 if ($property_type != $STRING
8925 && $property_type != $UNKNOWN)
8927 my $table = $property_object->table($map);
8928 if (defined $table && $table == $default_table) {
8929 $map = $default_map;
8933 # And figure out the map type if not known.
8934 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8935 if ($map eq "") { # Nulls are always $NULL map type
8937 } # Otherwise, non-strings, and those that don't allow
8938 # $MULTI_CP, and those that aren't multiple code points are
8941 (($property_type != $STRING && $property_type != $UNKNOWN)
8942 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8943 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
8948 $map_type = $MULTI_CP;
8952 $property_object->add_map($low, $high,
8955 Replace => $replace);
8956 } # End of loop through file's lines
8962 # XXX Unused until revise charnames;
8963 #sub check_and_handle_compound_name {
8964 # This looks at Name properties for parenthesized components and splits
8965 # them off. Thus it finds FF as an equivalent to Form Feed.
8966 # my $code_point = shift;
8968 # if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
8969 # #local $to_trace = 1 if main::DEBUG;
8970 # trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
8971 # push @more_Names, "$code_point; $1";
8972 # push @more_Names, "$code_point; $3";
8973 # Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'. Proceeding and assuming it was there;") if $2 ne " ";
8974 # Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'. Proceeding and ignoring that;") if $4 ne "";
8979 { # Closure for UnicodeData.txt handling
8981 # This file was the first one in the UCD; its design leads to some
8982 # awkwardness in processing. Here is a sample line:
8983 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
8984 # The fields in order are:
8985 my $i = 0; # The code point is in field 0, and is shifted off.
8986 my $NAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
8987 my $CATEGORY = $i++; # category (e.g. "Lu")
8988 my $CCC = $i++; # Canonical combining class (e.g. "230")
8989 my $BIDI = $i++; # directional class (e.g. "L")
8990 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
8991 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
8992 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
8993 # Dual-use in this program; see below
8994 my $NUMERIC = $i++; # numeric value
8995 my $MIRRORED = $i++; # ? mirrored
8996 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
8997 my $COMMENT = $i++; # iso comment
8998 my $UPPER = $i++; # simple uppercase mapping
8999 my $LOWER = $i++; # simple lowercase mapping
9000 my $TITLE = $i++; # simple titlecase mapping
9001 my $input_field_count = $i;
9003 # This routine in addition outputs these extra fields:
9004 my $DECOMP_TYPE = $i++; # Decomposition type
9005 my $DECOMP_MAP = $i++; # Must be last; another decomposition mapping
9006 my $last_field = $i - 1;
9008 # All these are read into an array for each line, with the indices defined
9009 # above. The empty fields in the example line above indicate that the
9010 # value is defaulted. The handler called for each line of the input
9011 # changes these to their defaults.
9013 # Here are the official names of the properties, in a parallel array:
9015 $field_names[$BIDI] = 'Bidi_Class';
9016 $field_names[$CATEGORY] = 'General_Category';
9017 $field_names[$CCC] = 'Canonical_Combining_Class';
9018 $field_names[$COMMENT] = 'ISO_Comment';
9019 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9020 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9021 $field_names[$LOWER] = 'Lowercase_Mapping';
9022 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9023 $field_names[$NAME] = 'Name';
9024 $field_names[$NUMERIC] = 'Numeric_Value';
9025 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9026 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9027 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9028 $field_names[$TITLE] = 'Titlecase_Mapping';
9029 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9030 $field_names[$UPPER] = 'Uppercase_Mapping';
9032 # Some of these need a little more explanation. The $PERL_DECIMAL_DIGIT
9033 # field does not lead to an official Unicode property, but is used in
9034 # calculating the Numeric_Type. Perl however, creates a file from this
9035 # field, so a Perl property is created from it. Similarly, the Other
9036 # Digit field is used only for calculating the Numeric_Type, and so it can
9037 # be safely re-used as the place to store the value for Numeric_Type;
9038 # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT. The input field
9039 # named $PERL_DECOMPOSITION is a combination of both the decomposition
9040 # mapping and its type. Perl creates a file containing exactly this
9041 # field, so it is used for that. The two properties are separated into
9042 # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9044 # This file is processed like most in this program. Control is passed to
9045 # process_generic_property_file() which calls filter_UnicodeData_line()
9046 # for each input line. This filter converts the input into line(s) that
9047 # process_generic_property_file() understands. There is also a setup
9048 # routine called before any of the file is processed, and a handler for
9049 # EOF processing, all in this closure.
9051 # A huge speed-up occurred at the cost of some added complexity when these
9052 # routines were altered to buffer the outputs into ranges. Almost all the
9053 # lines of the input file apply to just one code point, and for most
9054 # properties, the map for the next code point up is the same as the
9055 # current one. So instead of creating a line for each property for each
9056 # input line, filter_UnicodeData_line() remembers what the previous map
9057 # of a property was, and doesn't generate a line to pass on until it has
9058 # to, as when the map changes; and that passed-on line encompasses the
9059 # whole contiguous range of code points that have the same map for that
9060 # property. This means a slight amount of extra setup, and having to
9061 # flush these buffers on EOF, testing if the maps have changed, plus
9062 # remembering state information in the closure. But it means a lot less
9063 # real time in not having to change the data base for each property on
9066 # Another complication is that there are already a few ranges designated
9067 # in the input. There are two lines for each, with the same maps except
9068 # the code point and name on each line. This was actually the hardest
9069 # thing to design around. The code points in those ranges may actually
9070 # have real maps not given by these two lines. These maps will either
9071 # be algorthimically determinable, or in the extracted files furnished
9072 # with the UCD. In the event of conflicts between these extracted files,
9073 # and this one, Unicode says that this one prevails. But it shouldn't
9074 # prevail for conflicts that occur in these ranges. The data from the
9075 # extracted files prevails in those cases. So, this program is structured
9076 # so that those files are processed first, storing maps. Then the other
9077 # files are processed, generally overwriting what the extracted files
9078 # stored. But just the range lines in this input file are processed
9079 # without overwriting. This is accomplished by adding a special string to
9080 # the lines output to tell process_generic_property_file() to turn off the
9081 # overwriting for just this one line.
9082 # A similar mechanism is used to tell it that the map is of a non-default
9085 sub setup_UnicodeData { # Called before any lines of the input are read
9087 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9089 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9090 Directory => File::Spec->curdir(),
9091 File => 'Decomposition',
9092 Format => $STRING_FORMAT,
9093 Internal_Only_Warning => 1,
9094 Perl_Extension => 1,
9095 Default_Map => $CODE_POINT,
9097 # normalize.pm can't cope with these
9098 Output_Range_Counts => 0,
9100 # This is a specially formatted table
9101 # explicitly for normalize.pm, which
9102 # is expecting a particular format,
9103 # which means that mappings containing
9104 # multiple code points are in the main
9106 Map_Type => $COMPUTE_NO_MULTI_CP,
9109 $Perl_decomp->add_comment(join_lines(<<END
9110 This mapping is a combination of the Unicode 'Decomposition_Type' and
9111 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9112 identical to the official Unicode 'Decomposition_Mapping' property except for
9114 1) It omits the algorithmically determinable Hangul syllable decompositions,
9115 which normalize.pm handles algorithmically.
9116 2) It contains the decomposition type as well. Non-canonical decompositions
9117 begin with a word in angle brackets, like <super>, which denotes the
9118 compatible decomposition type. If the map does not begin with the <angle
9119 brackets>, the decomposition is canonical.
9123 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9125 Perl_Extension => 1,
9126 File => 'Digit', # Trad. location
9127 Directory => $map_directory,
9131 $Decimal_Digit->add_comment(join_lines(<<END
9132 This file gives the mapping of all code points which represent a single
9133 decimal digit [0-9] to their respective digits. For example, the code point
9134 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9135 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9140 # This property is not used for generating anything else, and is
9141 # usually not output. By making it last in the list, we can just
9142 # change the high end of the loop downwards to avoid the work of
9143 # generating a table that is just going to get thrown away.
9144 if (! property_ref('Decomposition_Mapping')->to_output_map) {
9150 my $first_time = 1; # ? Is this the first line of the file
9151 my $in_range = 0; # ? Are we in one of the file's ranges
9152 my $previous_cp; # hex code point of previous line
9153 my $decimal_previous_cp = -1; # And its decimal equivalent
9154 my @start; # For each field, the current starting
9155 # code point in hex for the range
9156 # being accumulated.
9157 my @fields; # The input fields;
9158 my @previous_fields; # And those from the previous call
9160 sub filter_UnicodeData_line {
9161 # Handle a single input line from UnicodeData.txt; see comments above
9162 # Conceptually this takes a single line from the file containing N
9163 # properties, and converts it into N lines with one property per line,
9164 # which is what the final handler expects. But there are
9165 # complications due to the quirkiness of the input file, and to save
9166 # time, it accumulates ranges where the property values don't change
9167 # and only emits lines when necessary. This is about an order of
9168 # magnitude fewer lines emitted.
9171 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9173 # $_ contains the input line.
9174 # -1 in split means retain trailing null fields
9175 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9177 #local $to_trace = 1 if main::DEBUG;
9178 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9179 if (@fields > $input_field_count) {
9180 $file->carp_bad_line('Extra fields');
9185 my $decimal_cp = hex $cp;
9187 # We have to output all the buffered ranges when the next code point
9188 # is not exactly one after the previous one, which means there is a
9189 # gap in the ranges.
9190 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9192 # The decomposition mapping field requires special handling. It looks
9195 # <compat> 0032 0020
9198 # The decomposition type is enclosed in <brackets>; if missing, it
9199 # means the type is canonical. There are two decomposition mapping
9200 # tables: the one for use by Perl's normalize.pm has a special format
9201 # which is this field intact; the other, for general use is of
9202 # standard format. In either case we have to find the decomposition
9203 # type. Empty fields have None as their type, and map to the code
9205 if ($fields[$PERL_DECOMPOSITION] eq "") {
9206 $fields[$DECOMP_TYPE] = 'None';
9207 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9210 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9211 =~ / < ( .+? ) > \s* ( .+ ) /x;
9212 if (! defined $fields[$DECOMP_TYPE]) {
9213 $fields[$DECOMP_TYPE] = 'Canonical';
9214 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9217 $fields[$DECOMP_MAP] = $map;
9221 # The 3 numeric fields also require special handling. The 2 digit
9222 # fields must be either empty or match the number field. This means
9223 # that if it is empty, they must be as well, and the numeric type is
9224 # None, and the numeric value is 'Nan'.
9225 # The decimal digit field must be empty or match the other digit
9226 # field. If the decimal digit field is non-empty, the code point is
9227 # a decimal digit, and the other two fields will have the same value.
9228 # If it is empty, but the other digit field is non-empty, the code
9229 # point is an 'other digit', and the number field will have the same
9230 # value as the other digit field. If the other digit field is empty,
9231 # but the number field is non-empty, the code point is a generic
9233 if ($fields[$NUMERIC] eq "") {
9234 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9235 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9237 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9239 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9240 $fields[$NUMERIC] = 'NaN';
9243 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9244 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9245 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9246 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9248 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9249 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9250 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9253 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9255 # Rationals require extra effort.
9256 register_fraction($fields[$NUMERIC])
9257 if $fields[$NUMERIC] =~ qr{/};
9261 # For the properties that have empty fields in the file, and which
9262 # mean something different from empty, change them to that default.
9263 # Certain fields just haven't been empty so far in any Unicode
9264 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9265 # $CATEGORY. This leaves just the two fields, and so we hard-code in
9266 # the defaults; which are verly unlikely to ever change.
9267 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9268 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9270 # UAX44 says that if title is empty, it is the same as whatever upper
9272 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9274 # There are a few pairs of lines like:
9275 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9276 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9277 # that define ranges. These should be processed after the fields are
9278 # adjusted above, as they may override some of them; but mostly what
9279 # is left is to possibly adjust the $NAME field. The names of all the
9280 # paired lines start with a '<', but this is also true of '<control>,
9281 # which isn't one of these special ones.
9282 if ($fields[$NAME] eq '<control>') {
9284 # Some code points in this file have the pseudo-name
9285 # '<control>', but the official name for such ones is the null
9287 $fields[$NAME] = "";
9289 # We had better not be in between range lines.
9291 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
9295 elsif (substr($fields[$NAME], 0, 1) ne '<') {
9297 # Here is a non-range line. We had better not be in between range
9300 $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
9303 # XXX until charnames catches up.
9304 # if ($fields[$NAME] =~ s/- $cp $//x) {
9306 # # These are code points whose names end in their code points,
9307 # # which means the names are algorithmically derivable from the
9308 # # code points. To shorten the output Name file, the algorithm
9309 # # for deriving these is placed in the file instead of each
9310 # # code point, so they have map type $CP_IN_NAME
9311 # $fields[$NAME] = $CMD_DELIM
9319 # Some official names are really two alternate names with one in
9320 # parentheses. What we do here is use the full official one for
9321 # the standard property (stored just above), but for the charnames
9322 # table, we add two more entries, one for each of the alternate
9325 #check_and_handle_compound_name($cp, $fields[$NAME]);
9326 #check_and_handle_compound_name($cp, $unicode_1_name);
9327 # XXX until charnames catches up.
9329 elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
9330 $fields[$NAME] = $1;
9332 # Here we are at the beginning of a range pair.
9334 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'. Trying anyway");
9338 # Because the properties in the range do not overwrite any already
9339 # in the db, we must flush the buffers of what's already there, so
9340 # they get handled in the normal scheme.
9344 elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
9345 $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME]. Ignoring this line.");
9349 else { # Here, we are at the last line of a range pair.
9352 $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one. Ignoring this line.");
9358 # Check that the input is valid: that the closing of the range is
9359 # the same as the beginning.
9360 foreach my $i (0 .. $last_field) {
9361 next if $fields[$i] eq $previous_fields[$i];
9362 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
9365 # The processing differs depending on the type of range,
9366 # determined by its $NAME
9367 if ($fields[$NAME] =~ /^Hangul Syllable/) {
9369 # Check that the data looks right.
9370 if ($decimal_previous_cp != $SBase) {
9371 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
9373 if ($decimal_cp != $SBase + $SCount - 1) {
9374 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
9377 # The Hangul syllable range has a somewhat complicated name
9378 # generation algorithm. Each code point in it has a canonical
9379 # decomposition also computable by an algorithm. The
9380 # perl decomposition map table built from these is used only
9381 # by normalize.pm, which has the algorithm built in it, so the
9382 # decomposition maps are not needed, and are large, so are
9383 # omitted from it. If the full decomposition map table is to
9384 # be output, the decompositions are generated for it, in the
9385 # EOF handling code for this input file.
9387 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9389 # This range is stored in our internal structure with its
9390 # own map type, different from all others.
9391 $previous_fields[$NAME] = $CMD_DELIM
9398 elsif ($fields[$NAME] =~ /^CJK/) {
9400 # The name for these contains the code point itself, and all
9401 # are defined to have the same base name, regardless of what
9402 # is in the file. They are stored in our internal structure
9403 # with a map type of $CP_IN_NAME
9404 $previous_fields[$NAME] = $CMD_DELIM
9409 . 'CJK UNIFIED IDEOGRAPH';
9412 elsif ($fields[$CATEGORY] eq 'Co'
9413 || $fields[$CATEGORY] eq 'Cs')
9415 # The names of all the code points in these ranges are set to
9416 # null, as there are no names for the private use and
9417 # surrogate code points.
9419 $previous_fields[$NAME] = "";
9422 $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY]. Attempting to process it.");
9425 # The first line of the range caused everything else to be output,
9426 # and then its values were stored as the beginning values for the
9427 # next set of ranges, which this one ends. Now, for each value,
9428 # add a command to tell the handler that these values should not
9429 # replace any existing ones in our database.
9430 foreach my $i (0 .. $last_field) {
9431 $previous_fields[$i] = $CMD_DELIM
9436 . $previous_fields[$i];
9439 # And change things so it looks like the entire range has been
9440 # gone through with this being the final part of it. Adding the
9441 # command above to each field will cause this range to be flushed
9442 # during the next iteration, as it guaranteed that the stored
9443 # field won't match whatever value the next one has.
9445 $decimal_previous_cp = $decimal_cp;
9447 # We are now set up for the next iteration; so skip the remaining
9448 # code in this subroutine that does the same thing, but doesn't
9449 # know about these ranges.
9454 # On the very first line, we fake it so the code below thinks there is
9455 # nothing to output, and initialize so that when it does get output it
9456 # uses the first line's values for the lowest part of the range.
9457 # (One could avoid this by using peek(), but then one would need to
9458 # know the adjustments done above and do the same ones in the setup
9459 # routine; not worth it)
9462 @previous_fields = @fields;
9463 @start = ($cp) x scalar @fields;
9464 $decimal_previous_cp = $decimal_cp - 1;
9467 # For each field, output the stored up ranges that this code point
9468 # doesn't fit in. Earlier we figured out if all ranges should be
9469 # terminated because of changing the replace or map type styles, or if
9470 # there is a gap between this new code point and the previous one, and
9471 # that is stored in $force_output. But even if those aren't true, we
9472 # need to output the range if this new code point's value for the
9473 # given property doesn't match the stored range's.
9474 #local $to_trace = 1 if main::DEBUG;
9475 foreach my $i (0 .. $last_field) {
9476 my $field = $fields[$i];
9477 if ($force_output || $field ne $previous_fields[$i]) {
9479 # Flush the buffer of stored values.
9480 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9482 # Start a new range with this code point and its value
9484 $previous_fields[$i] = $field;
9488 # Set the values for the next time.
9490 $decimal_previous_cp = $decimal_cp;
9492 # The input line has generated whatever adjusted lines are needed, and
9493 # should not be looked at further.
9498 sub EOF_UnicodeData {
9499 # Called upon EOF to flush the buffers, and create the Hangul
9500 # decomposition mappings if needed.
9503 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9505 # Flush the buffers.
9506 foreach my $i (1 .. $last_field) {
9507 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9510 if (-e 'Jamo.txt') {
9512 # The algorithm is published by Unicode, based on values in
9513 # Jamo.txt, (which should have been processed before this
9514 # subroutine), and the results left in %Jamo
9516 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
9520 # If the full decomposition map table is being output, insert
9521 # into it the Hangul syllable mappings. This is to avoid having
9522 # to publish a subroutine in it to compute them. (which would
9523 # essentially be this code.) This uses the algorithm published by
9525 if (property_ref('Decomposition_Mapping')->to_output_map) {
9526 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9528 my $SIndex = $S - $SBase;
9529 my $L = $LBase + $SIndex / $NCount;
9530 my $V = $VBase + ($SIndex % $NCount) / $TCount;
9531 my $T = $TBase + $SIndex % $TCount;
9533 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9534 my $decomposition = sprintf("%04X %04X", $L, $V);
9535 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9536 $file->insert_adjusted_lines(
9537 sprintf("%04X; Decomposition_Mapping; %s",
9548 # Fix UCD lines in version 1. This is probably overkill, but this
9549 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
9550 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
9551 # removed. This program retains them
9552 # 2) didn't include ranges, which it should have, and which are now
9553 # added in @corrected_lines below. It was hand populated by
9554 # taking the data from Version 2, verified by analyzing
9556 # 3) There is a syntax error in the entry for U+09F8 which could
9557 # cause problems for utf8_heavy, and so is changed. It's
9558 # numeric value was simply a minus sign, without any number.
9559 # (Eventually Unicode changed the code point to non-numeric.)
9560 # 4) The decomposition types often don't match later versions
9561 # exactly, and the whole syntax of that field is different; so
9562 # the syntax is changed as well as the types to their later
9563 # terminology. Otherwise normalize.pm would be very unhappy
9564 # 5) Many ccc classes are different. These are left intact.
9565 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
9566 # fields. These are unchanged because it doesn't really cause
9567 # problems for Perl.
9568 # 7) A number of code points, such as controls, don't have their
9569 # Unicode Version 1 Names in this file. These are unchanged.
9571 my @corrected_lines = split /\n/, <<'END';
9572 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9573 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9574 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9575 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9576 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9577 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9581 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9583 #local $to_trace = 1 if main::DEBUG;
9584 trace $_ if main::DEBUG && $to_trace;
9586 # -1 => retain trailing null fields
9587 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9589 # At the first place that is wrong in the input, insert all the
9590 # corrections, replacing the wrong line.
9591 if ($code_point eq '4E00') {
9592 my @copy = @corrected_lines;
9594 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9596 $file->insert_lines(@copy);
9600 if ($fields[$NUMERIC] eq '-') {
9601 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
9604 if ($fields[$PERL_DECOMPOSITION] ne "") {
9606 # Several entries have this change to superscript 2 or 3 in the
9607 # middle. Convert these to the modern version, which is to use
9608 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9609 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9610 # 'HHHH HHHH 00B3 HHHH'.
9611 # It turns out that all of these that don't have another
9612 # decomposition defined at the beginning of the line have the
9613 # <square> decomposition in later releases.
9614 if ($code_point ne '00B2' && $code_point ne '00B3') {
9615 if ($fields[$PERL_DECOMPOSITION]
9616 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9618 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9619 $fields[$PERL_DECOMPOSITION] = '<square> '
9620 . $fields[$PERL_DECOMPOSITION];
9625 # If is like '<+circled> 0052 <-circled>', convert to
9627 $fields[$PERL_DECOMPOSITION] =~
9628 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9630 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9631 $fields[$PERL_DECOMPOSITION] =~
9632 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9633 or $fields[$PERL_DECOMPOSITION] =~
9634 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9635 or $fields[$PERL_DECOMPOSITION] =~
9636 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9637 or $fields[$PERL_DECOMPOSITION] =~
9638 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9640 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9641 $fields[$PERL_DECOMPOSITION] =~
9642 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9644 # Change names to modern form.
9645 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9646 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9647 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9648 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9650 # One entry has weird braces
9651 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9654 $_ = join ';', $code_point, @fields;
9655 trace $_ if main::DEBUG && $to_trace;
9659 sub filter_v2_1_5_ucd {
9660 # A dozen entries in this 2.1.5 file had the mirrored and numeric
9661 # columns swapped; These all had mirrored be 'N'. So if the numeric
9662 # column appears to be N, swap it back.
9664 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9665 if ($fields[$NUMERIC] eq 'N') {
9666 $fields[$NUMERIC] = $fields[$MIRRORED];
9667 $fields[$MIRRORED] = 'N';
9668 $_ = join ';', $code_point, @fields;
9672 } # End closure for UnicodeData
9674 sub process_GCB_test {
9677 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9679 while ($file->next_line) {
9680 push @backslash_X_tests, $_;
9686 sub process_NamedSequences {
9687 # NamedSequences.txt entries are just added to an array. Because these
9688 # don't look like the other tables, they have their own handler.
9690 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9692 # This just adds the sequence to an array for later handling
9694 return; # XXX Until charnames catches up
9696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9698 while ($file->next_line) {
9699 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9701 $file->carp_bad_line(
9702 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9705 push @named_sequences, "$sequence\t\t$name";
9714 sub filter_early_ea_lb {
9715 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
9716 # third field be the name of the code point, which can be ignored in
9717 # most cases. But it can be meaningful if it marks a range:
9718 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9719 # 3400;W;<CJK Ideograph Extension A, First>
9721 # We need to see the First in the example above to know it's a range.
9722 # They did not use the later range syntaxes. This routine changes it
9723 # to use the modern syntax.
9724 # $1 is the Input_file object.
9726 my @fields = split /\s*;\s*/;
9727 if ($fields[2] =~ /^<.*, First>/) {
9728 $first_range = $fields[0];
9731 elsif ($fields[2] =~ /^<.*, Last>/) {
9732 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9736 $_ = "$fields[0]; $fields[1]";
9743 sub filter_old_style_arabic_shaping {
9744 # Early versions used a different term for the later one.
9746 my @fields = split /\s*;\s*/;
9747 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9748 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
9749 $_ = join ';', @fields;
9753 sub filter_arabic_shaping_line {
9754 # ArabicShaping.txt has entries that look like:
9756 # The field containing 'TEH' is not used. The next field is Joining_Type
9757 # and the last is Joining_Group
9758 # This generates two lines to pass on, one for each property on the input
9762 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9764 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9767 $file->carp_bad_line('Extra fields');
9772 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9773 $_ = "$fields[0]; Joining_Type; $fields[2]";
9778 sub setup_special_casing {
9779 # SpecialCasing.txt contains the non-simple case change mappings. The
9780 # simple ones are in UnicodeData.txt, which should already have been read
9781 # in to the full property data structures, so as to initialize these with
9782 # the simple ones. Then the SpecialCasing.txt entries overwrite the ones
9783 # which have different full mappings.
9785 # This routine sees if the simple mappings are to be output, and if so,
9786 # copies what has already been put into the full mapping tables, while
9787 # they still contain only the simple mappings.
9789 # The reason it is done this way is that the simple mappings are probably
9790 # not going to be output, so it saves work to initialize the full tables
9791 # with the simple mappings, and then overwrite those relatively few
9792 # entries in them that have different full mappings, and thus skip the
9793 # simple mapping tables altogether.
9796 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9798 # For each of the case change mappings...
9799 foreach my $case ('lc', 'tc', 'uc') {
9800 my $full = property_ref($case);
9801 unless (defined $full && ! $full->is_empty) {
9802 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
9805 # The simple version's name in each mapping merely has an 's' in front
9807 my $simple = property_ref('s' . $case);
9808 $simple->initialize($case) if $simple->to_output_map();
9814 sub filter_special_casing_line {
9815 # Change the format of $_ from SpecialCasing.txt into something that the
9816 # generic handler understands. Each input line contains three case
9817 # mappings. This will generate three lines to pass to the generic handler
9818 # for each of those.
9820 # The input syntax (after stripping comments and trailing white space is
9821 # like one of the following (with the final two being entries that we
9823 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9824 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9825 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9826 # Note the trailing semi-colon, unlike many of the input files. That
9827 # means that there will be an extra null field generated by the split
9830 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9832 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9834 # field #4 is when this mapping is conditional. If any of these get
9835 # implemented, it would be by hard-coding in the casing functions in the
9836 # Perl core, not through tables. But if there is a new condition we don't
9837 # know about, output a warning. We know about all the conditions through
9839 if ($fields[4] ne "") {
9840 my @conditions = split ' ', $fields[4];
9841 if ($conditions[0] ne 'tr' # We know that these languages have
9842 # conditions, and some are multiple
9843 && $conditions[0] ne 'az'
9844 && $conditions[0] ne 'lt'
9846 # And, we know about a single condition Final_Sigma, but
9848 && ($v_version gt v5.2.0
9849 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9851 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9853 elsif ($conditions[0] ne 'Final_Sigma') {
9855 # Don't print out a message for Final_Sigma, because we have
9856 # hard-coded handling for it. (But the standard could change
9857 # what the rule should be, but it wouldn't show up here
9860 print "# SKIPPING Special Casing: $_\n"
9861 if $verbosity >= $VERBOSE;
9866 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9867 $file->carp_bad_line('Extra fields');
9872 $_ = "$fields[0]; lc; $fields[1]";
9873 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9874 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9879 sub filter_old_style_case_folding {
9880 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
9881 # and later style. Different letters were used in the earlier.
9884 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9886 my @fields = split /\s*;\s*/;
9887 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9890 elsif ($fields[1] eq 'L') {
9891 $fields[1] = 'C'; # L => C always
9893 elsif ($fields[1] eq 'E') {
9894 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
9902 $file->carp_bad_line("Expecting L or E in second field");
9906 $_ = join("; ", @fields) . ';';
9910 { # Closure for case folding
9912 # Create the map for simple only if are going to output it, for otherwise
9913 # it takes no part in anything we do.
9914 my $to_output_simple;
9916 # These are experimental, perhaps will need these to pass to regcomp.c to
9917 # handle the cases where for example the Kelvin sign character folds to k,
9918 # and in regcomp, we need to know which of the characters can have a
9919 # non-latin1 char fold to it, so it doesn't do the optimizations it might
9921 my @latin1_singly_folded;
9924 sub setup_case_folding($) {
9925 # Read in the case foldings in CaseFolding.txt. This handles both
9926 # simple and full case folding.
9929 = property_ref('Simple_Case_Folding')->to_output_map;
9934 sub filter_case_folding_line {
9935 # Called for each line in CaseFolding.txt
9936 # Input lines look like:
9937 # 0041; C; 0061; # LATIN CAPITAL LETTER A
9938 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
9939 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
9941 # 'C' means that folding is the same for both simple and full
9942 # 'F' that it is only for full folding
9943 # 'S' that it is only for simple folding
9944 # 'T' is locale-dependent, and ignored
9945 # 'I' is a type of 'F' used in some early releases.
9946 # Note the trailing semi-colon, unlike many of the input files. That
9947 # means that there will be an extra null field generated by the split
9948 # below, which we ignore and hence is not an error.
9951 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9953 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
9954 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
9955 $file->carp_bad_line('Extra fields');
9960 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
9965 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
9966 # I are all full foldings
9967 if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
9968 $_ = "$range; Case_Folding; $map";
9973 $file->carp_bad_line('Expecting C F I S or T in second field');
9978 # C and S are simple foldings, but simple case folding is not needed
9979 # unless we explicitly want its map table output.
9980 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
9981 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
9984 # Experimental, see comment above
9985 if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
9986 my @folded = split ' ', $map;
9987 if (hex $folded[0] < 256 && @folded == 1) {
9988 push @latin1_singly_folded, hex $folded[0];
9990 foreach my $folded (@folded) {
9991 push @latin1_folded, hex $folded if hex $folded < 256;
9999 # Experimental, see comment above
10002 #local $to_trace = 1 if main::DEBUG;
10003 @latin1_singly_folded = uniques(@latin1_singly_folded);
10004 @latin1_folded = uniques(@latin1_folded);
10005 trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
10006 trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
10009 } # End case fold closure
10011 sub filter_jamo_line {
10012 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10013 # from this file that is used in generating the Name property for Jamo
10014 # code points. But, it also is used to convert early versions' syntax
10015 # into the modern form. Here are two examples:
10016 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10017 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10019 # The input is $_, the output is $_ filtered.
10021 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10023 # Let the caller handle unexpected input. In earlier versions, there was
10024 # a third field which is supposed to be a comment, but did not have a '#'
10026 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10028 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10031 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10032 $fields[1] = 'R' if $fields[0] eq '1105';
10034 # Add to structure so can generate Names from it.
10035 my $cp = hex $fields[0];
10036 my $short_name = $fields[1];
10037 $Jamo{$cp} = $short_name;
10038 if ($cp <= $LBase + $LCount) {
10039 $Jamo_L{$short_name} = $cp - $LBase;
10041 elsif ($cp <= $VBase + $VCount) {
10042 $Jamo_V{$short_name} = $cp - $VBase;
10044 elsif ($cp <= $TBase + $TCount) {
10045 $Jamo_T{$short_name} = $cp - $TBase;
10048 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10052 # Reassemble using just the first two fields to look like a typical
10053 # property file line
10054 $_ = "$fields[0]; $fields[1]";
10059 sub register_fraction($) {
10060 # This registers the input rational number so that it can be passed on to
10061 # utf8_heavy.pl, both in rational and floating forms.
10063 my $rational = shift;
10065 my $float = eval $rational;
10066 $nv_floating_to_rational{$float} = $rational;
10070 sub filter_numeric_value_line {
10071 # DNumValues contains lines of a different syntax than the typical
10073 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10075 # This routine transforms $_ containing the anomalous syntax to the
10076 # typical, by filtering out the extra columns, and convert early version
10077 # decimal numbers to strings that look like rational numbers.
10080 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10082 # Starting in 5.1, there is a rational field. Just use that, omitting the
10083 # extra columns. Otherwise convert the decimal number in the second field
10084 # to a rational, and omit extraneous columns.
10085 my @fields = split /\s*;\s*/, $_, -1;
10088 if ($v_version ge v5.1.0) {
10089 if (@fields != 4) {
10090 $file->carp_bad_line('Not 4 semi-colon separated fields');
10094 $rational = $fields[3];
10095 $_ = join '; ', @fields[ 0, 3 ];
10099 # Here, is an older Unicode file, which has decimal numbers instead of
10100 # rationals in it. Use the fraction to calculate the denominator and
10101 # convert to rational.
10103 if (@fields != 2 && @fields != 3) {
10104 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10109 my $codepoints = $fields[0];
10110 my $decimal = $fields[1];
10111 if ($decimal =~ s/\.0+$//) {
10113 # Anything ending with a decimal followed by nothing but 0's is an
10115 $_ = "$codepoints; $decimal";
10116 $rational = $decimal;
10121 if ($decimal =~ /\.50*$/) {
10125 # Here have the hardcoded repeating decimals in the fraction, and
10126 # the denominator they imply. There were only a few denominators
10127 # in the older Unicode versions of this file which this code
10128 # handles, so it is easy to convert them.
10130 # The 4 is because of a round-off error in the Unicode 3.2 files
10131 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10134 elsif ($decimal =~ /\.[27]50*$/) {
10137 elsif ($decimal =~ /\.[2468]0*$/) {
10140 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10143 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10146 if ($denominator) {
10147 my $sign = ($decimal < 0) ? "-" : "";
10148 my $numerator = int((abs($decimal) * $denominator) + .5);
10149 $rational = "$sign$numerator/$denominator";
10150 $_ = "$codepoints; $rational";
10153 $file->carp_bad_line("Can't cope with number '$decimal'.");
10160 register_fraction($rational) if $rational =~ qr{/};
10165 my %unihan_properties;
10170 # Do any special setup for Unihan properties.
10172 # This property gives the wrong computed type, so override.
10173 my $usource = property_ref('kIRG_USource');
10174 $usource->set_type($STRING) if defined $usource;
10176 # This property is to be considered binary, so change all the values
10178 $iicore = property_ref('kIICore');
10179 if (defined $iicore) {
10180 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10182 # We have to change the default map, because the @missing line is
10183 # misleading, given that we are treating it as binary.
10184 $iicore->set_default_map('N');
10185 $iicore->set_type($BINARY);
10191 sub filter_unihan_line {
10192 # Change unihan db lines to look like the others in the db. Here is
10194 # U+341C kCangjie IEKN
10196 # Tabs are used instead of semi-colons to separate fields; therefore
10197 # they may have semi-colons embedded in them. Change these to periods
10198 # so won't screw up the rest of the code.
10201 # Remove lines that don't look like ones we accept.
10202 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10207 # Extract the property, and save a reference to its object.
10209 if (! exists $unihan_properties{$property}) {
10210 $unihan_properties{$property} = property_ref($property);
10213 # Don't do anything unless the property is one we're handling, which
10214 # we determine by seeing if there is an object defined for it or not
10215 if (! defined $unihan_properties{$property}) {
10220 # The iicore property is supposed to be a boolean, so convert to our
10221 # standard boolean form.
10222 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10223 $_ =~ s/$property.*/$property\tY/
10226 # Convert the tab separators to our standard semi-colons, and convert
10227 # the U+HHHH notation to the rest of the standard's HHHH
10229 s/\b U \+ (?= $code_point_re )//xg;
10231 #local $to_trace = 1 if main::DEBUG;
10232 trace $_ if main::DEBUG && $to_trace;
10238 sub filter_blocks_lines {
10239 # In the Blocks.txt file, the names of the blocks don't quite match the
10240 # names given in PropertyValueAliases.txt, so this changes them so they
10241 # do match: Blanks and hyphens are changed into underscores. Also makes
10242 # early release versions look like later ones
10244 # $_ is transformed to the correct value.
10247 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10249 if ($v_version lt v3.2.0) {
10250 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10255 # Old versions used a different syntax to mark the range.
10256 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10259 my @fields = split /\s*;\s*/, $_, -1;
10260 if (@fields != 2) {
10261 $file->carp_bad_line("Expecting exactly two fields");
10266 # Change hyphens and blanks in the block name field only
10267 $fields[1] =~ s/[ -]/_/g;
10268 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10270 $_ = join("; ", @fields);
10275 my $current_property;
10277 sub filter_old_style_proplist {
10278 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
10279 # was in a completely different syntax. Ken Whistler of Unicode says
10280 # that it was something he used as an aid for his own purposes, but
10281 # was never an official part of the standard. However, comments in
10282 # DAge.txt indicate that non-character code points were available in
10283 # the UCD as of 3.1. It is unclear to me (khw) how they could be
10284 # there except through this file (but on the other hand, they first
10285 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10286 # not. But the claim is that it was published as an aid to others who
10287 # might want some more information than was given in the official UCD
10288 # of the time. Many of the properties in it were incorporated into
10289 # the later PropList.txt, but some were not. This program uses this
10290 # early file to generate property tables that are otherwise not
10291 # accessible in the early UCD's, and most were probably not really
10292 # official at that time, so one could argue that it should be ignored,
10293 # and you can easily modify things to skip this. And there are bugs
10294 # in this file in various versions. (For example, the 2.1.9 version
10295 # removes from Alphabetic the CJK range starting at 4E00, and they
10296 # weren't added back in until 3.1.0.) Many of this file's properties
10297 # were later sanctioned, so this code generates tables for those
10298 # properties that aren't otherwise in the UCD of the time but
10299 # eventually did become official, and throws away the rest. Here is a
10300 # list of all the ones that are thrown away:
10301 # Bidi=* duplicates UnicodeData.txt
10302 # Combining never made into official property;
10304 # Composite never made into official property.
10305 # Currency Symbol duplicates UnicodeData.txt: gc=sc
10306 # Decimal Digit duplicates UnicodeData.txt: gc=nd
10307 # Delimiter never made into official property;
10309 # Format Control never made into official property;
10311 # High Surrogate duplicates Blocks.txt
10312 # Ignorable Control never made into official property;
10314 # ISO Control duplicates UnicodeData.txt: gc=cc
10315 # Left of Pair never made into official property;
10316 # Line Separator duplicates UnicodeData.txt: gc=zl
10317 # Low Surrogate duplicates Blocks.txt
10318 # Non-break was actually listed as a property
10319 # in 3.2, but without any code
10320 # points. Unicode denies that this
10321 # was ever an official property
10322 # Non-spacing duplicate UnicodeData.txt: gc=mn
10323 # Numeric duplicates UnicodeData.txt: gc=cc
10324 # Paired Punctuation never made into official property;
10325 # appears to be gc=ps + gc=pe
10326 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
10327 # Private Use duplicates UnicodeData.txt: gc=co
10328 # Private Use High Surrogate duplicates Blocks.txt
10329 # Punctuation duplicates UnicodeData.txt: gc=p
10330 # Space different definition than eventual
10332 # Titlecase duplicates UnicodeData.txt: gc=lt
10333 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
10334 # Zero-width never made into offical property;
10336 # Most of the properties have the same names in this file as in later
10337 # versions, but a couple do not.
10339 # This subroutine filters $_, converting it from the old style into
10340 # the new style. Here's a sample of the old-style
10342 # *******************************************
10344 # Property dump for: 0x100000A0 (Join Control)
10346 # 200C..200D (2 chars)
10348 # In the example, the property is "Join Control". It is kept in this
10349 # closure between calls to the subroutine. The numbers beginning with
10350 # 0x were internal to Ken's program that generated this file.
10352 # If this line contains the property name, extract it.
10353 if (/^Property dump for: [^(]*\((.*)\)/) {
10356 # Convert white space to underscores.
10359 # Convert the few properties that don't have the same name as
10360 # their modern counterparts
10361 s/Identifier_Part/ID_Continue/
10362 or s/Not_a_Character/NChar/;
10364 # If the name matches an existing property, use it.
10365 if (defined property_ref($_)) {
10366 trace "new property=", $_ if main::DEBUG && $to_trace;
10367 $current_property = $_;
10369 else { # Otherwise discard it
10370 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10371 undef $current_property;
10373 $_ = ""; # The property is saved for the next lines of the
10374 # file, but this defining line is of no further use,
10375 # so clear it so that the caller won't process it
10378 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10380 # Here, the input line isn't a header defining a property for the
10381 # following section, and either we aren't in such a section, or
10382 # the line doesn't look like one that defines the code points in
10383 # such a section. Ignore this line.
10388 # Here, we have a line defining the code points for the current
10389 # stashed property. Anything starting with the first blank is
10390 # extraneous. Otherwise, it should look like a normal range to
10391 # the caller. Append the property name so that it looks just like
10392 # a modern PropList entry.
10395 $_ .= "; $current_property";
10397 trace $_ if main::DEBUG && $to_trace;
10400 } # End closure for old style proplist
10402 sub filter_old_style_normalization_lines {
10403 # For early releases of Unicode, the lines were like:
10404 # 74..2A76 ; NFKD_NO
10405 # For later releases this became:
10406 # 74..2A76 ; NFKD_QC; N
10407 # Filter $_ to look like those in later releases.
10408 # Similarly for MAYBEs
10410 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10412 # Also, the property FC_NFKC was abbreviated to FNC
10417 sub finish_Unicode() {
10418 # This routine should be called after all the Unicode files have been read
10420 # 1) Adds the mappings for code points missing from the files which have
10421 # defaults specified for them.
10422 # 2) At this this point all mappings are known, so it computes the type of
10423 # each property whose type hasn't been determined yet.
10424 # 3) Calculates all the regular expression match tables based on the
10426 # 3) Calculates and adds the tables which are defined by Unicode, but
10427 # which aren't derived by them
10429 # For each property, fill in any missing mappings, and calculate the re
10430 # match tables. If a property has more than one missing mapping, the
10431 # default is a reference to a data structure, and requires data from other
10432 # properties to resolve. The sort is used to cause these to be processed
10433 # last, after all the other properties have been calculated.
10434 # (Fortunately, the missing properties so far don't depend on each other.)
10435 foreach my $property
10436 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10439 # $perl has been defined, but isn't one of the Unicode properties that
10440 # need to be finished up.
10441 next if $property == $perl;
10443 # Handle the properties that have more than one possible default
10444 if (ref $property->default_map) {
10445 my $default_map = $property->default_map;
10447 # These properties have stored in the default_map:
10449 # 1) A default map which applies to all code points in a
10451 # 2) an expression which will evaluate to the list of code
10452 # points in that class
10454 # 3) the default map which applies to every other missing code
10457 # Go through each list.
10458 while (my ($default, $eval) = $default_map->get_next_defaults) {
10460 # Get the class list, and intersect it with all the so-far
10461 # unspecified code points yielding all the code points
10462 # in the class that haven't been specified.
10463 my $list = eval $eval;
10465 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10469 # Narrow down the list to just those code points we don't have
10471 $list = $list & $property->inverse_list;
10473 # Add mappings to the property for each code point in the list
10474 foreach my $range ($list->ranges) {
10475 $property->add_map($range->start, $range->end, $default);
10479 # All remaining code points have the other mapping. Set that up
10480 # so the normal single-default mapping code will work on them
10481 $property->set_default_map($default_map->other_default);
10483 # And fall through to do that
10486 # We should have enough data now to compute the type of the property.
10487 $property->compute_type;
10488 my $property_type = $property->type;
10490 next if ! $property->to_create_match_tables;
10492 # Here want to create match tables for this property
10494 # The Unicode db always (so far, and they claim into the future) have
10495 # the default for missing entries in binary properties be 'N' (unless
10496 # there is a '@missing' line that specifies otherwise)
10497 if ($property_type == $BINARY && ! defined $property->default_map) {
10498 $property->set_default_map('N');
10501 # Add any remaining code points to the mapping, using the default for
10502 # missing code points
10503 if (defined (my $default_map = $property->default_map)) {
10504 foreach my $range ($property->inverse_list->ranges) {
10505 $property->add_map($range->start, $range->end, $default_map);
10508 # Make sure there is a match table for the default
10509 if (! defined $property->table($default_map)) {
10510 $property->add_match_table($default_map);
10514 # Have all we need to populate the match tables.
10515 my $property_name = $property->name;
10516 foreach my $range ($property->ranges) {
10517 my $map = $range->value;
10518 my $table = property_ref($property_name)->table($map);
10519 if (! defined $table) {
10521 # Integral and rational property values are not necessarily
10522 # defined in PropValueAliases, but all other ones should be,
10524 if ($v_version ge v5.1.0
10525 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10527 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
10529 $table = property_ref($property_name)->add_match_table($map);
10532 $table->add_range($range->start, $range->end);
10535 # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10536 # all properties have this optional prefix. These do not get a
10537 # separate entry in the pod file, because are covered by a wild-card
10539 foreach my $alias ($property->aliases) {
10540 my $Is_name = 'Is_' . $alias->name;
10541 if (! defined (my $pre_existing = property_ref($Is_name))) {
10542 $property->add_alias($Is_name,
10544 Status => $alias->status,
10545 Externally_Ok => 0);
10549 # It seemed too much work to add in these warnings when it
10550 # appears that Unicode has made a decision never to begin a
10551 # property name with 'Is_', so this shouldn't happen, but just
10552 # in case, it is a warning.
10553 Carp::my_carp(<<END
10554 There is already an alias named $Is_name (from " . $pre_existing . "), so not
10555 creating this alias for $property. The generated table and pod files do not
10556 warn users of this conflict.
10559 $has_Is_conflicts++;
10561 } # End of loop through aliases for this property
10562 } # End of loop through all Unicode properties.
10564 # Fill in the mappings that Unicode doesn't completely furnish. First the
10565 # single letter major general categories. If Unicode were to start
10566 # delivering the values, this would be redundant, but better that than to
10567 # try to figure out if should skip and not get it right. Ths could happen
10568 # if a new major category were to be introduced, and the hard-coded test
10569 # wouldn't know about it.
10570 # This routine depends on the standard names for the general categories
10571 # being what it thinks they are, like 'Cn'. The major categories are the
10572 # union of all the general category tables which have the same first
10573 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10574 foreach my $minor_table ($gc->tables) {
10575 my $minor_name = $minor_table->name;
10576 next if length $minor_name == 1;
10577 if (length $minor_name != 2) {
10578 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
10582 my $major_name = uc(substr($minor_name, 0, 1));
10583 my $major_table = $gc->table($major_name);
10584 $major_table += $minor_table;
10587 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
10588 # defines it as LC)
10589 my $LC = $gc->table('LC');
10590 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
10591 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
10594 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10595 # deliver the correct values in it
10596 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10598 # Lt not in release 1.
10599 $LC += $gc->table('Lt') if defined $gc->table('Lt');
10601 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10603 my $Cs = $gc->table('Cs');
10605 $Cs->add_note('Mostly not usable in Perl.');
10606 $Cs->add_comment(join_lines(<<END
10607 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10608 Unicode text, and hence their use will generate (usually fatal) messages
10614 # Folding information was introduced later into Unicode data. To get
10615 # Perl's case ignore (/i) to work at all in releases that don't have
10616 # folding, use the best available alternative, which is lower casing.
10617 my $fold = property_ref('Simple_Case_Folding');
10618 if ($fold->is_empty) {
10619 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10620 $fold->add_note(join_lines(<<END
10621 WARNING: This table uses lower case as a substitute for missing fold
10627 # Multiple-character mapping was introduced later into Unicode data. If
10628 # missing, use the single-characters maps as best available alternative
10629 foreach my $map (qw { Uppercase_Mapping
10634 my $full = property_ref($map);
10635 if ($full->is_empty) {
10636 my $simple = property_ref('Simple_' . $map);
10637 $full->initialize($simple);
10638 $full->add_comment($simple->comment) if ($simple->comment);
10639 $full->add_note(join_lines(<<END
10640 WARNING: This table uses simple mapping (single-character only) as a
10641 substitute for missing multiple-character information
10649 sub compile_perl() {
10650 # Create perl-defined tables. Almost all are part of the pseudo-property
10651 # named 'perl' internally to this program. Many of these are recommended
10652 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10653 # on those found there.
10654 # Almost all of these are equivalent to some Unicode property.
10655 # A number of these properties have equivalents restricted to the ASCII
10656 # range, with their names prefaced by 'Posix', to signify that these match
10657 # what the Posix standard says they should match. A couple are
10658 # effectively this, but the name doesn't have 'Posix' in it because there
10659 # just isn't any Posix equivalent.
10661 # 'Any' is all code points. As an error check, instead of just setting it
10662 # to be that, construct it to be the union of all the major categories
10663 my $Any = $perl->add_match_table('Any',
10664 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10667 foreach my $major_table ($gc->tables) {
10669 # Major categories are the ones with single letter names.
10670 next if length($major_table->name) != 1;
10672 $Any += $major_table;
10675 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10676 Carp::my_carp_bug("Generated highest code point ("
10677 . sprintf("%X", $Any->max)
10678 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10680 if ($Any->range_count != 1 || $Any->min != 0) {
10681 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10684 $Any->add_alias('All');
10686 # Assigned is the opposite of gc=unassigned
10687 my $Assigned = $perl->add_match_table('Assigned',
10688 Description => "All assigned code points",
10689 Initialize => ~ $gc->table('Unassigned'),
10692 # Our internal-only property should be treated as more than just a
10694 $perl->add_match_table('_CombAbove')
10695 ->set_equivalent_to(property_ref('ccc')->table('Above'),
10698 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10699 if (defined $block) { # This is equivalent to the block if have it.
10700 my $Unicode_ASCII = $block->table('Basic_Latin');
10701 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10702 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10706 # Very early releases didn't have blocks, so initialize ASCII ourselves if
10708 if ($ASCII->is_empty) {
10709 $ASCII->initialize([ 0..127 ]);
10712 # Get the best available case definitions. Early Unicode versions didn't
10713 # have Uppercase and Lowercase defined, so use the general category
10714 # instead for them.
10715 my $Lower = $perl->add_match_table('Lower');
10716 my $Unicode_Lower = property_ref('Lowercase');
10717 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10718 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10721 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10724 $perl->add_match_table("PosixLower",
10725 Description => "[a-z]",
10726 Initialize => $Lower & $ASCII,
10729 my $Upper = $perl->add_match_table('Upper');
10730 my $Unicode_Upper = property_ref('Uppercase');
10731 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10732 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10735 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10738 $perl->add_match_table("PosixUpper",
10739 Description => "[A-Z]",
10740 Initialize => $Upper & $ASCII,
10743 # Earliest releases didn't have title case. Initialize it to empty if not
10744 # otherwise present
10745 my $Title = $perl->add_match_table('Title');
10746 my $lt = $gc->table('Lt');
10748 $Title->set_equivalent_to($lt, Related => 1);
10751 # If this Unicode version doesn't have Cased, set up our own. From
10752 # Unicode 5.1: Definition D120: A character C is defined to be cased if
10753 # and only if C has the Lowercase or Uppercase property or has a
10754 # General_Category value of Titlecase_Letter.
10755 unless (defined property_ref('Cased')) {
10756 my $cased = $perl->add_match_table('Cased',
10757 Initialize => $Lower + $Upper + $Title,
10758 Description => 'Uppercase or Lowercase or Titlecase',
10762 # Similarly, set up our own Case_Ignorable property if this Unicode
10763 # version doesn't have it. From Unicode 5.1: Definition D121: A character
10764 # C is defined to be case-ignorable if C has the value MidLetter or the
10765 # value MidNumLet for the Word_Break property or its General_Category is
10766 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10767 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10769 # Perl has long had an internal-only alias for this property.
10770 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10771 my $case_ignorable = property_ref('Case_Ignorable');
10772 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10773 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10778 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10780 # The following three properties are not in early releases
10781 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10782 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10783 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10785 # For versions 4.1 - 5.0, there is no MidNumLet property, and
10786 # correspondingly the case-ignorable definition lacks that one. For
10787 # 4.0, it appears that it was meant to be the same definition, but was
10788 # inadvertently omitted from the standard's text, so add it if the
10789 # property actually is there
10790 my $wb = property_ref('Word_Break');
10792 my $midlet = $wb->table('MidLetter');
10793 $perl_case_ignorable += $midlet if defined $midlet;
10794 my $midnumlet = $wb->table('MidNumLet');
10795 $perl_case_ignorable += $midnumlet if defined $midnumlet;
10799 # In earlier versions of the standard, instead of the above two
10800 # properties , just the following characters were used:
10801 $perl_case_ignorable += 0x0027 # APOSTROPHE
10802 + 0x00AD # SOFT HYPHEN (SHY)
10803 + 0x2019; # RIGHT SINGLE QUOTATION MARK
10807 # The remaining perl defined tables are mostly based on Unicode TR 18,
10808 # "Annex C: Compatibility Properties". All of these have two versions,
10809 # one whose name generally begins with Posix that is posix-compliant, and
10810 # one that matches Unicode characters beyond the Posix, ASCII range
10812 my $Alpha = $perl->add_match_table('Alpha');
10814 # Alphabetic was not present in early releases
10815 my $Alphabetic = property_ref('Alphabetic');
10816 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10817 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10821 # For early releases, we don't get it exactly right. The below
10822 # includes more than it should, which in 5.2 terms is: L + Nl +
10823 # Other_Alphabetic. Other_Alphabetic contains many characters from
10824 # Mn and Mc. It's better to match more than we should, than less than
10826 $Alpha->initialize($gc->table('Letter')
10828 + $gc->table('Mc'));
10829 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
10830 $Alpha->add_description('Alphabetic');
10832 $perl->add_match_table("PosixAlpha",
10833 Description => "[A-Za-z]",
10834 Initialize => $Alpha & $ASCII,
10837 my $Alnum = $perl->add_match_table('Alnum',
10838 Description => 'Alphabetic and (Decimal) Numeric',
10839 Initialize => $Alpha + $gc->table('Decimal_Number'),
10841 $perl->add_match_table("PosixAlnum",
10842 Description => "[A-Za-z0-9]",
10843 Initialize => $Alnum & $ASCII,
10846 my $Word = $perl->add_match_table('Word',
10847 Description => '\w, including beyond ASCII',
10848 Initialize => $Alnum + $gc->table('Mark'),
10850 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10851 $Word += $Pc if defined $Pc;
10853 # This is a Perl extension, so the name doesn't begin with Posix.
10854 $perl->add_match_table('PerlWord',
10855 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10856 Initialize => $Word & $ASCII,
10859 my $Blank = $perl->add_match_table('Blank',
10860 Description => '\h, Horizontal white space',
10862 # 200B is Zero Width Space which is for line
10863 # break control, and was listed as
10864 # Space_Separator in early releases
10865 Initialize => $gc->table('Space_Separator')
10869 $Blank->add_alias('HorizSpace'); # Another name for it.
10870 $perl->add_match_table("PosixBlank",
10871 Description => "\\t and ' '",
10872 Initialize => $Blank & $ASCII,
10875 my $VertSpace = $perl->add_match_table('VertSpace',
10876 Description => '\v',
10877 Initialize => $gc->table('Line_Separator')
10878 + $gc->table('Paragraph_Separator')
10879 + 0x000A # LINE FEED
10880 + 0x000B # VERTICAL TAB
10881 + 0x000C # FORM FEED
10882 + 0x000D # CARRIAGE RETURN
10885 # No Posix equivalent for vertical space
10887 my $Space = $perl->add_match_table('Space',
10888 Description => '\s including beyond ASCII plus vertical tab',
10889 Initialize => $Blank + $VertSpace,
10891 $perl->add_match_table("PosixSpace",
10892 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
10893 Initialize => $Space & $ASCII,
10896 # Perl's traditional space doesn't include Vertical Tab
10897 my $SpacePerl = $perl->add_match_table('SpacePerl',
10898 Description => '\s, including beyond ASCII',
10899 Initialize => $Space - 0x000B,
10901 $perl->add_match_table('PerlSpace',
10902 Description => '\s, restricted to ASCII',
10903 Initialize => $SpacePerl & $ASCII,
10906 my $Cntrl = $perl->add_match_table('Cntrl',
10907 Description => 'Control characters');
10908 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
10909 $perl->add_match_table("PosixCntrl",
10910 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
10911 Initialize => $Cntrl & $ASCII,
10914 # $controls is a temporary used to construct Graph.
10915 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10916 + $gc->table('Control'));
10917 # Cs not in release 1
10918 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10920 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
10921 my $Graph = $perl->add_match_table('Graph',
10922 Description => 'Characters that are graphical',
10923 Initialize => ~ ($Space + $controls),
10925 $perl->add_match_table("PosixGraph",
10927 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
10928 Initialize => $Graph & $ASCII,
10931 my $Print = $perl->add_match_table('Print',
10932 Description => 'Characters that are graphical plus space characters (but no controls)',
10933 Initialize => $Blank + $Graph - $gc->table('Control'),
10935 $perl->add_match_table("PosixPrint",
10937 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
10938 Initialize => $Print & $ASCII,
10941 my $Punct = $perl->add_match_table('Punct');
10942 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
10944 # \p{punct} doesn't include the symbols, which posix does
10945 $perl->add_match_table('PosixPunct',
10946 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
10947 Initialize => $ASCII & ($gc->table('Punctuation')
10948 + $gc->table('Symbol')),
10951 my $Digit = $perl->add_match_table('Digit',
10952 Description => '\d, extended beyond just [0-9]');
10953 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
10954 my $PosixDigit = $perl->add_match_table("PosixDigit",
10955 Description => '[0-9]',
10956 Initialize => $Digit & $ASCII,
10959 # Hex_Digit was not present in first release
10960 my $Xdigit = $perl->add_match_table('XDigit');
10961 my $Hex = property_ref('Hex_Digit');
10962 if (defined $Hex && ! $Hex->is_empty) {
10963 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
10966 # (Have to use hex instead of e.g. '0', because could be running on an
10967 # non-ASCII machine, and we want the Unicode (ASCII) values)
10968 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
10969 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
10970 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
10973 my $dt = property_ref('Decomposition_Type');
10974 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
10975 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
10976 Perl_Extension => 1,
10977 Note => 'Union of all non-canonical decompositions',
10980 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
10981 # than SD appeared, construct it ourselves, based on the first release SD
10983 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
10984 my $soft_dotted = property_ref('Soft_Dotted');
10985 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
10986 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
10990 # This list came from 3.2 Soft_Dotted.
10991 $CanonDCIJ->initialize([ 0x0069,
11000 $CanonDCIJ = $CanonDCIJ & $Assigned;
11003 # These are used in Unicode's definition of \X
11004 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11005 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11007 my $gcb = property_ref('Grapheme_Cluster_Break');
11009 # The 'extended' grapheme cluster came in 5.1. The non-extended
11010 # definition differs too much from the traditional Perl one to use.
11011 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11013 # Note that assumes HST is defined; it came in an earlier release than
11014 # GCB. In the line below, two negatives means: yes hangul
11015 $begin += ~ property_ref('Hangul_Syllable_Type')
11016 ->table('Not_Applicable')
11017 + ~ ($gcb->table('Control')
11018 + $gcb->table('CR')
11019 + $gcb->table('LF'));
11020 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11022 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11023 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
11025 else { # Old definition, used on early releases.
11026 $extend += $gc->table('Mark')
11029 $begin += ~ $extend;
11031 # Here we may have a release that has the regular grapheme cluster
11032 # defined, or a release that doesn't have anything defined.
11033 # We set things up so the Perl core degrades gracefully, possibly with
11034 # placeholders that match nothing.
11036 if (! defined $gcb) {
11037 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11039 my $hst = property_ref('HST');
11040 if (!defined $hst) {
11041 $hst = Property->new('HST', Status => $PLACEHOLDER);
11042 $hst->add_match_table('Not_Applicable',
11043 Initialize => $Any,
11047 # On some releases, here we may not have the needed tables for the
11048 # perl core, in some releases we may.
11049 foreach my $name (qw{ L LV LVT T V prepend }) {
11050 my $table = $gcb->table($name);
11051 if (! defined $table) {
11052 $table = $gcb->add_match_table($name);
11053 push @tables_that_may_be_empty, $table->complete_name;
11056 # The HST property predates the GCB one, and has identical tables
11057 # for some of them, so use it if we can.
11058 if ($table->is_empty
11060 && defined $hst->table($name))
11062 $table += $hst->table($name);
11067 # More GCB. If we found some hangul syllables, populate a combined
11069 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11070 my $LV = $gcb->table('LV');
11071 if ($LV->is_empty) {
11072 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11074 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11075 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
11078 # Create a new property specially located that is a combination of the
11079 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11080 # Name_Alias properties. (The final duplicates elements of the first.) A
11081 # comment for it is constructed based on the actual properties present and
11083 my $perl_charname = Property->new('Perl_Charnames',
11084 Core_Access => '\N{...} and charnames.pm',
11086 Directory => File::Spec->curdir(),
11088 Internal_Only_Warning => 1,
11089 Perl_Extension => 1,
11092 Initialize => property_ref('Unicode_1_Name'),
11094 # Name overrides Unicode_1_Name
11095 $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
11096 my @composition = ('Name', 'Unicode_1_Name');
11098 if (@named_sequences) {
11099 push @composition, 'Named_Sequence';
11100 foreach my $sequence (@named_sequences) {
11101 $perl_charname->add_anomalous_entry($sequence);
11105 my $alias_sentence = "";
11106 my $alias = property_ref('Name_Alias');
11107 if (defined $alias) {
11108 push @composition, 'Name_Alias';
11109 $alias->reset_each_range;
11110 while (my ($range) = $alias->each_range) {
11111 next if $range->value eq "";
11112 if ($range->start != $range->end) {
11113 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
11115 $perl_charname->add_duplicate($range->start, $range->value);
11117 $alias_sentence = <<END;
11118 The Name_Alias property adds duplicate code point entries with a corrected
11119 name. The original (less correct, but still valid) name will be physically
11124 if (@composition <= 2) { # Always at least 2
11125 $comment = join " and ", @composition;
11128 $comment = join ", ", @composition[0 .. scalar @composition - 2];
11129 $comment .= ", and $composition[-1]";
11132 # Wait for charnames to catch up
11133 # foreach my $entry (@more_Names,
11134 # split "\n", <<"END"
11142 #FEFF; BYTE ORDER MARK
11145 # #local $to_trace = 1 if main::DEBUG;
11146 # trace $entry if main::DEBUG && $to_trace;
11147 # my ($code_point, $name) = split /\s*;\s*/, $entry;
11148 # $code_point = hex $code_point;
11149 # trace $code_point, $name if main::DEBUG && $to_trace;
11150 # $perl_charname->add_duplicate($code_point, $name);
11152 # #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
11153 $perl_charname->add_comment(join_lines( <<END
11154 This file is for charnames.pm. It is the union of the $comment properties.
11155 Unicode_1_Name entries are used only for otherwise nameless code
11161 # The combining class property used by Perl's normalize.pm is not located
11162 # in the normal mapping directory; create a copy for it.
11163 my $ccc = property_ref('Canonical_Combining_Class');
11164 my $perl_ccc = Property->new('Perl_ccc',
11165 Default_Map => $ccc->default_map,
11166 Full_Name => 'Perl_Canonical_Combining_Class',
11167 Internal_Only_Warning => 1,
11168 Perl_Extension => 1,
11171 Initialize => $ccc,
11172 File => 'CombiningClass',
11173 Directory => File::Spec->curdir(),
11175 $perl_ccc->set_to_output_map(1);
11176 $perl_ccc->add_comment(join_lines(<<END
11177 This mapping is for normalize.pm. It is currently identical to the Unicode
11178 Canonical_Combining_Class property.
11182 # This one match table for it is needed for calculations on output
11183 my $default = $perl_ccc->add_match_table($ccc->default_map,
11184 Initialize => $ccc->table($ccc->default_map),
11185 Status => $SUPPRESSED);
11187 # Construct the Present_In property from the Age property.
11188 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11189 my $default_map = $age->default_map;
11190 my $in = Property->new('In',
11191 Default_Map => $default_map,
11192 Full_Name => "Present_In",
11193 Internal_Only_Warning => 1,
11194 Perl_Extension => 1,
11196 Initialize => $age,
11198 $in->add_comment(join_lines(<<END
11199 This file should not be used for any purpose. The values in this file are the
11200 same as for $age, and not for what $in really means. This is because anything
11201 defined in a given release should have multiple values: that release and all
11202 higher ones. But only one value per code point can be represented in a table
11207 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
11208 # lowest numbered (earliest) come first, with the non-numeric one
11210 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11212 : ($b->name !~ /^[\d.]*$/)
11214 : $a->name <=> $b->name
11217 # The Present_In property is the cumulative age properties. The first
11218 # one hence is identical to the first age one.
11219 my $previous_in = $in->add_match_table($first_age->name);
11220 $previous_in->set_equivalent_to($first_age, Related => 1);
11222 my $description_start = "Code point's usage introduced in version ";
11223 $first_age->add_description($description_start . $first_age->name);
11225 # To construct the accumlated values, for each of the age tables
11226 # starting with the 2nd earliest, merge the earliest with it, to get
11227 # all those code points existing in the 2nd earliest. Repeat merging
11228 # the new 2nd earliest with the 3rd earliest to get all those existing
11229 # in the 3rd earliest, and so on.
11230 foreach my $current_age (@rest_ages) {
11231 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
11233 my $current_in = $in->add_match_table(
11234 $current_age->name,
11235 Initialize => $current_age + $previous_in,
11236 Description => $description_start
11237 . $current_age->name
11240 $previous_in = $current_in;
11242 # Add clarifying material for the corresponding age file. This is
11243 # in part because of the confusing and contradictory information
11244 # given in the Standard's documentation itself, as of 5.2.
11245 $current_age->add_description(
11246 "Code point's usage was introduced in version "
11247 . $current_age->name);
11248 $current_age->add_note("See also $in");
11252 # And finally the code points whose usages have yet to be decided are
11253 # the same in both properties. Note that permanently unassigned code
11254 # points actually have their usage assigned (as being permanently
11255 # unassigned), so that these tables are not the same as gc=cn.
11256 my $unassigned = $in->add_match_table($default_map);
11257 my $age_default = $age->table($default_map);
11258 $age_default->add_description(<<END
11259 Code point's usage has not been assigned in any Unicode release thus far.
11262 $unassigned->set_equivalent_to($age_default, Related => 1);
11266 # Finished creating all the perl properties. All non-internal non-string
11267 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
11268 # an underscore.) These do not get a separate entry in the pod file
11269 foreach my $table ($perl->tables) {
11270 foreach my $alias ($table->aliases) {
11271 next if $alias->name =~ /^_/;
11272 $table->add_alias('Is_' . $alias->name,
11274 Status => $alias->status,
11275 Externally_Ok => 0);
11282 sub add_perl_synonyms() {
11283 # A number of Unicode tables have Perl synonyms that are expressed in
11284 # the single-form, \p{name}. These are:
11285 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11286 # \p{Is_Name} as synonyms
11287 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11288 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11289 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11290 # conflict, \p{Value} and \p{Is_Value} as well
11292 # This routine generates these synonyms, warning of any unexpected
11295 # Construct the list of tables to get synonyms for. Start with all the
11296 # binary and the General_Category ones.
11297 my @tables = grep { $_->type == $BINARY } property_ref('*');
11298 push @tables, $gc->tables;
11300 # If the version of Unicode includes the Script property, add its tables
11301 if (defined property_ref('Script')) {
11302 push @tables, property_ref('Script')->tables;
11305 # The Block tables are kept separate because they are treated differently.
11306 # And the earliest versions of Unicode didn't include them, so add only if
11309 push @blocks, $block->tables if defined $block;
11311 # Here, have the lists of tables constructed. Process blocks last so that
11312 # if there are name collisions with them, blocks have lowest priority.
11313 # Should there ever be other collisions, manual intervention would be
11314 # required. See the comments at the beginning of the program for a
11315 # possible way to handle those semi-automatically.
11316 foreach my $table (@tables, @blocks) {
11318 # For non-binary properties, the synonym is just the name of the
11319 # table, like Greek, but for binary properties the synonym is the name
11320 # of the property, and means the code points in its 'Y' table.
11321 my $nominal = $table;
11322 my $nominal_property = $nominal->property;
11324 if (! $nominal->isa('Property')) {
11329 # Here is a binary property. Use the 'Y' table. Verify that is
11331 my $yes = $nominal->table('Y');
11332 unless (defined $yes) { # Must be defined, but is permissible to
11334 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
11340 foreach my $alias ($nominal->aliases) {
11342 # Attempt to create a table in the perl directory for the
11343 # candidate table, using whatever aliases in it that don't
11344 # conflict. Also add non-conflicting aliases for all these
11345 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11347 foreach my $prefix ("", 'Is_', 'In_') {
11349 # Only Block properties can have added 'In_' aliases.
11350 next if $prefix eq 'In_' and $nominal_property != $block;
11352 my $proposed_name = $prefix . $alias->name;
11354 # No Is_Is, In_In, nor combinations thereof
11355 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11356 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11358 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11360 # Get a reference to any existing table in the perl
11361 # directory with the desired name.
11362 my $pre_existing = $perl->table($proposed_name);
11364 if (! defined $pre_existing) {
11366 # No name collision, so ok to add the perl synonym.
11368 my $make_pod_entry;
11370 my $status = $actual->status;
11371 if ($nominal_property == $block) {
11373 # For block properties, the 'In' form is preferred for
11374 # external use; the pod file contains wild cards for
11375 # this and the 'Is' form so no entries for those; and
11376 # we don't want people using the name without the
11377 # 'In', so discourage that.
11378 if ($prefix eq "") {
11379 $make_pod_entry = 1;
11380 $status = $status || $DISCOURAGED;
11381 $externally_ok = 0;
11383 elsif ($prefix eq 'In_') {
11384 $make_pod_entry = 0;
11385 $status = $status || $NORMAL;
11386 $externally_ok = 1;
11389 $make_pod_entry = 0;
11390 $status = $status || $DISCOURAGED;
11391 $externally_ok = 0;
11394 elsif ($prefix ne "") {
11396 # The 'Is' prefix is handled in the pod by a wild
11397 # card, and we won't use it for an external name
11398 $make_pod_entry = 0;
11399 $status = $status || $NORMAL;
11400 $externally_ok = 0;
11404 # Here, is an empty prefix, non block. This gets its
11405 # own pod entry and can be used for an external name.
11406 $make_pod_entry = 1;
11407 $status = $status || $NORMAL;
11408 $externally_ok = 1;
11411 # Here, there isn't a perl pre-existing table with the
11412 # name. Look through the list of equivalents of this
11413 # table to see if one is a perl table.
11414 foreach my $equivalent ($actual->leader->equivalents) {
11415 next if $equivalent->property != $perl;
11417 # Here, have found a table for $perl. Add this alias
11418 # to it, and are done with this prefix.
11419 $equivalent->add_alias($proposed_name,
11420 Pod_Entry => $make_pod_entry,
11422 Externally_Ok => $externally_ok);
11423 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11427 # Here, $perl doesn't already have a table that is a
11428 # synonym for this property, add one.
11429 my $added_table = $perl->add_match_table($proposed_name,
11430 Pod_Entry => $make_pod_entry,
11432 Externally_Ok => $externally_ok);
11433 # And it will be related to the actual table, since it is
11435 $added_table->set_equivalent_to($actual, Related => 1);
11436 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11438 } # End of no pre-existing.
11440 # Here, there is a pre-existing table that has the proposed
11441 # name. We could be in trouble, but not if this is just a
11442 # synonym for another table that we have already made a child
11443 # of the pre-existing one.
11444 if ($pre_existing->is_equivalent_to($actual)) {
11445 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11446 $pre_existing->add_alias($proposed_name);
11450 # Here, there is a name collision, but it still could be ok if
11451 # the tables match the identical set of code points, in which
11452 # case, we can combine the names. Compare each table's code
11453 # point list to see if they are identical.
11454 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11455 if ($pre_existing->matches_identically_to($actual)) {
11457 # Here, they do match identically. Not a real conflict.
11458 # Make the perl version a child of the Unicode one, except
11459 # in the non-obvious case of where the perl name is
11460 # already a synonym of another Unicode property. (This is
11461 # excluded by the test for it being its own parent.) The
11462 # reason for this exclusion is that then the two Unicode
11463 # properties become related; and we don't really know if
11464 # they are or not. We generate documentation based on
11465 # relatedness, and this would be misleading. Code
11466 # later executed in the process will cause the tables to
11467 # be represented by a single file anyway, without making
11468 # it look in the pod like they are necessarily related.
11469 if ($pre_existing->parent == $pre_existing
11470 && ($pre_existing->property == $perl
11471 || $actual->property == $perl))
11473 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11474 $pre_existing->set_equivalent_to($actual, Related => 1);
11476 elsif (main::DEBUG && $to_trace) {
11477 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11478 trace $pre_existing->parent;
11483 # Here they didn't match identically, there is a real conflict
11484 # between our new name and a pre-existing property.
11485 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11486 $pre_existing->add_conflicting($nominal->full_name,
11490 # Don't output a warning for aliases for the block
11491 # properties (unless they start with 'In_') as it is
11492 # expected that there will be conflicts and the block
11494 if ($verbosity >= $NORMAL_VERBOSITY
11495 && ($actual->property != $block || $prefix eq 'In_'))
11497 print simple_fold(join_lines(<<END
11498 There is already an alias named $proposed_name (from " . $pre_existing . "),
11499 so not creating this alias for " . $actual
11504 # Keep track for documentation purposes.
11505 $has_In_conflicts++ if $prefix eq 'In_';
11506 $has_Is_conflicts++ if $prefix eq 'Is_';
11511 # There are some properties which have No and Yes (and N and Y) as
11512 # property values, but aren't binary, and could possibly be confused with
11513 # binary ones. So create caveats for them. There are tables that are
11514 # named 'No', and tables that are named 'N', but confusion is not likely
11515 # unless they are the same table. For example, N meaning Number or
11516 # Neutral is not likely to cause confusion, so don't add caveats to things
11518 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11519 my $yes = $property->table('Yes');
11520 if (defined $yes) {
11521 my $y = $property->table('Y');
11522 if (defined $y && $yes == $y) {
11523 foreach my $alias ($property->aliases) {
11524 $yes->add_conflicting($alias->name);
11528 my $no = $property->table('No');
11530 my $n = $property->table('N');
11531 if (defined $n && $no == $n) {
11532 foreach my $alias ($property->aliases) {
11533 $no->add_conflicting($alias->name, 'P');
11542 sub register_file_for_name($$$) {
11543 # Given info about a table and a datafile that it should be associated
11544 # with, register that assocation
11547 my $directory_ref = shift; # Array of the directory path for the file
11548 my $file = shift; # The file name in the final directory, [-1].
11549 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11551 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11553 if ($table->isa('Property')) {
11554 $table->set_file_path(@$directory_ref, $file);
11555 push @map_properties, $table
11556 if $directory_ref->[0] eq $map_directory;
11560 # Do all of the work for all equivalent tables when called with the leader
11561 # table, so skip if isn't the leader.
11562 return if $table->leader != $table;
11564 # Join all the file path components together, using slashes.
11565 my $full_filename = join('/', @$directory_ref, $file);
11567 # All go in the same subdirectory of unicore
11568 if ($directory_ref->[0] ne $matches_directory) {
11569 Carp::my_carp("Unexpected directory in "
11570 . join('/', @{$directory_ref}, $file));
11573 # For this table and all its equivalents ...
11574 foreach my $table ($table, $table->equivalents) {
11576 # Associate it with its file internally. Don't include the
11577 # $matches_directory first component
11578 $table->set_file_path(@$directory_ref, $file);
11579 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11581 my $property = $table->property;
11582 $property = ($property == $perl)
11583 ? "" # 'perl' is never explicitly stated
11584 : standardize($property->name) . '=';
11586 my $deprecated = ($table->status eq $DEPRECATED)
11587 ? $table->status_info
11590 # And for each of the table's aliases... This inner loop eventually
11591 # goes through all aliases in the UCD that we generate regex match
11593 foreach my $alias ($table->aliases) {
11594 my $name = $alias->name;
11596 # Generate an entry in either the loose or strict hashes, which
11597 # will translate the property and alias names combination into the
11598 # file where the table for them is stored.
11600 if ($alias->loose_match) {
11601 $standard = $property . standardize($alias->name);
11602 if (exists $loose_to_file_of{$standard}) {
11603 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11606 $loose_to_file_of{$standard} = $sub_filename;
11610 $standard = lc ($property . $name);
11611 if (exists $stricter_to_file_of{$standard}) {
11612 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11615 $stricter_to_file_of{$standard} = $sub_filename;
11617 # Tightly coupled with how utf8_heavy.pl works, for a
11618 # floating point number that is a whole number, get rid of
11619 # the trailing decimal point and 0's, so that utf8_heavy
11620 # will work. Also note that this assumes that such a
11621 # number is matched strictly; so if that were to change,
11622 # this would be wrong.
11623 if ((my $integer_name = $name)
11624 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11626 $stricter_to_file_of{$property . $integer_name}
11632 # Keep a list of the deprecated properties and their filenames
11634 $utf8::why_deprecated{$sub_filename} = $deprecated;
11643 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
11645 my %full_dir_name_of; # Full length names of directories used.
11647 sub construct_filename($$$) {
11648 # Return a file name for a table, based on the table name, but perhaps
11649 # changed to get rid of non-portable characters in it, and to make
11650 # sure that it is unique on a file system that allows the names before
11651 # any period to be at most 8 characters (DOS). While we're at it
11652 # check and complain if there are any directory conflicts.
11654 my $name = shift; # The name to start with
11655 my $mutable = shift; # Boolean: can it be changed? If no, but
11656 # yet it must be to work properly, a warning
11658 my $directories_ref = shift; # A reference to an array containing the
11659 # path to the file, with each element one path
11660 # component. This is used because the same
11661 # name can be used in different directories.
11662 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11664 my $warn = ! defined wantarray; # If true, then if the name is
11665 # changed, a warning is issued as well.
11667 if (! defined $name) {
11668 Carp::my_carp("Undefined name in directory "
11669 . File::Spec->join(@$directories_ref)
11674 # Make sure that no directory names conflict with each other. Look at
11675 # each directory in the input file's path. If it is already in use,
11676 # assume it is correct, and is merely being re-used, but if we
11677 # truncate it to 8 characters, and find that there are two directories
11678 # that are the same for the first 8 characters, but differ after that,
11679 # then that is a problem.
11680 foreach my $directory (@$directories_ref) {
11681 my $short_dir = substr($directory, 0, 8);
11682 if (defined $full_dir_name_of{$short_dir}) {
11683 next if $full_dir_name_of{$short_dir} eq $directory;
11684 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
11687 $full_dir_name_of{$short_dir} = $directory;
11691 my $path = join '/', @$directories_ref;
11692 $path .= '/' if $path;
11694 # Remove interior underscores.
11695 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11697 # Change any non-word character into an underscore, and truncate to 8.
11698 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
11699 substr($filename, 8) = "" if length($filename) > 8;
11701 # Make sure the basename doesn't conflict with something we
11702 # might have already written. If we have, say,
11709 while (my $num = $base_names{$path}{lc $filename}++) {
11710 $num++; # so basenames with numbers start with '2', which
11711 # just looks more natural.
11713 # Want to append $num, but if it'll make the basename longer
11714 # than 8 characters, pre-truncate $filename so that the result
11716 my $delta = length($filename) + length($num) - 8;
11718 substr($filename, -$delta) = $num;
11723 if ($warn && ! $warned) {
11725 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
11729 return $filename if $mutable;
11731 # If not changeable, must return the input name, but warn if needed to
11732 # change it beyond shortening it.
11733 if ($name ne $filename
11734 && substr($name, 0, length($filename)) ne $filename) {
11735 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
11741 # The pod file contains a very large table. Many of the lines in that table
11742 # would exceed a typical output window's size, and so need to be wrapped with
11743 # a hanging indent to make them look good. The pod language is really
11744 # insufficient here. There is no general construct to do that in pod, so it
11745 # is done here by beginning each such line with a space to cause the result to
11746 # be output without formatting, and doing all the formatting here. This leads
11747 # to the result that if the eventual display window is too narrow it won't
11748 # look good, and if the window is too wide, no advantage is taken of that
11749 # extra width. A further complication is that the output may be indented by
11750 # the formatter so that there is less space than expected. What I (khw) have
11751 # done is to assume that that indent is a particular number of spaces based on
11752 # what it is in my Linux system; people can always resize their windows if
11753 # necessary, but this is obviously less than desirable, but the best that can
11755 my $automatic_pod_indent = 8;
11757 # Try to format so that uses fewest lines, but few long left column entries
11758 # slide into the right column. An experiment on 5.1 data yielded the
11759 # following percentages that didn't cut into the other side along with the
11760 # associated first-column widths
11762 # 80% not too bad except for a few blocks
11763 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11765 my $indent_info_column = 27; # 75% of lines didn't have overlap
11767 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
11768 # The 3 is because of:
11769 # 1 for the leading space to tell the pod formatter to
11772 # 1 for the space between the flag and the main data
11774 sub format_pod_line ($$$;$$) {
11775 # Take a pod line and return it, formatted properly
11777 my $first_column_width = shift;
11778 my $entry = shift; # Contents of left column
11779 my $info = shift; # Contents of right column
11781 my $status = shift || ""; # Any flag
11783 my $loose_match = shift; # Boolean.
11784 $loose_match = 1 unless defined $loose_match;
11786 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11789 $flags .= $STRICTER if ! $loose_match;
11791 $flags .= $status if $status;
11793 # There is a blank in the left column to cause the pod formatter to
11794 # output the line as-is.
11795 return sprintf " %-*s%-*s %s\n",
11796 # The first * in the format is replaced by this, the -1 is
11797 # to account for the leading blank. There isn't a
11798 # hard-coded blank after this to separate the flags from
11799 # the rest of the line, so that in the unlikely event that
11800 # multiple flags are shown on the same line, they both
11801 # will get displayed at the expense of that separation,
11802 # but since they are left justified, a blank will be
11803 # inserted in the normal case.
11807 # The other * in the format is replaced by this number to
11808 # cause the first main column to right fill with blanks.
11809 # The -1 is for the guaranteed blank following it.
11810 $first_column_width - $FILLER - 1,
11815 my @zero_match_tables; # List of tables that have no matches in this release
11817 sub make_table_pod_entries($) {
11818 # This generates the entries for the pod file for a given table.
11819 # Also done at this time are any children tables. The output looks like:
11820 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
11822 my $input_table = shift; # Table the entry is for
11823 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11825 # Generate parent and all its children at the same time.
11826 return if $input_table->parent != $input_table;
11828 my $property = $input_table->property;
11829 my $type = $property->type;
11830 my $full_name = $property->full_name;
11832 my $count = $input_table->count;
11833 my $string_count = clarify_number($count);
11834 my $status = $input_table->status;
11835 my $status_info = $input_table->status_info;
11837 my $entry_for_first_table; # The entry for the first table output.
11838 # Almost certainly, it is the parent.
11840 # For each related table (including itself), we will generate a pod entry
11841 # for each name each table goes by
11842 foreach my $table ($input_table, $input_table->children) {
11844 # utf8_heavy.pl cannot deal with null string property values, so don't
11846 next if $table->name eq "";
11848 # First, gather all the info that applies to this table as a whole.
11850 push @zero_match_tables, $table if $count == 0;
11852 my $table_property = $table->property;
11854 # The short name has all the underscores removed, while the full name
11855 # retains them. Later, we decide whether to output a short synonym
11856 # for the full one, we need to compare apples to apples, so we use the
11857 # short name's length including underscores.
11858 my $table_property_short_name_length;
11859 my $table_property_short_name
11860 = $table_property->short_name(\$table_property_short_name_length);
11861 my $table_property_full_name = $table_property->full_name;
11863 # Get how much savings there is in the short name over the full one
11864 # (delta will always be <= 0)
11865 my $table_property_short_delta = $table_property_short_name_length
11866 - length($table_property_full_name);
11867 my @table_description = $table->description;
11868 my @table_note = $table->note;
11870 # Generate an entry for each alias in this table.
11871 my $entry_for_first_alias; # saves the first one encountered.
11872 foreach my $alias ($table->aliases) {
11874 # Skip if not to go in pod.
11875 next unless $alias->make_pod_entry;
11877 # Start gathering all the components for the entry
11878 my $name = $alias->name;
11880 my $entry; # Holds the left column, may include extras
11881 my $entry_ref; # To refer to the left column's contents from
11882 # another entry; has no extras
11884 # First the left column of the pod entry. Tables for the $perl
11885 # property always use the single form.
11886 if ($table_property == $perl) {
11887 $entry = "\\p{$name}";
11888 $entry_ref = "\\p{$name}";
11890 else { # Compound form.
11892 # Only generate one entry for all the aliases that mean true
11893 # or false in binary properties. Append a '*' to indicate
11894 # some are missing. (The heading comment notes this.)
11895 my $wild_card_mark;
11896 if ($type == $BINARY) {
11897 next if $name ne 'N' && $name ne 'Y';
11898 $wild_card_mark = '*';
11901 $wild_card_mark = "";
11904 # Colon-space is used to give a little more space to be easier
11907 . $table_property_full_name
11908 . ": $name$wild_card_mark}";
11910 # But for the reference to this entry, which will go in the
11911 # right column, where space is at a premium, use equals
11913 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11916 # Then the right (info) column. This is stored as components of
11917 # an array for the moment, then joined into a string later. For
11918 # non-internal only properties, begin the info with the entry for
11919 # the first table we encountered (if any), as things are ordered
11920 # so that that one is the most descriptive. This leads to the
11921 # info column of an entry being a more descriptive version of the
11924 if ($name =~ /^_/) {
11926 '(For internal use by Perl, not necessarily stable)';
11928 elsif ($entry_for_first_alias) {
11929 push @info, $entry_for_first_alias;
11932 # If this entry is equivalent to another, add that to the info,
11933 # using the first such table we encountered
11934 if ($entry_for_first_table) {
11936 push @info, "(= $entry_for_first_table)";
11939 push @info, $entry_for_first_table;
11943 # If the name is a large integer, add an equivalent with an
11944 # exponent for better readability
11945 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11946 push @info, sprintf "(= %.1e)", $name
11949 my $parenthesized = "";
11950 if (! $entry_for_first_alias) {
11952 # This is the first alias for the current table. The alias
11953 # array is ordered so that this is the fullest, most
11954 # descriptive alias, so it gets the fullest info. The other
11955 # aliases are mostly merely pointers to this one, using the
11956 # information already added above.
11958 # Display any status message, but only on the parent table
11959 if ($status && ! $entry_for_first_table) {
11960 push @info, $status_info;
11963 # Put out any descriptive info
11964 if (@table_description || @table_note) {
11965 push @info, join "; ", @table_description, @table_note;
11968 # Look to see if there is a shorter name we can point people
11970 my $standard_name = standardize($name);
11972 my $proposed_short = $table->short_name;
11973 if (defined $proposed_short) {
11974 my $standard_short = standardize($proposed_short);
11976 # If the short name is shorter than the standard one, or
11977 # even it it's not, but the combination of it and its
11978 # short property name (as in \p{prop=short} ($perl doesn't
11979 # have this form)) saves at least two characters, then,
11980 # cause it to be listed as a shorter synonym.
11981 if (length $standard_short < length $standard_name
11982 || ($table_property != $perl
11983 && (length($standard_short)
11984 - length($standard_name)
11985 + $table_property_short_delta) # (<= 0)
11988 $short_name = $proposed_short;
11989 if ($table_property != $perl) {
11990 $short_name = $table_property_short_name
11993 $short_name = "\\p{$short_name}";
11997 # And if this is a compound form name, see if there is a
11998 # single form equivalent
12000 if ($table_property != $perl) {
12002 # Special case the binary N tables, so that will print
12003 # \P{single}, but use the Y table values to populate
12004 # 'single', as we haven't populated the N table.
12007 if ($type == $BINARY
12008 && $input_table == $property->table('No'))
12010 $test_table = $property->table('Yes');
12014 $test_table = $input_table;
12018 # Look for a single form amongst all the children.
12019 foreach my $table ($test_table->children) {
12020 next if $table->property != $perl;
12021 my $proposed_name = $table->short_name;
12022 next if ! defined $proposed_name;
12024 # Don't mention internal-only properties as a possible
12025 # single form synonym
12026 next if substr($proposed_name, 0, 1) eq '_';
12028 $proposed_name = "\\$p\{$proposed_name}";
12029 if (! defined $single_form
12030 || length($proposed_name) < length $single_form)
12032 $single_form = $proposed_name;
12034 # The goal here is to find a single form; not the
12035 # shortest possible one. We've already found a
12036 # short name. So, stop at the first single form
12037 # found, which is likely to be closer to the
12044 # Ouput both short and single in the same parenthesized
12045 # expression, but with only one of 'Single', 'Short' if there
12047 if ($short_name || $single_form || $table->conflicting) {
12048 $parenthesized .= '(';
12049 $parenthesized .= "Short: $short_name" if $short_name;
12050 if ($short_name && $single_form) {
12051 $parenthesized .= ', ';
12053 elsif ($single_form) {
12054 $parenthesized .= 'Single: ';
12056 $parenthesized .= $single_form if $single_form;
12061 # Warn if this property isn't the same as one that a
12062 # semi-casual user might expect. The other components of this
12063 # parenthesized structure are calculated only for the first entry
12064 # for this table, but the conflicting is deemed important enough
12065 # to go on every entry.
12066 my $conflicting = join " NOR ", $table->conflicting;
12067 if ($conflicting) {
12068 $parenthesized .= '(' if ! $parenthesized;
12069 $parenthesized .= '; ' if $parenthesized ne '(';
12070 $parenthesized .= "NOT $conflicting";
12072 $parenthesized .= ')' if $parenthesized;
12074 push @info, $parenthesized if $parenthesized;
12076 if ($table_property != $perl && $table->perl_extension) {
12077 push @info, '(Perl extension)';
12079 push @info, "($string_count)" if $output_range_counts;
12081 # Now, we have both the entry and info so add them to the
12082 # list of all the properties.
12083 push @match_properties,
12084 format_pod_line($indent_info_column,
12088 $alias->loose_match);
12090 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12091 } # End of looping through the aliases for this table.
12093 if (! $entry_for_first_table) {
12094 $entry_for_first_table = $entry_for_first_alias;
12096 } # End of looping through all the related tables
12100 sub pod_alphanumeric_sort {
12101 # Sort pod entries alphanumerically.
12103 # The first few character columns are filler, plus the '\p{'; and get rid
12104 # of all the trailing stuff, starting with the trailing '}', so as to sort
12105 # on just 'Name=Value'
12106 (my $a = lc $a) =~ s/^ .*? { //x;
12108 (my $b = lc $b) =~ s/^ .*? { //x;
12111 # Determine if the two operands are both internal only or both not.
12112 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12113 # should be the underscore that begins internal only
12114 my $a_is_internal = (substr($a, 0, 1) eq '_');
12115 my $b_is_internal = (substr($b, 0, 1) eq '_');
12117 # Sort so the internals come last in the table instead of first (which the
12118 # leading underscore would otherwise indicate).
12119 if ($a_is_internal != $b_is_internal) {
12120 return 1 if $a_is_internal;
12124 # Determine if the two operands are numeric property values or not.
12125 # A numeric property will look like xyz: 3. But the number
12126 # can begin with an optional minus sign, and may have a
12127 # fraction or rational component, like xyz: 3/2. If either
12128 # isn't numeric, use alphabetic sort.
12129 my ($a_initial, $a_number) =
12130 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12131 return $a cmp $b unless defined $a_number;
12132 my ($b_initial, $b_number) =
12133 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12134 return $a cmp $b unless defined $b_number;
12136 # Here they are both numeric, but use alphabetic sort if the
12137 # initial parts don't match
12138 return $a cmp $b if $a_initial ne $b_initial;
12140 # Convert rationals to floating for the comparison.
12141 $a_number = eval $a_number if $a_number =~ qr{/};
12142 $b_number = eval $b_number if $b_number =~ qr{/};
12144 return $a_number <=> $b_number;
12148 # Create the .pod file. This generates the various subsections and then
12149 # combines them in one big HERE document.
12151 return unless defined $pod_directory;
12152 print "Making pod file\n" if $verbosity >= $PROGRESS;
12154 my $exception_message =
12155 '(Any exceptions are individually noted beginning with the word NOT.)';
12157 if (-e 'Blocks.txt') {
12159 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
12160 # if the global $has_In_conflicts indicates we have them.
12161 push @match_properties, format_pod_line($indent_info_column,
12164 . (($has_In_conflicts)
12165 ? " $exception_message"
12167 @block_warning = << "END";
12169 Matches in the Block property have shortcuts that begin with 'In_'. For
12170 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward
12171 compatibility, if there is no conflict with another shortcut, these may also
12172 be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous
12173 such conflicting shortcuts. Use of these forms for Block is discouraged, and
12174 are flagged as such, not only because of the potential confusion as to what is
12175 meant, but also because a later release of Unicode may preempt the shortcut,
12176 and your program would no longer be correct. Use the 'In_' form instead to
12177 avoid this, or even more clearly, use the compound form, e.g.,
12178 \\p{blk:latin1}. See L<perlunicode/"Blocks"> for more information about this.
12181 my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12182 $text = "$exception_message $text" if $has_Is_conflicts;
12184 # And the 'Is_ line';
12185 push @match_properties, format_pod_line($indent_info_column,
12189 # Sort the properties array for output. It is sorted alphabetically
12190 # except numerically for numeric properties, and only output unique lines.
12191 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12193 my $formatted_properties = simple_fold(\@match_properties,
12195 # indent succeeding lines by two extra
12196 # which looks better
12197 $indent_info_column + 2,
12199 # shorten the line length by how much
12200 # the formatter indents, so the folded
12201 # line will fit in the space
12202 # presumably available
12203 $automatic_pod_indent);
12204 # Add column headings, indented to be a little more centered, but not
12206 $formatted_properties = format_pod_line($indent_info_column,
12210 . $formatted_properties;
12212 # Generate pod documentation lines for the tables that match nothing
12214 if (@zero_match_tables) {
12215 @zero_match_tables = uniques(@zero_match_tables);
12216 $zero_matches = join "\n\n",
12217 map { $_ = '=item \p{' . $_->complete_name . "}" }
12218 sort { $a->complete_name cmp $b->complete_name }
12219 uniques(@zero_match_tables);
12221 $zero_matches = <<END;
12223 =head2 Legal \\p{} and \\P{} constructs that match no characters
12225 Unicode has some property-value pairs that currently don't match anything.
12226 This happens generally either because they are obsolete, or for symmetry with
12227 other forms, but no language has yet been encoded that uses them. In this
12228 version of Unicode, the following match zero code points:
12239 # Generate list of properties that we don't accept, grouped by the reasons
12240 # why. This is so only put out the 'why' once, and then list all the
12241 # properties that have that reason under it.
12243 my %why_list; # The keys are the reasons; the values are lists of
12244 # properties that have the key as their reason
12246 # For each property, add it to the list that are suppressed for its reason
12247 # The sort will cause the alphabetically first properties to be added to
12248 # each list first, so each list will be sorted.
12249 foreach my $property (sort keys %why_suppressed) {
12250 push @{$why_list{$why_suppressed{$property}}}, $property;
12253 # For each reason (sorted by the first property that has that reason)...
12254 my @bad_re_properties;
12255 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12258 # Add to the output, all the properties that have that reason. Start
12259 # with an empty line.
12260 push @bad_re_properties, "\n\n";
12262 my $has_item = 0; # Flag if actually output anything.
12263 foreach my $name (@{$why_list{$why}}) {
12265 # Split compound names into $property and $table components
12266 my $property = $name;
12268 if ($property =~ / (.*) = (.*) /x) {
12273 # This release of Unicode may not have a property that is
12274 # suppressed, so don't reference a non-existent one.
12275 $property = property_ref($property);
12276 next if ! defined $property;
12278 # And since this list is only for match tables, don't list the
12279 # ones that don't have match tables.
12280 next if ! $property->to_create_match_tables;
12282 # Find any abbreviation, and turn it into a compound name if this
12283 # is a property=value pair.
12284 my $short_name = $property->name;
12285 $short_name .= '=' . $property->table($table)->name if $table;
12287 # And add the property as an item for the reason.
12288 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12292 # And add the reason under the list of properties, if such a list
12293 # actually got generated. Note that the header got added
12294 # unconditionally before. But pod ignores extra blank lines, so no
12296 push @bad_re_properties, "\n$why\n" if $has_item;
12298 } # End of looping through each reason.
12300 # Generate a list of the properties whose map table we output, from the
12301 # global @map_properties.
12302 my @map_tables_actually_output;
12303 my $info_indent = 20; # Left column is narrower than \p{} table.
12304 foreach my $property (@map_properties) {
12306 # Get the path to the file; don't output any not in the standard
12308 my @path = $property->file_path;
12309 next if $path[0] ne $map_directory;
12310 shift @path; # Remove the standard name
12312 my $file = join '/', @path; # In case is in sub directory
12313 my $info = $property->full_name;
12314 my $short_name = $property->name;
12315 if ($info ne $short_name) {
12316 $info .= " ($short_name)";
12318 foreach my $more_info ($property->description,
12320 $property->status_info)
12322 next unless $more_info;
12324 $info .= ". $more_info";
12326 push @map_tables_actually_output, format_pod_line($info_indent,
12329 $property->status);
12332 # Sort alphabetically, and fold for output
12333 @map_tables_actually_output = sort
12334 pod_alphanumeric_sort @map_tables_actually_output;
12335 @map_tables_actually_output
12336 = simple_fold(\@map_tables_actually_output,
12339 $automatic_pod_indent);
12341 # Generate a list of the formats that can appear in the map tables.
12342 my @map_table_formats;
12343 foreach my $format (sort keys %map_table_formats) {
12344 push @map_table_formats, " $format $map_table_formats{$format}\n";
12347 # Everything is ready to assemble.
12348 my @OUT = << "END";
12353 To change this file, edit $0 instead.
12359 $pod_file - Index of Unicode Version $string_version properties in Perl
12363 There are many properties in Unicode, and Perl provides access to almost all of
12364 them, as well as some additional extensions and short-cut synonyms.
12366 And just about all of the few that aren't accessible through the Perl
12367 core are accessible through the modules: Unicode::Normalize and
12368 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12370 This document merely lists all available properties and does not attempt to
12371 explain what each property really means. There is a brief description of each
12372 Perl extension. There is some detail about Blocks, Scripts, General_Category,
12373 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12374 Unicode properties, refer to the Unicode standard. A good starting place is
12375 L<$unicode_reference_url>. More information on the Perl extensions is in
12376 L<perlrecharclass>.
12378 Note that you can define your own properties; see
12379 L<perlunicode/"User-Defined Character Properties">.
12381 =head1 Properties accessible through \\p{} and \\P{}
12383 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12384 the Unicode character properties. The table below shows all these constructs,
12385 both single and compound forms.
12387 B<Compound forms> consist of two components, separated by an equals sign or a
12388 colon. The first component is the property name, and the second component is
12389 the particular value of the property to match against, for example,
12390 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12391 whose Script property is Greek.
12393 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12394 their equivalent compound forms. The table shows these equivalences. (In our
12395 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12396 There are also a few Perl-defined single forms that are not shortcuts for a
12397 compound form. One such is \\p{Word}. These are also listed in the table.
12399 In parsing these constructs, Perl always ignores Upper/lower case differences
12400 everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as
12401 '\\p{greek}'. But note that changing the case of the 'p' or 'P' before the
12402 left brace completely changes the meaning of the construct, from "match" (for
12403 '\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for
12404 improved legibility.
12406 Also, white space, hyphens, and underscores are also normally ignored
12407 everywhere between the {braces}, and hence can be freely added or removed
12408 even if the C</x> modifier hasn't been specified on the regular expression.
12409 But $a_bold_stricter at the beginning of an entry in the table below
12410 means that tighter (stricter) rules are used for that entry:
12414 =item Single form (\\p{name}) tighter rules:
12416 White space, hyphens, and underscores ARE significant
12421 =item * white space adjacent to a non-word character
12423 =item * underscores separating digits in numbers
12427 That means, for example, that you can freely add or remove white space
12428 adjacent to (but within) the braces without affecting the meaning.
12430 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12432 The tighter rules given above for the single form apply to everything to the
12433 right of the colon or equals; the looser rules still apply to everything to
12436 That means, for example, that you can freely add or remove white space
12437 adjacent to (but within) the braces and the colon or equal sign.
12441 Some properties are considered obsolete, but still available. There are
12442 several varieties of obsolesence:
12448 Properties marked with $a_bold_obsolete in the table are considered
12449 obsolete. At the time of this writing (Unicode version 5.2) there is no
12450 information in the Unicode standard about the implications of a property being
12455 Obsolete properties may be stabilized. This means that they are not actively
12456 maintained by Unicode, and will not be extended as new characters are added to
12457 the standard. Such properties are marked with $a_bold_stabilized in the
12458 table. At the time of this writing (Unicode version 5.2) there is no further
12459 information in the Unicode standard about the implications of a property being
12464 Obsolete properties may be deprecated. This means that their use is strongly
12465 discouraged, so much so that a warning will be issued if used, unless the
12466 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12467 statement. $A_bold_deprecated flags each such entry in the table, and
12468 the entry there for the longest, most descriptive version of the property will
12469 give the reason it is deprecated, and perhaps advice. Perl may issue such a
12470 warning, even for properties that aren't officially deprecated by Unicode,
12471 when there used to be characters or code points that were matched by them, but
12472 no longer. This is to warn you that your program may not work like it did on
12473 earlier Unicode releases.
12475 A deprecated property may be made unavailable in a future Perl version, so it
12476 is best to move away from them.
12480 Some Perl extensions are present for backwards compatibility and are
12481 discouraged from being used, but not obsolete. $A_bold_discouraged
12482 flags each such entry in the table.
12486 The table below has two columns. The left column contains the \\p{}
12487 constructs to look up, possibly preceeded by the flags mentioned above; and
12488 the right column contains information about them, like a description, or
12489 synonyms. It shows both the single and compound forms for each property that
12490 has them. If the left column is a short name for a property, the right column
12491 will give its longer, more descriptive name; and if the left column is the
12492 longest name, the right column will show any equivalent shortest name, in both
12493 single and compound forms if applicable.
12495 The right column will also caution you if a property means something different
12496 than what might normally be expected.
12498 All single forms are Perl extensions; a few compound forms are as well, and
12501 Numbers in (parentheses) indicate the total number of code points matched by
12502 the property. For emphasis, those properties that match no code points at all
12503 are listed as well in a separate section following the table.
12505 There is no description given for most non-Perl defined properties (See
12506 $unicode_reference_url for that).
12508 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12509 combinations. For example, entries like:
12511 \\p{Gc: *} \\p{General_Category: *}
12513 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12514 for the latter is also valid for the former. Similarly,
12518 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12519 \\p{IsFoo} are also valid and all mean the same thing. And similarly,
12520 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here
12521 is restricted to something not beginning with an underscore.
12523 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12524 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
12525 'N*' to indicate this, and doesn't have separate entries for the other
12526 possibilities. Note that not all properties which have values 'Yes' and 'No'
12527 are binary, and they have all their values spelled out without using this wild
12528 card, and a C<NOT> clause in their description that highlights their not being
12529 binary. These also require the compound form to match them, whereas true
12530 binary properties have both single and compound forms available.
12532 Note that all non-essential underscores are removed in the display of the
12539 =item B<*> is a wild-card
12541 =item B<(\\d+)> in the info column gives the number of code points matched by
12544 =item B<$DEPRECATED> means this is deprecated.
12546 =item B<$OBSOLETE> means this is obsolete.
12548 =item B<$STABILIZED> means this is stabilized.
12550 =item B<$STRICTER> means tighter (stricter) name matching applies.
12552 =item B<$DISCOURAGED> means use of this form is discouraged.
12556 $formatted_properties
12560 =head1 Properties not accessible through \\p{} and \\P{}
12562 A few properties are accessible in Perl via various function calls only.
12564 Lowercase_Mapping lc() and lcfirst()
12565 Titlecase_Mapping ucfirst()
12566 Uppercase_Mapping uc()
12568 Case_Folding is accessible through the /i modifier in regular expressions.
12570 The Name property is accessible through the \\N{} interpolation in
12571 double-quoted strings and regular expressions, but both usages require a C<use
12572 charnames;> to be specified, which also contains related functions viacode()
12575 =head1 Unicode regular expression properties that are NOT accepted by Perl
12577 Perl will generate an error for a few character properties in Unicode when
12578 used in a regular expression. The non-Unihan ones are listed below, with the
12579 reasons they are not accepted, perhaps with work-arounds. The short names for
12580 the properties are listed enclosed in (parentheses).
12588 An installation can choose to allow any of these to be matched by changing the
12589 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12590 and then re-running F<$0>. (C<\%Config> is available from the Config module).
12592 =head1 Files in the I<To> directory (for serious hackers only)
12594 All Unicode properties are really mappings (in the mathematical sense) from
12595 code points to their respective values. As part of its build process,
12596 Perl constructs tables containing these mappings for all properties that it
12597 deals with. But only a few of these are written out into files.
12598 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12599 (%Config is available from the Config module).
12601 Those ones written are ones needed by Perl internally during execution, or for
12602 which there is some demand, and those for which there is no access through the
12603 Perl core. Generally, properties that can be used in regular expression
12604 matching do not have their map tables written, like Script. Nor are the
12605 simplistic properties that have a better, more complete version, such as
12606 Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
12608 None of the properties in the I<To> directory are currently directly
12609 accessible through the Perl core, although some may be accessed indirectly.
12610 For example, the uc() function implements the Uppercase_Mapping property and
12611 uses the F<Upper.pl> file found in this directory.
12613 The available files with their properties (short names in parentheses),
12614 and any flags or comments about them, are:
12616 @map_tables_actually_output
12618 An installation can choose to change which files are generated by changing the
12619 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12620 and then re-running F<$0>.
12622 Each of these files defines two hash entries to help reading programs decipher
12623 it. One of them looks like this:
12625 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
12627 where 'NAME' is a name to indicate the property. For backwards compatibility,
12628 this is not necessarily the property's official Unicode name. (The 'To' is
12629 also for backwards compatibility.) The hash entry gives the format of the
12630 mapping fields of the table, currently one of the following:
12634 This format applies only to the entries in the main body of the table.
12635 Entries defined in hashes or ones that are missing from the list can have a
12638 The value that the missing entries have is given by the other SwashInfo hash
12639 entry line; it looks like this:
12641 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
12643 This example line says that any Unicode code points not explicitly listed in
12644 the file have the value 'NaN' under the property indicated by NAME. If the
12645 value is the special string C<< <code point> >>, it means that the value for
12646 any missing code point is the code point itself. This happens, for example,
12647 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12648 character 'A', are missing because the uppercase of 'A' is itself.
12652 L<$unicode_reference_url>
12661 main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12665 sub make_Heavy () {
12666 # Create and write Heavy.pl, which passes info about the tables to
12673 # This file is for the use of utf8_heavy.pl
12675 # Maps property names in loose standard form to its standard name
12676 \%utf8::loose_property_name_of = (
12679 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12680 push @heavy, <<END;
12683 # Maps property, table to file for those using stricter matching
12684 \%utf8::stricter_to_file_of = (
12686 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12687 push @heavy, <<END;
12690 # Maps property, table to file for those using loose matching
12691 \%utf8::loose_to_file_of = (
12693 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12694 push @heavy, <<END;
12697 # Maps floating point to fractional form
12698 \%utf8::nv_floating_to_rational = (
12700 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12701 push @heavy, <<END;
12704 # If a floating point number doesn't have enough digits in it to get this
12705 # close to a fraction, it isn't considered to be that fraction even if all the
12706 # digits it does have match.
12707 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12709 # Deprecated tables to generate a warning for. The key is the file containing
12710 # the table, so as to avoid duplication, as many property names can map to the
12711 # file, but we only need one entry for all of them.
12712 \%utf8::why_deprecated = (
12715 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12716 push @heavy, <<END;
12722 main::write("Heavy.pl", @heavy);
12726 sub write_all_tables() {
12727 # Write out all the tables generated by this program to files, as well as
12728 # the supporting data structures, pod file, and .t file.
12730 my @writables; # List of tables that actually get written
12731 my %match_tables_to_write; # Used to collapse identical match tables
12732 # into one file. Each key is a hash function
12733 # result to partition tables into buckets.
12734 # Each value is an array of the tables that
12735 # fit in the bucket.
12737 # For each property ...
12738 # (sort so that if there is an immutable file name, it has precedence, so
12739 # some other property can't come in and take over its file name. If b's
12740 # file name is defined, will return 1, meaning to take it first; don't
12741 # care if both defined, as they had better be different anyway)
12743 foreach my $property (sort { defined $b->file } property_ref('*')) {
12744 my $type = $property->type;
12746 # And for each table for that property, starting with the mapping
12749 foreach my $table($property,
12751 # and all the match tables for it (if any), sorted so
12752 # the ones with the shortest associated file name come
12753 # first. The length sorting prevents problems of a
12754 # longer file taking a name that might have to be used
12755 # by a shorter one. The alphabetic sorting prevents
12756 # differences between releases
12757 sort { my $ext_a = $a->external_name;
12758 return 1 if ! defined $ext_a;
12759 my $ext_b = $b->external_name;
12760 return -1 if ! defined $ext_b;
12761 my $cmp = length $ext_a <=> length $ext_b;
12763 # Return result if lengths not equal
12764 return $cmp if $cmp;
12766 # Alphabetic if lengths equal
12767 return $ext_a cmp $ext_b
12768 } $property->tables
12772 # Here we have a table associated with a property. It could be
12773 # the map table (done first for each property), or one of the
12774 # other tables. Determine which type.
12775 my $is_property = $table->isa('Property');
12777 my $name = $table->name;
12778 my $complete_name = $table->complete_name;
12780 # See if should suppress the table if is empty, but warn if it
12781 # contains something.
12782 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12783 keys %why_suppress_if_empty_warn_if_not;
12785 # Calculate if this table should have any code points associated
12787 my $expected_empty =
12789 # $perl should be empty, as well as properties that we just
12790 # don't do anything with
12792 && ($table == $perl
12793 || grep { $complete_name eq $_ }
12794 @unimplemented_properties
12798 # Match tables in properties we skipped populating should be
12800 || (! $is_property && ! $property->to_create_match_tables)
12802 # Tables and properties that are expected to have no code
12803 # points should be empty
12804 || $suppress_if_empty_warn_if_not
12807 # Set a boolean if this table is the complement of an empty binary
12809 my $is_complement_of_empty_binary =
12810 $type == $BINARY &&
12811 (($table == $property->table('Y')
12812 && $property->table('N')->is_empty)
12813 || ($table == $property->table('N')
12814 && $property->table('Y')->is_empty));
12817 # Some tables should match everything
12818 my $expected_full =
12820 ? # All these types of map tables will be full because
12821 # they will have been populated with defaults
12822 ($type == $ENUM || $type == $BINARY)
12824 : # A match table should match everything if its method
12826 ($table->matches_all
12828 # The complement of an empty binary table will match
12830 || $is_complement_of_empty_binary
12834 if ($table->is_empty) {
12837 if ($suppress_if_empty_warn_if_not) {
12838 $table->set_status($SUPPRESSED,
12839 $why_suppress_if_empty_warn_if_not{$complete_name});
12842 # Suppress expected empty tables.
12843 next TABLE if $expected_empty;
12845 # And setup to later output a warning for those that aren't
12846 # known to be allowed to be empty. Don't do the warning if
12847 # this table is a child of another one to avoid duplicating
12848 # the warning that should come from the parent one.
12849 if (($table == $property || $table->parent == $table)
12850 && $table->status ne $SUPPRESSED
12851 && ! grep { $complete_name =~ /^$_$/ }
12852 @tables_that_may_be_empty)
12854 push @unhandled_properties, "$table";
12857 elsif ($expected_empty) {
12859 if ($suppress_if_empty_warn_if_not) {
12860 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12863 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
12866 my $count = $table->count;
12867 if ($expected_full) {
12868 if ($count != $MAX_UNICODE_CODEPOINTS) {
12869 Carp::my_carp("$table matches only "
12870 . clarify_number($count)
12871 . " Unicode code points but should match "
12872 . clarify_number($MAX_UNICODE_CODEPOINTS)
12874 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12875 . "). Proceeding anyway.");
12878 # Here is expected to be full. If it is because it is the
12879 # complement of an (empty) binary table that is to be
12880 # suppressed, then suppress this one as well.
12881 if ($is_complement_of_empty_binary) {
12882 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12883 my $opposing = $property->table($opposing_name);
12884 my $opposing_status = $opposing->status;
12885 if ($opposing_status) {
12886 $table->set_status($opposing_status,
12887 $opposing->status_info);
12891 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12892 if ($table == $property || $table->leader == $table) {
12893 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
12897 if ($table->status eq $SUPPRESSED) {
12898 if (! $is_property) {
12899 my @children = $table->children;
12900 foreach my $child (@children) {
12901 if ($child->status ne $SUPPRESSED) {
12902 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12909 if (! $is_property) {
12911 # Several things need to be done just once for each related
12912 # group of match tables. Do them on the parent.
12913 if ($table->parent == $table) {
12915 # Add an entry in the pod file for the table; it also does
12917 make_table_pod_entries($table) if defined $pod_directory;
12919 # See if the the table matches identical code points with
12920 # something that has already been output. In that case,
12921 # no need to have two files with the same code points in
12922 # them. We use the table's hash() method to store these
12923 # in buckets, so that it is quite likely that if two
12924 # tables are in the same bucket they will be identical, so
12925 # don't have to compare tables frequently. The tables
12926 # have to have the same status to share a file, so add
12927 # this to the bucket hash. (The reason for this latter is
12928 # that Heavy.pl associates a status with a file.)
12929 my $hash = $table->hash . ';' . $table->status;
12931 # Look at each table that is in the same bucket as this
12933 foreach my $comparison (@{$match_tables_to_write{$hash}})
12935 if ($table->matches_identically_to($comparison)) {
12936 $table->set_equivalent_to($comparison,
12942 # Here, not equivalent, add this table to the bucket.
12943 push @{$match_tables_to_write{$hash}}, $table;
12948 # Here is the property itself.
12949 # Don't write out or make references to the $perl property
12950 next if $table == $perl;
12952 if ($type != $STRING) {
12954 # There is a mapping stored of the various synonyms to the
12955 # standardized name of the property for utf8_heavy.pl.
12956 # Also, the pod file contains entries of the form:
12957 # \p{alias: *} \p{full: *}
12958 # rather than show every possible combination of things.
12960 my @property_aliases = $property->aliases;
12962 # The full name of this property is stored by convention
12963 # first in the alias array
12964 my $full_property_name =
12965 '\p{' . $property_aliases[0]->name . ': *}';
12966 my $standard_property_name = standardize($table->name);
12968 # For each synonym ...
12969 for my $i (0 .. @property_aliases - 1) {
12970 my $alias = $property_aliases[$i];
12971 my $alias_name = $alias->name;
12972 my $alias_standard = standardize($alias_name);
12974 # Set the mapping for utf8_heavy of the alias to the
12976 if (exists ($loose_property_name_of{$alias_standard}))
12978 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
12981 $loose_property_name_of{$alias_standard}
12982 = $standard_property_name;
12985 # Now for the pod entry for this alias. Skip if not
12986 # outputting a pod; skip the first one, which is the
12987 # full name so won't have an entry like: '\p{full: *}
12988 # \p{full: *}', and skip if don't want an entry for
12991 || ! defined $pod_directory
12992 || ! $alias->make_pod_entry;
12994 my $rhs = $full_property_name;
12995 if ($property != $perl && $table->perl_extension) {
12996 $rhs .= ' (Perl extension)';
12998 push @match_properties,
12999 format_pod_line($indent_info_column,
13000 '\p{' . $alias->name . ': *}',
13004 } # End of non-string-like property code
13007 # Don't output a mapping file if not desired.
13008 next if ! $property->to_output_map;
13011 # Here, we know we want to write out the table, but don't do it
13012 # yet because there may be other tables that come along and will
13013 # want to share the file, and the file's comments will change to
13014 # mention them. So save for later.
13015 push @writables, $table;
13017 } # End of looping through the property and all its tables.
13018 } # End of looping through all properties.
13020 # Now have all the tables that will have files written for them. Do it.
13021 foreach my $table (@writables) {
13024 my $property = $table->property;
13025 my $is_property = ($table == $property);
13026 if (! $is_property) {
13028 # Match tables for the property go in lib/$subdirectory, which is
13029 # the property's name. Don't use the standard file name for this,
13030 # as may get an unfamiliar alias
13031 @directory = ($matches_directory, $property->external_name);
13035 @directory = $table->directory;
13036 $filename = $table->file;
13039 # Use specified filename if avaliable, or default to property's
13040 # shortest name. We need an 8.3 safe filename (which means "an 8
13041 # safe" filename, since after the dot is only 'pl', which is < 3)
13042 # The 2nd parameter is if the filename shouldn't be changed, and
13043 # it shouldn't iff there is a hard-coded name for this table.
13044 $filename = construct_filename(
13045 $filename || $table->external_name,
13046 ! $filename, # mutable if no filename
13049 register_file_for_name($table, \@directory, $filename);
13051 # Only need to write one file when shared by more than one
13053 next if ! $is_property && $table->leader != $table;
13055 # Construct a nice comment to add to the file
13056 $table->set_final_comment;
13062 # Write out the pod file
13068 make_property_test_script() if $make_test_script;
13072 my @white_space_separators = ( # This used only for making the test script.
13079 sub generate_separator($) {
13080 # This used only for making the test script. It generates the colon or
13081 # equal separator between the property and property value, with random
13082 # white space surrounding the separator
13086 return "" if $lhs eq ""; # No separator if there's only one (the r) side
13088 # Choose space before and after randomly
13089 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13090 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
13092 # And return the whole complex, half the time using a colon, half the
13094 return $spaces_before
13095 . (rand() < 0.5) ? '=' : ':'
13099 sub generate_tests($$$$$$) {
13100 # This used only for making the test script. It generates test cases that
13101 # are expected to compile successfully in perl. Note that the lhs and
13102 # rhs are assumed to already be as randomized as the caller wants.
13104 my $file_handle = shift; # Where to output the tests
13105 my $lhs = shift; # The property: what's to the left of the colon
13106 # or equals separator
13107 my $rhs = shift; # The property value; what's to the right
13108 my $valid_code = shift; # A code point that's known to be in the
13109 # table given by lhs=rhs; undef if table is
13111 my $invalid_code = shift; # A code point known to not be in the table;
13112 # undef if the table is all code points
13113 my $warning = shift;
13115 # Get the colon or equal
13116 my $separator = generate_separator($lhs);
13118 # The whole 'property=value'
13119 my $name = "$lhs$separator$rhs";
13121 # Create a complete set of tests, with complements.
13122 if (defined $valid_code) {
13123 printf $file_handle
13124 qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
13125 printf $file_handle
13126 qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
13127 printf $file_handle
13128 qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
13129 printf $file_handle
13130 qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
13132 if (defined $invalid_code) {
13133 printf $file_handle
13134 qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
13135 printf $file_handle
13136 qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
13137 printf $file_handle
13138 qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
13139 printf $file_handle
13140 qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
13145 sub generate_error($$$$) {
13146 # This used only for making the test script. It generates test cases that
13147 # are expected to not only not match, but to be syntax or similar errors
13149 my $file_handle = shift; # Where to output to.
13150 my $lhs = shift; # The property: what's to the left of the
13151 # colon or equals separator
13152 my $rhs = shift; # The property value; what's to the right
13153 my $already_in_error = shift; # Boolean; if true it's known that the
13154 # unmodified lhs and rhs will cause an error.
13155 # This routine should not force another one
13156 # Get the colon or equal
13157 my $separator = generate_separator($lhs);
13159 # Since this is an error only, don't bother to randomly decide whether to
13160 # put the error on the left or right side; and assume that the rhs is
13161 # loosely matched, again for convenience rather than rigor.
13162 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13164 my $property = $lhs . $separator . $rhs;
13166 print $file_handle qq/Error('\\p{$property}');\n/;
13167 print $file_handle qq/Error('\\P{$property}');\n/;
13171 # These are used only for making the test script
13172 # XXX Maybe should also have a bad strict seps, which includes underscore.
13174 my @good_loose_seps = (
13181 my @bad_loose_seps = (
13186 sub randomize_stricter_name {
13187 # This used only for making the test script. Take the input name and
13188 # return a randomized, but valid version of it under the stricter matching
13192 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13194 # If the name looks like a number (integer, floating, or rational), do
13196 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13199 my $separator = $3;
13201 # If there isn't a sign, part of the time add a plus
13202 # Note: Not testing having any denominator having a minus sign
13204 $sign = '+' if rand() <= .3;
13207 # And add 0 or more leading zeros.
13208 $name = $sign . ('0' x int rand(10)) . $number;
13210 if (defined $separator) {
13211 my $extra_zeros = '0' x int rand(10);
13213 if ($separator eq '.') {
13215 # Similarly, add 0 or more trailing zeros after a decimal
13217 $name .= $extra_zeros;
13221 # Or, leading zeros before the denominator
13222 $name =~ s,/,/$extra_zeros,;
13227 # For legibility of the test, only change the case of whole sections at a
13228 # time. To do this, first split into sections. The split returns the
13231 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13232 trace $section if main::DEBUG && $to_trace;
13234 if (length $section > 1 && $section !~ /\D/) {
13236 # If the section is a sequence of digits, about half the time
13237 # randomly add underscores between some of them.
13240 # Figure out how many underscores to add. max is 1 less than
13241 # the number of digits. (But add 1 at the end to make sure
13242 # result isn't 0, and compensate earlier by subtracting 2
13244 my $num_underscores = int rand(length($section) - 2) + 1;
13246 # And add them evenly throughout, for convenience, not rigor
13248 my $spacing = (length($section) - 1)/ $num_underscores;
13249 my $temp = $section;
13251 for my $i (1 .. $num_underscores) {
13252 $section .= substr($temp, 0, $spacing, "") . '_';
13256 push @sections, $section;
13260 # Here not a sequence of digits. Change the case of the section
13262 my $switch = int rand(4);
13263 if ($switch == 0) {
13264 push @sections, uc $section;
13266 elsif ($switch == 1) {
13267 push @sections, lc $section;
13269 elsif ($switch == 2) {
13270 push @sections, ucfirst $section;
13273 push @sections, $section;
13277 trace "returning", join "", @sections if main::DEBUG && $to_trace;
13278 return join "", @sections;
13281 sub randomize_loose_name($;$) {
13282 # This used only for making the test script
13285 my $want_error = shift; # if true, make an error
13286 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13288 $name = randomize_stricter_name($name);
13291 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13292 for my $part (split /[-\s_]+/, $name) {
13294 if ($want_error and rand() < 0.3) {
13295 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13299 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13302 push @parts, $part;
13304 my $new = join("", @parts);
13305 trace "$name => $new" if main::DEBUG && $to_trace;
13308 if (rand() >= 0.5) {
13309 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13312 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13318 # Used to make sure don't generate duplicate test cases.
13319 my %test_generated;
13321 sub make_property_test_script() {
13322 # This used only for making the test script
13323 # this written directly -- it's huge.
13325 print "Making test script\n" if $verbosity >= $PROGRESS;
13327 # This uses randomness to test different possibilities without testing all
13328 # possibilities. To ensure repeatability, set the seed to 0. But if
13329 # tests are added, it will perturb all later ones in the .t file
13332 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13334 force_unlink ($t_path);
13335 push @files_actually_output, $t_path;
13337 if (not open $OUT, "> $t_path") {
13338 Carp::my_carp("Can't open $t_path. Skipping: $!");
13342 # Keep going down an order of magnitude
13343 # until find that adding this quantity to
13344 # 1 remains 1; but put an upper limit on
13345 # this so in case this algorithm doesn't
13346 # work properly on some platform, that we
13347 # won't loop forever.
13349 my $min_floating_slop = 1;
13350 while (1+ $min_floating_slop != 1
13353 my $next = $min_floating_slop / 10;
13354 last if $next == 0; # If underflows,
13356 $min_floating_slop = $next;
13358 print $OUT $HEADER, <DATA>;
13360 foreach my $property (property_ref('*')) {
13361 foreach my $table ($property->tables) {
13363 # Find code points that match, and don't match this table.
13364 my $valid = $table->get_valid_code_point;
13365 my $invalid = $table->get_invalid_code_point;
13366 my $warning = ($table->status eq $DEPRECATED)
13370 # Test each possible combination of the property's aliases with
13371 # the table's. If this gets to be too many, could do what is done
13372 # in the set_final_comment() for Tables
13373 my @table_aliases = $table->aliases;
13374 my @property_aliases = $table->property->aliases;
13375 my $max = max(scalar @table_aliases, scalar @property_aliases);
13376 for my $j (0 .. $max - 1) {
13378 # The current alias for property is the next one on the list,
13379 # or if beyond the end, start over. Similarly for table
13381 = $property_aliases[$j % @property_aliases]->name;
13383 $property_name = "" if $table->property == $perl;
13384 my $table_alias = $table_aliases[$j % @table_aliases];
13385 my $table_name = $table_alias->name;
13386 my $loose_match = $table_alias->loose_match;
13388 # If the table doesn't have a file, any test for it is
13389 # already guaranteed to be in error
13390 my $already_error = ! $table->file_path;
13392 # Generate error cases for this alias.
13393 generate_error($OUT,
13398 # If the table is guaranteed to always generate an error,
13399 # quit now without generating success cases.
13400 next if $already_error;
13402 # Now for the success cases.
13404 if ($loose_match) {
13406 # For loose matching, create an extra test case for the
13408 my $standard = standardize($table_name);
13410 # $test_name should be a unique combination for each test
13411 # case; used just to avoid duplicate tests
13412 my $test_name = "$property_name=$standard";
13414 # Don't output duplicate test cases.
13415 if (! exists $test_generated{$test_name}) {
13416 $test_generated{$test_name} = 1;
13417 generate_tests($OUT,
13425 $random = randomize_loose_name($table_name)
13427 else { # Stricter match
13428 $random = randomize_stricter_name($table_name);
13431 # Now for the main test case for this alias.
13432 my $test_name = "$property_name=$random";
13433 if (! exists $test_generated{$test_name}) {
13434 $test_generated{$test_name} = 1;
13435 generate_tests($OUT,
13443 # If the name is a rational number, add tests for the
13444 # floating point equivalent.
13445 if ($table_name =~ qr{/}) {
13447 # Calculate the float, and find just the fraction.
13448 my $float = eval $table_name;
13449 my ($whole, $fraction)
13450 = $float =~ / (.*) \. (.*) /x;
13452 # Starting with one digit after the decimal point,
13453 # create a test for each possible precision (number of
13454 # digits past the decimal point) until well beyond the
13455 # native number found on this machine. (If we started
13456 # with 0 digits, it would be an integer, which could
13457 # well match an unrelated table)
13459 for my $i (1 .. $min_floating_slop + 3) {
13460 my $table_name = sprintf("%.*f", $i, $float);
13461 if ($i < $MIN_FRACTION_LENGTH) {
13463 # If the test case has fewer digits than the
13464 # minimum acceptable precision, it shouldn't
13465 # succeed, so we expect an error for it.
13466 # E.g., 2/3 = .7 at one decimal point, and we
13467 # shouldn't say it matches .7. We should make
13468 # it be .667 at least before agreeing that the
13469 # intent was to match 2/3. But at the
13470 # less-than- acceptable level of precision, it
13471 # might actually match an unrelated number.
13472 # So don't generate a test case if this
13473 # conflating is possible. In our example, we
13474 # don't want 2/3 matching 7/10, if there is
13475 # a 7/10 code point.
13477 (keys %nv_floating_to_rational)
13480 if abs($table_name - $existing)
13481 < $MAX_FLOATING_SLOP;
13483 generate_error($OUT,
13486 1 # 1 => already an error
13491 # Here the number of digits exceeds the
13492 # minimum we think is needed. So generate a
13493 # success test case for it.
13494 generate_tests($OUT,
13509 foreach my $test (@backslash_X_tests) {
13510 print $OUT "Test_X('$test');\n";
13513 print $OUT "Finished();\n";
13518 # This is a list of the input files and how to handle them. The files are
13519 # processed in their order in this list. Some reordering is possible if
13520 # desired, but the v0 files should be first, and the extracted before the
13521 # others except DAge.txt (as data in an extracted file can be over-ridden by
13522 # the non-extracted. Some other files depend on data derived from an earlier
13523 # file, like UnicodeData requires data from Jamo, and the case changing and
13524 # folding requires data from Unicode. Mostly, it safest to order by first
13525 # version releases in (except the Jamo). DAge.txt is read before the
13526 # extracted ones because of the rarely used feature $compare_versions. In the
13527 # unlikely event that there were ever an extracted file that contained the Age
13528 # property information, it would have to go in front of DAge.
13530 # The version strings allow the program to know whether to expect a file or
13531 # not, but if a file exists in the directory, it will be processed, even if it
13532 # is in a version earlier than expected, so you can copy files from a later
13533 # release into an earlier release's directory.
13534 my @input_file_objects = (
13535 Input_file->new('PropertyAliases.txt', v0,
13536 Handler => \&process_PropertyAliases,
13538 Input_file->new(undef, v0, # No file associated with this
13539 Progress_Message => 'Finishing property setup',
13540 Handler => \&finish_property_setup,
13542 Input_file->new('PropValueAliases.txt', v0,
13543 Handler => \&process_PropValueAliases,
13544 Has_Missings_Defaults => $NOT_IGNORED,
13546 Input_file->new('DAge.txt', v3.2.0,
13547 Has_Missings_Defaults => $NOT_IGNORED,
13550 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13551 Property => 'General_Category',
13553 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13554 Property => 'Canonical_Combining_Class',
13555 Has_Missings_Defaults => $NOT_IGNORED,
13557 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13558 Property => 'Numeric_Type',
13559 Has_Missings_Defaults => $NOT_IGNORED,
13561 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13562 Property => 'East_Asian_Width',
13563 Has_Missings_Defaults => $NOT_IGNORED,
13565 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13566 Property => 'Line_Break',
13567 Has_Missings_Defaults => $NOT_IGNORED,
13569 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13570 Property => 'Bidi_Class',
13571 Has_Missings_Defaults => $NOT_IGNORED,
13573 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13574 Property => 'Decomposition_Type',
13575 Has_Missings_Defaults => $NOT_IGNORED,
13577 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13578 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13579 Property => 'Numeric_Value',
13580 Each_Line_Handler => \&filter_numeric_value_line,
13581 Has_Missings_Defaults => $NOT_IGNORED,
13583 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13584 Property => 'Joining_Group',
13585 Has_Missings_Defaults => $NOT_IGNORED,
13588 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13589 Property => 'Joining_Type',
13590 Has_Missings_Defaults => $NOT_IGNORED,
13592 Input_file->new('Jamo.txt', v2.0.0,
13593 Property => 'Jamo_Short_Name',
13594 Each_Line_Handler => \&filter_jamo_line,
13596 Input_file->new('UnicodeData.txt', v1.1.5,
13597 Pre_Handler => \&setup_UnicodeData,
13599 # We clean up this file for some early versions.
13600 Each_Line_Handler => [ (($v_version lt v2.0.0 )
13602 : ($v_version eq v2.1.5)
13603 ? \&filter_v2_1_5_ucd
13606 # And the main filter
13607 \&filter_UnicodeData_line,
13609 EOF_Handler => \&EOF_UnicodeData,
13611 Input_file->new('ArabicShaping.txt', v2.0.0,
13612 Each_Line_Handler =>
13613 [ ($v_version lt 4.1.0)
13614 ? \&filter_old_style_arabic_shaping
13616 \&filter_arabic_shaping_line,
13618 Has_Missings_Defaults => $NOT_IGNORED,
13620 Input_file->new('Blocks.txt', v2.0.0,
13621 Property => 'Block',
13622 Has_Missings_Defaults => $NOT_IGNORED,
13623 Each_Line_Handler => \&filter_blocks_lines
13625 Input_file->new('PropList.txt', v2.0.0,
13626 Each_Line_Handler => (($v_version lt v3.1.0)
13627 ? \&filter_old_style_proplist
13630 Input_file->new('Unihan.txt', v2.0.0,
13631 Pre_Handler => \&setup_unihan,
13633 Each_Line_Handler => \&filter_unihan_line,
13635 Input_file->new('SpecialCasing.txt', v2.1.8,
13636 Each_Line_Handler => \&filter_special_casing_line,
13637 Pre_Handler => \&setup_special_casing,
13640 'LineBreak.txt', v3.0.0,
13641 Has_Missings_Defaults => $NOT_IGNORED,
13642 Property => 'Line_Break',
13643 # Early versions had problematic syntax
13644 Each_Line_Handler => (($v_version lt v3.1.0)
13645 ? \&filter_early_ea_lb
13648 Input_file->new('EastAsianWidth.txt', v3.0.0,
13649 Property => 'East_Asian_Width',
13650 Has_Missings_Defaults => $NOT_IGNORED,
13651 # Early versions had problematic syntax
13652 Each_Line_Handler => (($v_version lt v3.1.0)
13653 ? \&filter_early_ea_lb
13656 Input_file->new('CompositionExclusions.txt', v3.0.0,
13657 Property => 'Composition_Exclusion',
13659 Input_file->new('BidiMirroring.txt', v3.0.1,
13660 Property => 'Bidi_Mirroring_Glyph',
13662 Input_file->new("NormalizationTest.txt", v3.0.1,
13665 Input_file->new('CaseFolding.txt', v3.0.1,
13666 Pre_Handler => \&setup_case_folding,
13667 Each_Line_Handler =>
13668 [ ($v_version lt v3.1.0)
13669 ? \&filter_old_style_case_folding
13671 \&filter_case_folding_line
13673 Post_Handler => \&post_fold,
13675 Input_file->new('DCoreProperties.txt', v3.1.0,
13676 # 5.2 changed this file
13677 Has_Missings_Defaults => (($v_version ge v5.2.0)
13681 Input_file->new('Scripts.txt', v3.1.0,
13682 Property => 'Script',
13683 Has_Missings_Defaults => $NOT_IGNORED,
13685 Input_file->new('DNormalizationProps.txt', v3.1.0,
13686 Has_Missings_Defaults => $NOT_IGNORED,
13687 Each_Line_Handler => (($v_version lt v4.0.1)
13688 ? \&filter_old_style_normalization_lines
13691 Input_file->new('HangulSyllableType.txt', v4.0.0,
13692 Has_Missings_Defaults => $NOT_IGNORED,
13693 Property => 'Hangul_Syllable_Type'),
13694 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13695 Property => 'Word_Break',
13696 Has_Missings_Defaults => $NOT_IGNORED,
13698 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13699 Property => 'Grapheme_Cluster_Break',
13700 Has_Missings_Defaults => $NOT_IGNORED,
13702 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
13703 Handler => \&process_GCB_test,
13705 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
13708 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
13711 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
13714 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13715 Property => 'Sentence_Break',
13716 Has_Missings_Defaults => $NOT_IGNORED,
13718 Input_file->new('NamedSequences.txt', v4.1.0,
13719 Handler => \&process_NamedSequences
13721 Input_file->new('NameAliases.txt', v5.0.0,
13722 Property => 'Name_Alias',
13724 Input_file->new("BidiTest.txt", v5.2.0,
13727 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13729 Each_Line_Handler => \&filter_unihan_line,
13731 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13733 Each_Line_Handler => \&filter_unihan_line,
13735 Input_file->new('UnihanIRGSources.txt', v5.2.0,
13737 Pre_Handler => \&setup_unihan,
13738 Each_Line_Handler => \&filter_unihan_line,
13740 Input_file->new('UnihanNumericValues.txt', v5.2.0,
13742 Each_Line_Handler => \&filter_unihan_line,
13744 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13746 Each_Line_Handler => \&filter_unihan_line,
13748 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13750 Each_Line_Handler => \&filter_unihan_line,
13752 Input_file->new('UnihanReadings.txt', v5.2.0,
13754 Each_Line_Handler => \&filter_unihan_line,
13756 Input_file->new('UnihanVariants.txt', v5.2.0,
13758 Each_Line_Handler => \&filter_unihan_line,
13762 # End of all the preliminaries.
13765 if ($compare_versions) {
13766 Carp::my_carp(<<END
13767 Warning. \$compare_versions is set. Output is not suitable for production
13772 # Put into %potential_files a list of all the files in the directory structure
13773 # that could be inputs to this program, excluding those that we should ignore.
13774 # Use absolute file names because it makes it easier across machine types.
13775 my @ignored_files_full_names = map { File::Spec->rel2abs(
13776 internal_file_to_platform($_))
13777 } keys %ignored_files;
13780 return unless /\.txt$/i; # Some platforms change the name's case
13781 my $full = lc(File::Spec->rel2abs($_));
13782 $potential_files{$full} = 1
13783 if ! grep { $full eq lc($_) } @ignored_files_full_names;
13786 }, File::Spec->curdir());
13788 my @mktables_list_output_files;
13790 if ($write_unchanged_files) {
13791 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13794 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13796 if (! open $file_handle, "<", $file_list) {
13797 Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
13803 # Read and parse mktables.lst, placing the results from the first part
13804 # into @input, and the second part into @mktables_list_output_files
13805 for my $list ( \@input, \@mktables_list_output_files ) {
13806 while (<$file_handle>) {
13807 s/^ \s+ | \s+ $//xg;
13808 next if /^ \s* (?: \# .* )? $/x;
13810 my ( $file ) = split /\t/;
13811 push @$list, $file;
13813 @$list = uniques(@$list);
13817 # Look through all the input files
13818 foreach my $input (@input) {
13819 next if $input eq 'version'; # Already have checked this.
13821 # Ignore if doesn't exist. The checking about whether we care or
13822 # not is done via the Input_file object.
13823 next if ! file_exists($input);
13825 # The paths are stored with relative names, and with '/' as the
13826 # delimiter; convert to absolute on this machine
13827 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
13828 $potential_files{$full} = 1
13829 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
13833 close $file_handle;
13838 # Here wants to process all .txt files in the directory structure.
13839 # Convert them to full path names. They are stored in the platform's
13842 foreach my $object (@input_file_objects) {
13843 my $file = $object->file;
13844 next unless defined $file;
13845 push @known_files, File::Spec->rel2abs($file);
13848 my @unknown_input_files;
13849 foreach my $file (keys %potential_files) {
13850 next if grep { lc($file) eq lc($_) } @known_files;
13852 # Here, the file is unknown to us. Get relative path name
13853 $file = File::Spec->abs2rel($file);
13854 push @unknown_input_files, $file;
13856 # What will happen is we create a data structure for it, and add it to
13857 # the list of input files to process. First get the subdirectories
13859 my (undef, $directories, undef) = File::Spec->splitpath($file);
13860 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13861 my @directories = File::Spec->splitdir($directories);
13863 # If the file isn't extracted (meaning none of the directories is the
13864 # extracted one), just add it to the end of the list of inputs.
13865 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
13866 push @input_file_objects, Input_file->new($file, v0);
13870 # Here, the file is extracted. It needs to go ahead of most other
13871 # processing. Search for the first input file that isn't a
13872 # special required property (that is, find one whose first_release
13873 # is non-0), and isn't extracted. Also, the Age property file is
13874 # processed before the extracted ones, just in case
13875 # $compare_versions is set.
13876 for (my $i = 0; $i < @input_file_objects; $i++) {
13877 if ($input_file_objects[$i]->first_released ne v0
13878 && lc($input_file_objects[$i]->file) ne 'dage.txt'
13879 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
13881 splice @input_file_objects, $i, 0,
13882 Input_file->new($file, v0);
13889 if (@unknown_input_files) {
13890 print STDERR simple_fold(join_lines(<<END
13892 The following files are unknown as to how to handle. Assuming they are
13893 typical property files. You'll know by later error messages if it worked or
13896 ) . " " . join(", ", @unknown_input_files) . "\n\n");
13898 } # End of looking through directory structure for more .txt files.
13900 # Create the list of input files from the objects we have defined, plus
13902 my @input_files = 'version';
13903 foreach my $object (@input_file_objects) {
13904 my $file = $object->file;
13905 next if ! defined $file; # Not all objects have files
13906 next if $object->optional && ! -e $file;
13907 push @input_files, $file;
13910 if ( $verbosity >= $VERBOSE ) {
13911 print "Expecting ".scalar( @input_files )." input files. ",
13912 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13915 # We set $youngest to be the most recently changed input file, including this
13916 # program itself (done much earlier in this file)
13917 foreach my $in (@input_files) {
13919 next unless defined $age; # Keep going even if missing a file
13920 $youngest = $age if $age < $youngest;
13922 # See that the input files have distinct names, to warn someone if they
13923 # are adding a new one
13925 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13926 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13927 my @directories = File::Spec->splitdir($directories);
13928 my $base = $file =~ s/\.txt$//;
13929 construct_filename($file, 'mutable', \@directories);
13933 my $ok = ! $write_unchanged_files
13934 && scalar @mktables_list_output_files; # If none known, rebuild
13936 # Now we check to see if any output files are older than youngest, if
13937 # they are, we need to continue on, otherwise we can presumably bail.
13939 foreach my $out (@mktables_list_output_files) {
13940 if ( ! file_exists($out)) {
13941 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13945 #local $to_trace = 1 if main::DEBUG;
13946 trace $youngest, -M $out if main::DEBUG && $to_trace;
13947 if ( -M $out > $youngest ) {
13948 #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13949 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13956 print "Files seem to be ok, not bothering to rebuild.\n";
13959 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
13961 # Ready to do the major processing. First create the perl pseudo-property.
13962 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
13964 # Process each input file
13965 foreach my $file (@input_file_objects) {
13969 # Finish the table generation.
13971 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13974 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13977 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13978 add_perl_synonyms();
13980 print "Writing tables\n" if $verbosity >= $PROGRESS;
13981 write_all_tables();
13983 # Write mktables.lst
13984 if ( $file_list and $make_list ) {
13986 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
13987 foreach my $file (@input_files, @files_actually_output) {
13988 my (undef, $directories, $file) = File::Spec->splitpath($file);
13989 my @directories = File::Spec->splitdir($directories);
13990 $file = join '/', @directories, $file;
13994 if (! open $ofh,">",$file_list) {
13995 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
13999 print $ofh <<"END";
14001 # $file_list -- File list for $0.
14003 # Autogenerated on @{[scalar localtime]}
14005 # - First section is input files
14006 # ($0 itself is not listed but is automatically considered an input)
14007 # - Section seperator is /^=+\$/
14008 # - Second section is a list of output files.
14009 # - Lines matching /^\\s*#/ are treated as comments
14010 # which along with blank lines are ignored.
14016 print $ofh "$_\n" for sort(@input_files);
14017 print $ofh "\n=================================\n# Output files:\n\n";
14018 print $ofh "$_\n" for sort @files_actually_output;
14019 print $ofh "\n# ",scalar(@input_files)," input files\n",
14020 "# ",scalar(@files_actually_output)+1," output files\n\n",
14023 or Carp::my_carp("Failed to close $ofh: $!");
14025 print "Filelist has ",scalar(@input_files)," input files and ",
14026 scalar(@files_actually_output)+1," output files\n"
14027 if $verbosity >= $VERBOSE;
14031 # Output these warnings unless -q explicitly specified.
14032 if ($verbosity >= $NORMAL_VERBOSITY) {
14033 if (@unhandled_properties) {
14034 print "\nProperties and tables that unexpectedly have no code points\n";
14035 foreach my $property (sort @unhandled_properties) {
14036 print $property, "\n";
14040 if (%potential_files) {
14041 print "\nInput files that are not considered:\n";
14042 foreach my $file (sort keys %potential_files) {
14043 print File::Spec->abs2rel($file), "\n";
14046 print "\nAll done\n" if $verbosity >= $VERBOSE;
14050 # TRAILING CODE IS USED BY make_property_test_script()
14056 # Test qr/\X/ and the \p{} regular expression constructs. This file is
14057 # constructed by mktables from the tables it generates, so if mktables is
14058 # buggy, this won't necessarily catch those bugs. Tests are generated for all
14059 # feasible properties; a few aren't currently feasible; see
14060 # is_code_point_usable() in mktables for details.
14062 # Standard test packages are not used because this manipulates SIG_WARN. It
14063 # exits 0 if every non-skipped test succeeded; -1 if any failed.
14068 my $non_ASCII = (ord('A') != 65);
14070 # The 256 8-bit characters in ASCII ordinal order, with the ones that don't
14071 # have Perl names replaced by -1
14072 my @ascii_ordered_chars = (
14075 "\a", "\b", "\t", "\n",
14079 " ", "!", "\"", "#", '$', "%", "&", "'",
14080 "(", ")", "*", "+", ",", "-", ".", "/",
14081 "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
14082 ":", ";", "<", "=", ">", "?", "@",
14083 "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
14084 "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
14085 "[", "\\", "]", "^", "_", "`",
14086 "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
14087 "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
14088 "{", "|", "}", "~",
14092 sub ASCII_ord_to_native ($) {
14093 # Converts input ordinal number to the native one, if can be done easily.
14094 # Returns -1 otherwise.
14098 return $ord if $ord > 255 || ! $non_ASCII;
14099 my $result = $ascii_ordered_chars[$ord];
14100 return $result if $result eq '-1';
14101 return ord($result);
14105 my $expected = shift;
14108 my $warning_type = shift; # Type of warning message, like 'deprecated'
14110 my $line = (caller)[2];
14112 # Convert the non-ASCII code points expressible as characters to their
14113 # ASCII equivalents, and skip the others.
14114 $ord = ASCII_ord_to_native($ord);
14117 print "ok $Tests - "
14118 . sprintf("\"\\x{%04X}\"", $ord)
14119 . " =~ $regex # Skipped: non-ASCII\n";
14123 # Convert the code point to hex form
14124 my $string = sprintf "\"\\x{%04X}\"", $ord;
14128 # The first time through, use all warnings. If the input should generate
14129 # a warning, add another time through with them turned off
14130 push @tests, "no warnings '$warning_type';" if $warning_type;
14132 foreach my $no_warnings (@tests) {
14134 # Store any warning messages instead of outputting them
14135 local $SIG{__WARN__} = $SIG{__WARN__};
14136 my $warning_message;
14137 $SIG{__WARN__} = sub { $warning_message = $_[0] };
14141 # A string eval is needed because of the 'no warnings'.
14142 # Assumes no parens in the regular expression
14143 my $result = eval "$no_warnings
14144 my \$RegObj = qr($regex);
14145 $string =~ \$RegObj ? 1 : 0";
14146 if (not defined $result) {
14147 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
14150 elsif ($result ^ $expected) {
14151 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
14154 elsif ($warning_message) {
14155 if (! $warning_type || ($warning_type && $no_warnings)) {
14156 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14160 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14163 elsif ($warning_type && ! $no_warnings) {
14164 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14168 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14177 if (eval { 'x' =~ qr/$regex/; 1 }) {
14179 my $line = (caller)[2];
14180 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
14183 my $line = (caller)[2];
14184 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14189 # GCBTest.txt character that separates grapheme clusters
14190 my $breakable_utf8 = my $breakable = chr(0xF7);
14191 utf8::upgrade($breakable_utf8);
14193 # GCBTest.txt character that indicates that the adjoining code points are part
14194 # of the same grapheme cluster
14195 my $nobreak_utf8 = my $nobreak = chr(0xD7);
14196 utf8::upgrade($nobreak_utf8);
14199 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
14200 # Each such line is a sequence of code points given by their hex numbers,
14201 # separated by the two characters defined just before this subroutine that
14202 # indicate that either there can or cannot be a break between the adjacent
14203 # code points. If there isn't a break, that means the sequence forms an
14204 # extended grapheme cluster, which means that \X should match the whole
14205 # thing. If there is a break, \X should stop there. This is all
14206 # converted by this routine into a match:
14207 # $string =~ /(\X)/,
14208 # Each \X should match the next cluster; and that is what is checked.
14210 my $template = shift;
14212 my $line = (caller)[2];
14214 # The line contains characters above the ASCII range, but in Latin1. It
14215 # may or may not be in utf8, and if it is, it may or may not know it. So,
14216 # convert these characters to 8 bits. If knows is in utf8, simply
14218 if (utf8::is_utf8($template)) {
14219 utf8::downgrade($template);
14222 # Otherwise, if it is in utf8, but doesn't know it, the next lines
14223 # convert the two problematic characters to their 8-bit equivalents.
14224 # If it isn't in utf8, they don't harm anything.
14226 $template =~ s/$nobreak_utf8/$nobreak/g;
14227 $template =~ s/$breakable_utf8/$breakable/g;
14230 # Get rid of the leading and trailing breakables
14231 $template =~ s/^ \s* $breakable \s* //x;
14232 $template =~ s/ \s* $breakable \s* $ //x;
14234 # And no-breaks become just a space.
14235 $template =~ s/ \s* $nobreak \s* / /xg;
14237 # Split the input into segments that are breakable between them.
14238 my @segments = split /\s*$breakable\s*/, $template;
14241 my $display_string = "";
14243 my @should_display;
14245 # Convert the code point sequence in each segment into a Perl string of
14247 foreach my $segment (@segments) {
14248 my @code_points = split /\s+/, $segment;
14249 my $this_string = "";
14250 my $this_display = "";
14251 foreach my $code_point (@code_points) {
14252 my $ord = ASCII_ord_to_native(hex $code_point);
14255 print "ok $Tests - String containing $code_point =~ /(\\X)/g # Skipped: non-ASCII\n";
14258 $this_string .= chr $ord;
14259 $this_display .= "\\x{$code_point}";
14262 # The next cluster should match the string in this segment.
14263 push @should_match, $this_string;
14264 push @should_display, $this_display;
14265 $string .= $this_string;
14266 $display_string .= $this_display;
14269 # If a string can be represented in both non-ut8 and utf8, test both cases
14271 for my $to_upgrade (0 .. 1) {
14275 # If already in utf8, would just be a repeat
14276 next UPGRADE if utf8::is_utf8($string);
14278 utf8::upgrade($string);
14281 # Finally, do the \X match.
14282 my @matches = $string =~ /(\X)/g;
14284 # Look through each matched cluster to verify that it matches what we
14286 my $min = (@matches < @should_match) ? @matches : @should_match;
14287 for my $i (0 .. $min - 1) {
14289 if ($matches[$i] eq $should_match[$i]) {
14290 print "ok $Tests - ";
14292 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14294 print "And \\X #", $i + 1,
14296 print " correctly matched $should_display[$i]; line $line\n";
14298 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14299 unpack("U*", $matches[$i]));
14300 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14302 " should have matched $should_display[$i]",
14303 " but instead matched $matches[$i]",
14304 ". Abandoning rest of line $line\n";
14309 # And the number of matches should equal the number of expected matches.
14311 if (@matches == @should_match) {
14312 print "ok $Tests - Nothing was left over; line $line\n";
14314 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14322 print "1..$Tests\n";
14323 exit($Fails ? -1 : 0);
14326 Error('\p{Script=InGreek}'); # Bug #69018
14327 Test_X("1100 $nobreak 1161"); # Bug #70940
14328 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
14329 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
14330 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726