3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr = do { no overloading; pack 'J', $self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted the change to main::objaddr, and subsequent commits that
17 # changed 0+$self to pack 'J', $self.)
20 BEGIN { # Get the time the script started running; do it at compilation to
21 # get it as close as possible
36 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
39 ##########################################################################
41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
43 # a pod file and a .t file
45 # The structure of this file is:
46 # First these introductory comments; then
47 # code needed for everywhere, such as debugging stuff; then
48 # code to handle input parameters; then
49 # data structures likely to be of external interest (some of which depend on
50 # the input parameters, so follows them; then
51 # more data structures and subroutine and package (class) definitions; then
52 # the small actual loop to process the input files and finish up; then
53 # a __DATA__ section, for the .t tests
55 # This program works on all releases of Unicode through at least 6.0. The
56 # outputs have been scrutinized most intently for release 5.1. The others
57 # have been checked for somewhat more than just sanity. It can handle all
58 # existing Unicode character properties in those releases.
60 # This program is mostly about Unicode character (or code point) properties.
61 # A property describes some attribute or quality of a code point, like if it
62 # is lowercase or not, its name, what version of Unicode it was first defined
63 # in, or what its uppercase equivalent is. Unicode deals with these disparate
64 # possibilities by making all properties into mappings from each code point
65 # into some corresponding value. In the case of it being lowercase or not,
66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67 # property maps each Unicode code point to a single value, called a "property
68 # value". (Hence each Unicode property is a true mathematical function with
69 # exactly one value per code point.)
71 # When using a property in a regular expression, what is desired isn't the
72 # mapping of the code point to its property's value, but the reverse (or the
73 # mathematical "inverse relation"): starting with the property value, "Does a
74 # code point map to it?" These are written in a "compound" form:
75 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
76 # files containing the lists of code points that map to each such regular
77 # expression property value, one file per list
79 # There is also a single form shortcut that Perl adds for many of the commonly
80 # used properties. This happens for all binary properties, plus script,
81 # general_category, and block properties.
83 # Thus the outputs of this program are files. There are map files, mostly in
84 # the 'To' directory; and there are list files for use in regular expression
85 # matching, all in subdirectories of the 'lib' directory, with each
86 # subdirectory being named for the property that the lists in it are for.
87 # Bookkeeping, test, and documentation files are also generated.
89 my $matches_directory = 'lib'; # Where match (\p{}) files go.
90 my $map_directory = 'To'; # Where map files go.
94 # The major data structures of this program are Property, of course, but also
95 # Table. There are two kinds of tables, very similar to each other.
96 # "Match_Table" is the data structure giving the list of code points that have
97 # a particular property value, mentioned above. There is also a "Map_Table"
98 # data structure which gives the property's mapping from code point to value.
99 # There are two structures because the match tables need to be combined in
100 # various ways, such as constructing unions, intersections, complements, etc.,
101 # and the map ones don't. And there would be problems, perhaps subtle, if
102 # a map table were inadvertently operated on in some of those ways.
103 # The use of separate classes with operations defined on one but not the other
104 # prevents accidentally confusing the two.
106 # At the heart of each table's data structure is a "Range_List", which is just
107 # an ordered list of "Ranges", plus ancillary information, and methods to
108 # operate on them. A Range is a compact way to store property information.
109 # Each range has a starting code point, an ending code point, and a value that
110 # is meant to apply to all the code points between the two end points,
111 # inclusive. For a map table, this value is the property value for those
112 # code points. Two such ranges could be written like this:
113 # 0x41 .. 0x5A, 'Upper',
114 # 0x61 .. 0x7A, 'Lower'
116 # Each range also has a type used as a convenience to classify the values.
117 # Most ranges in this program will be Type 0, or normal, but there are some
118 # ranges that have a non-zero type. These are used only in map tables, and
119 # are for mappings that don't fit into the normal scheme of things. Mappings
120 # that require a hash entry to communicate with utf8.c are one example;
121 # another example is mappings for charnames.pm to use which indicate a name
122 # that is algorithmically determinable from its code point (and vice-versa).
123 # These are used to significantly compact these tables, instead of listing
124 # each one of the tens of thousands individually.
126 # In a match table, the value of a range is irrelevant (and hence the type as
127 # well, which will always be 0), and arbitrarily set to the null string.
128 # Using the example above, there would be two match tables for those two
129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
130 # named Lower would contain 0x61..0x7A.
132 # Actually, there are two types of range lists, "Range_Map" is the one
133 # associated with map tables, and "Range_List" with match tables.
134 # Again, this is so that methods can be defined on one and not the other so as
135 # to prevent operating on them in incorrect ways.
137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
138 # in the perl core. All tables could in theory be written, but some are
139 # suppressed because there is no current practical use for them. It is easy
140 # to change which get written by changing various lists that are near the top
141 # of the actual code in this file. The table data structures contain enough
142 # ancillary information to allow them to be treated as separate entities for
143 # writing, such as the path to each one's file. There is a heading in each
144 # map table that gives the format of its entries, and what the map is for all
145 # the code points missing from it. (This allows tables to be more compact.)
147 # The Property data structure contains one or more tables. All properties
148 # contain a map table (except the $perl property which is a
149 # pseudo-property containing only match tables), and any properties that
150 # are usable in regular expression matches also contain various matching
151 # tables, one for each value the property can have. A binary property can
152 # have two values, True and False (or Y and N, which are preferred by Unicode
153 # terminology). Thus each of these properties will have a map table that
154 # takes every code point and maps it to Y or N (but having ranges cuts the
155 # number of entries in that table way down), and two match tables, one
156 # which has a list of all the code points that map to Y, and one for all the
157 # code points that map to N. (For each of these, a third table is also
158 # generated for the pseudo Perl property. It contains the identical code
159 # points as the Y table, but can be written, not in the compound form, but in
160 # a "single" form like \p{IsUppercase}.) Many properties are binary, but some
161 # properties have several possible values, some have many, and properties like
162 # Name have a different value for every named code point. Those will not,
163 # unless the controlling lists are changed, have their match tables written
164 # out. But all the ones which can be used in regular expression \p{} and \P{}
165 # constructs will. Prior to 5.14, generally a property would have either its
166 # map table or its match tables written but not both. Again, what gets
167 # written is controlled by lists which can easily be changed. Starting in
168 # 5.14, advantage was taken of this, and all the map tables needed to
169 # reconstruct the Unicode db are now written out, while suppressing the
170 # Unicode .txt files that contain the data. Our tables are much more compact
171 # than the .txt files, so a significant space savings was achieved.
173 # Properties have a 'Type', like binary, or string, or enum depending on how
174 # many match tables there are and the content of the maps. This 'Type' is
175 # different than a range 'Type', so don't get confused by the two concepts
176 # having the same name.
178 # For information about the Unicode properties, see Unicode's UAX44 document:
180 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
182 # As stated earlier, this program will work on any release of Unicode so far.
183 # Most obvious problems in earlier data have NOT been corrected except when
184 # necessary to make Perl or this program work reasonably. For example, no
185 # folding information was given in early releases, so this program substitutes
186 # lower case instead, just so that a regular expression with the /i option
187 # will do something that actually gives the right results in many cases.
188 # There are also a couple other corrections for version 1.1.5, commented at
189 # the point they are made. As an example of corrections that weren't made
190 # (but could be) is this statement from DerivedAge.txt: "The supplementary
191 # private use code points and the non-character code points were assigned in
192 # version 2.0, but not specifically listed in the UCD until versions 3.0 and
193 # 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
194 # on Unicode version glitches is further down in these introductory comments.
196 # This program works on all non-provisional properties as of 6.0, though the
197 # files for some are suppressed from apparent lack of demand for them. You
198 # can change which are output by changing lists in this program.
200 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
201 # loose matchings rules (from Unicode TR18):
203 # The recommended names for UCD properties and property values are in
204 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
205 # [PropValue]. There are both abbreviated names and longer, more
206 # descriptive names. It is strongly recommended that both names be
207 # recognized, and that loose matching of property names be used,
208 # whereby the case distinctions, whitespace, hyphens, and underbar
210 # The program still allows Fuzzy to override its determination of if loose
211 # matching should be used, but it isn't currently used, as it is no longer
212 # needed; the calculations it makes are good enough.
214 # SUMMARY OF HOW IT WORKS:
218 # A list is constructed containing each input file that is to be processed
220 # Each file on the list is processed in a loop, using the associated handler
222 # The PropertyAliases.txt and PropValueAliases.txt files are processed
223 # first. These files name the properties and property values.
224 # Objects are created of all the property and property value names
225 # that the rest of the input should expect, including all synonyms.
226 # The other input files give mappings from properties to property
227 # values. That is, they list code points and say what the mapping
228 # is under the given property. Some files give the mappings for
229 # just one property; and some for many. This program goes through
230 # each file and populates the properties from them. Some properties
231 # are listed in more than one file, and Unicode has set up a
232 # precedence as to which has priority if there is a conflict. Thus
233 # the order of processing matters, and this program handles the
234 # conflict possibility by processing the overriding input files
235 # last, so that if necessary they replace earlier values.
236 # After this is all done, the program creates the property mappings not
237 # furnished by Unicode, but derivable from what it does give.
238 # The tables of code points that match each property value in each
239 # property that is accessible by regular expressions are created.
240 # The Perl-defined properties are created and populated. Many of these
241 # require data determined from the earlier steps
242 # Any Perl-defined synonyms are created, and name clashes between Perl
243 # and Unicode are reconciled and warned about.
244 # All the properties are written to files
245 # Any other files are written, and final warnings issued.
247 # For clarity, a number of operators have been overloaded to work on tables:
248 # ~ means invert (take all characters not in the set). The more
249 # conventional '!' is not used because of the possibility of confusing
250 # it with the actual boolean operation.
252 # - means subtraction
253 # & means intersection
254 # The precedence of these is the order listed. Parentheses should be
255 # copiously used. These are not a general scheme. The operations aren't
256 # defined for a number of things, deliberately, to avoid getting into trouble.
257 # Operations are done on references and affect the underlying structures, so
258 # that the copy constructors for them have been overloaded to not return a new
259 # clone, but the input object itself.
261 # The bool operator is deliberately not overloaded to avoid confusion with
262 # "should it mean if the object merely exists, or also is non-empty?".
264 # WHY CERTAIN DESIGN DECISIONS WERE MADE
266 # This program needs to be able to run under miniperl. Therefore, it uses a
267 # minimum of other modules, and hence implements some things itself that could
268 # be gotten from CPAN
270 # This program uses inputs published by the Unicode Consortium. These can
271 # change incompatibly between releases without the Perl maintainers realizing
272 # it. Therefore this program is now designed to try to flag these. It looks
273 # at the directories where the inputs are, and flags any unrecognized files.
274 # It keeps track of all the properties in the files it handles, and flags any
275 # that it doesn't know how to handle. It also flags any input lines that
276 # don't match the expected syntax, among other checks.
278 # It is also designed so if a new input file matches one of the known
279 # templates, one hopefully just needs to add it to a list to have it
282 # As mentioned earlier, some properties are given in more than one file. In
283 # particular, the files in the extracted directory are supposedly just
284 # reformattings of the others. But they contain information not easily
285 # derivable from the other files, including results for Unihan, which this
286 # program doesn't ordinarily look at, and for unassigned code points. They
287 # also have historically had errors or been incomplete. In an attempt to
288 # create the best possible data, this program thus processes them first to
289 # glean information missing from the other files; then processes those other
290 # files to override any errors in the extracted ones. Much of the design was
291 # driven by this need to store things and then possibly override them.
293 # It tries to keep fatal errors to a minimum, to generate something usable for
294 # testing purposes. It always looks for files that could be inputs, and will
295 # warn about any that it doesn't know how to handle (the -q option suppresses
298 # Why is there more than one type of range?
299 # This simplified things. There are some very specialized code points that
300 # have to be handled specially for output, such as Hangul syllable names.
301 # By creating a range type (done late in the development process), it
302 # allowed this to be stored with the range, and overridden by other input.
303 # Originally these were stored in another data structure, and it became a
304 # mess trying to decide if a second file that was for the same property was
305 # overriding the earlier one or not.
307 # Why are there two kinds of tables, match and map?
308 # (And there is a base class shared by the two as well.) As stated above,
309 # they actually are for different things. Development proceeded much more
310 # smoothly when I (khw) realized the distinction. Map tables are used to
311 # give the property value for every code point (actually every code point
312 # that doesn't map to a default value). Match tables are used for regular
313 # expression matches, and are essentially the inverse mapping. Separating
314 # the two allows more specialized methods, and error checks so that one
315 # can't just take the intersection of two map tables, for example, as that
320 # This program is written so it will run under miniperl. Occasionally changes
321 # will cause an error where the backtrace doesn't work well under miniperl.
322 # To diagnose the problem, you can instead run it under regular perl, if you
325 # There is a good trace facility. To enable it, first sub DEBUG must be set
326 # to return true. Then a line like
328 # local $to_trace = 1 if main::DEBUG;
330 # can be added to enable tracing in its lexical scope or until you insert
333 # local $to_trace = 0 if main::DEBUG;
335 # then use a line like "trace $a, @b, %c, ...;
337 # Some of the more complex subroutines already have trace statements in them.
338 # Permanent trace statements should be like:
340 # trace ... if main::DEBUG && $to_trace;
342 # If there is just one or a few files that you're debugging, you can easily
343 # cause most everything else to be skipped. Change the line
345 # my $debug_skip = 0;
347 # to 1, and every file whose object is in @input_file_objects and doesn't have
348 # a, 'non_skip => 1,' in its constructor will be skipped.
350 # To compare the output tables, it may be useful to specify the -annotate
351 # flag. This causes the tables to expand so there is one entry for each
352 # non-algorithmically named code point giving, currently its name, and its
353 # graphic representation if printable (and you have a font that knows about
354 # it). This makes it easier to see what the particular code points are in
355 # each output table. The tables are usable, but because they don't have
356 # ranges (for the most part), a Perl using them will run slower. Non-named
357 # code points are annotated with a description of their status, and contiguous
358 # ones with the same description will be output as a range rather than
359 # individually. Algorithmically named characters are also output as ranges,
360 # except when there are just a few contiguous ones.
364 # The program would break if Unicode were to change its names so that
365 # interior white space, underscores, or dashes differences were significant
366 # within property and property value names.
368 # It might be easier to use the xml versions of the UCD if this program ever
369 # would need heavy revision, and the ability to handle old versions was not
372 # There is the potential for name collisions, in that Perl has chosen names
373 # that Unicode could decide it also likes. There have been such collisions in
374 # the past, with mostly Perl deciding to adopt the Unicode definition of the
375 # name. However in the 5.2 Unicode beta testing, there were a number of such
376 # collisions, which were withdrawn before the final release, because of Perl's
377 # and other's protests. These all involved new properties which began with
378 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
379 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
380 # Unicode document, so they are unlikely to be used by Unicode for another
381 # purpose. However, they might try something beginning with 'In', or use any
382 # of the other Perl-defined properties. This program will warn you of name
383 # collisions, and refuse to generate tables with them, but manual intervention
384 # will be required in this event. One scheme that could be implemented, if
385 # necessary, would be to have this program generate another file, or add a
386 # field to mktables.lst that gives the date of first definition of a property.
387 # Each new release of Unicode would use that file as a basis for the next
388 # iteration. And the Perl synonym addition code could sort based on the age
389 # of the property, so older properties get priority, and newer ones that clash
390 # would be refused; hence existing code would not be impacted, and some other
391 # synonym would have to be used for the new property. This is ugly, and
392 # manual intervention would certainly be easier to do in the short run; lets
393 # hope it never comes to this.
397 # This program can generate tables from the Unihan database. But it doesn't
398 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
399 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
400 # database was split into 8 different files, all beginning with the letters
401 # 'Unihan'. This program will read those file(s) if present, but it needs to
402 # know which of the many properties in the file(s) should have tables created
403 # for them. It will create tables for any properties listed in
404 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
405 # @cjk_properties array and the @cjk_property_values array. Thus, if a
406 # property you want is not in those files of the release you are building
407 # against, you must add it to those two arrays. Starting in 4.0, the
408 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
409 # is present in the directory, a table will be generated for that property.
410 # In 5.2, several more properties were added. For your convenience, the two
411 # arrays are initialized with all the 6.0 listed properties that are also in
412 # earlier releases. But these are commented out. You can just uncomment the
413 # ones you want, or use them as a template for adding entries for other
416 # You may need to adjust the entries to suit your purposes. setup_unihan(),
417 # and filter_unihan_line() are the functions where this is done. This program
418 # already does some adjusting to make the lines look more like the rest of the
419 # Unicode DB; You can see what that is in filter_unihan_line()
421 # There is a bug in the 3.2 data file in which some values for the
422 # kPrimaryNumeric property have commas and an unexpected comment. A filter
423 # could be added for these; or for a particular installation, the Unihan.txt
424 # file could be edited to fix them.
426 # HOW TO ADD A FILE TO BE PROCESSED
428 # A new file from Unicode needs to have an object constructed for it in
429 # @input_file_objects, probably at the end or at the end of the extracted
430 # ones. The program should warn you if its name will clash with others on
431 # restrictive file systems, like DOS. If so, figure out a better name, and
432 # add lines to the README.perl file giving that. If the file is a character
433 # property, it should be in the format that Unicode has by default
434 # standardized for such files for the more recently introduced ones.
435 # If so, the Input_file constructor for @input_file_objects can just be the
436 # file name and release it first appeared in. If not, then it should be
437 # possible to construct an each_line_handler() to massage the line into the
440 # For non-character properties, more code will be needed. You can look at
441 # the existing entries for clues.
443 # UNICODE VERSIONS NOTES
445 # The Unicode UCD has had a number of errors in it over the versions. And
446 # these remain, by policy, in the standard for that version. Therefore it is
447 # risky to correct them, because code may be expecting the error. So this
448 # program doesn't generally make changes, unless the error breaks the Perl
449 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
450 # for U+1105, which causes real problems for the algorithms for Jamo
451 # calculations, so it is changed here.
453 # But it isn't so clear cut as to what to do about concepts that are
454 # introduced in a later release; should they extend back to earlier releases
455 # where the concept just didn't exist? It was easier to do this than to not,
456 # so that's what was done. For example, the default value for code points not
457 # in the files for various properties was probably undefined until changed by
458 # some version. No_Block for blocks is such an example. This program will
459 # assign No_Block even in Unicode versions that didn't have it. This has the
460 # benefit that code being written doesn't have to special case earlier
461 # versions; and the detriment that it doesn't match the Standard precisely for
462 # the affected versions.
464 # Here are some observations about some of the issues in early versions:
466 # The number of code points in \p{alpha} halved in 2.1.9. It turns out that
467 # the reason is that the CJK block starting at 4E00 was removed from PropList,
468 # and was not put back in until 3.1.0
470 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
471 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
472 # reason is that 3.2 introduced U+205F=medium math space, which was not
473 # classed as white space, but Perl figured out that it should have been. 4.0
474 # reclassified it correctly.
476 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
477 # this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
478 # was left with no code points, as all the ones that mapped to 202 stayed
479 # mapped to 202. Thus if your program used the numeric name for the class,
480 # it would not have been affected, but if it used the mnemonic, it would have
483 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
484 # points which eventually came to have this script property value, instead
485 # mapped to "Unknown". But in the next release all these code points were
486 # moved to \p{sc=common} instead.
488 # The default for missing code points for BidiClass is complicated. Starting
489 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
490 # tries to do the best it can for earlier releases. It is done in
491 # process_PropertyAliases()
493 ##############################################################################
495 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
497 my $MAX_LINE_WIDTH = 78;
499 # Debugging aid to skip most files so as to not be distracted by them when
500 # concentrating on the ones being debugged. Add
502 # to the constructor for those files you want processed when you set this.
503 # Files with a first version number of 0 are special: they are always
504 # processed regardless of the state of this flag. Generally, Jamo.txt and
505 # UnicodeData.txt must not be skipped if you want this program to not die
506 # before normal completion.
509 # Set to 1 to enable tracing.
512 { # Closure for trace: debugging aid
513 my $print_caller = 1; # ? Include calling subroutine name
514 my $main_with_colon = 'main::';
515 my $main_colon_length = length($main_with_colon);
518 return unless $to_trace; # Do nothing if global flag not set
522 local $DB::trace = 0;
523 $DB::trace = 0; # Quiet 'used only once' message
527 # Loop looking up the stack to get the first non-trace caller
532 $line_number = $caller_line;
533 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
534 $caller = $main_with_colon unless defined $caller;
536 $caller_name = $caller;
539 $caller_name =~ s/.*:://;
540 if (substr($caller_name, 0, $main_colon_length)
543 $caller_name = substr($caller_name, $main_colon_length);
546 } until ($caller_name ne 'trace');
548 # If the stack was empty, we were called from the top level
549 $caller_name = 'main' if ($caller_name eq ""
550 || $caller_name eq 'trace');
553 foreach my $string (@input) {
554 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
555 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
556 $output .= simple_dumper($string);
559 $string = "$string" if ref $string;
560 $string = $UNDEF unless defined $string;
562 $string = '""' if $string eq "";
563 $output .= " " if $output ne ""
565 && substr($output, -1, 1) ne " "
566 && substr($string, 0, 1) ne " ";
571 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
572 print STDERR "$caller_name: " if $print_caller;
573 print STDERR $output, "\n";
578 # This is for a rarely used development feature that allows you to compare two
579 # versions of the Unicode standard without having to deal with changes caused
580 # by the code points introduced in the later version. Change the 0 to a
581 # string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
582 # code points introduced in that release and earlier will be used; later ones
583 # are thrown away. You use the version number of the earliest one you want to
584 # compare; then run this program on directory structures containing each
585 # release, and compare the outputs. These outputs will therefore include only
586 # the code points common to both releases, and you can see the changes caused
587 # just by the underlying release semantic changes. For versions earlier than
588 # 3.2, you must copy a version of DAge.txt into the directory.
589 my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
590 my $compare_versions = DEBUG
591 && $string_compare_versions
592 && pack "C*", split /\./, $string_compare_versions;
595 # Returns non-duplicated input values. From "Perl Best Practices:
596 # Encapsulated Cleverness". p. 455 in first edition.
599 # Arguably this breaks encapsulation, if the goal is to permit multiple
600 # distinct objects to stringify to the same value, and be interchangeable.
601 # However, for this program, no two objects stringify identically, and all
602 # lists passed to this function are either objects or strings. So this
603 # doesn't affect correctness, but it does give a couple of percent speedup.
605 return grep { ! $seen{$_}++ } @_;
608 $0 = File::Spec->canonpath($0);
610 my $make_test_script = 0; # ? Should we output a test script
611 my $make_norm_test_script = 0; # ? Should we output a normalization test script
612 my $write_unchanged_files = 0; # ? Should we update the output files even if
613 # we don't think they have changed
614 my $use_directory = ""; # ? Should we chdir somewhere.
615 my $pod_directory; # input directory to store the pod file.
616 my $pod_file = 'perluniprops';
617 my $t_path; # Path to the .t test file
618 my $file_list = 'mktables.lst'; # File to store input and output file names.
619 # This is used to speed up the build, by not
620 # executing the main body of the program if
621 # nothing on the list has changed since the
623 my $make_list = 1; # ? Should we write $file_list. Set to always
624 # make a list so that when the pumpking is
625 # preparing a release, s/he won't have to do
627 my $glob_list = 0; # ? Should we try to include unknown .txt files
629 my $output_range_counts = $debugging_build; # ? Should we include the number
630 # of code points in ranges in
632 my $annotate = 0; # ? Should character names be in the output
634 # Verbosity levels; 0 is quiet
635 my $NORMAL_VERBOSITY = 1;
639 my $verbosity = $NORMAL_VERBOSITY;
643 my $arg = shift @ARGV;
645 $verbosity = $VERBOSE;
647 elsif ($arg eq '-p') {
648 $verbosity = $PROGRESS;
649 $| = 1; # Flush buffers as we go.
651 elsif ($arg eq '-q') {
654 elsif ($arg eq '-w') {
655 $write_unchanged_files = 1; # update the files even if havent changed
657 elsif ($arg eq '-check') {
658 my $this = shift @ARGV;
659 my $ok = shift @ARGV;
661 print "Skipping as check params are not the same.\n";
665 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
666 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
668 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
670 $make_test_script = 1;
672 elsif ($arg eq '-makenormtest')
674 $make_norm_test_script = 1;
676 elsif ($arg eq '-makelist') {
679 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
680 -d $use_directory or croak "Unknown directory '$use_directory'";
682 elsif ($arg eq '-L') {
684 # Existence not tested until have chdir'd
687 elsif ($arg eq '-globlist') {
690 elsif ($arg eq '-c') {
691 $output_range_counts = ! $output_range_counts
693 elsif ($arg eq '-annotate') {
695 $debugging_build = 1;
696 $output_range_counts = 1;
700 $with_c .= 'out' if $output_range_counts; # Complements the state
702 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
703 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
705 -c : Output comments $with_c number of code points in ranges
706 -q : Quiet Mode: Only output serious warnings.
707 -p : Set verbosity level to normal plus show progress.
708 -v : Set Verbosity level high: Show progress and non-serious
710 -w : Write files regardless
711 -C dir : Change to this directory before proceeding. All relative paths
712 except those specified by the -P and -T options will be done
713 with respect to this directory.
714 -P dir : Output $pod_file file to directory 'dir'.
715 -T path : Create a test script as 'path'; overrides -maketest
716 -L filelist : Use alternate 'filelist' instead of standard one
717 -globlist : Take as input all non-Test *.txt files in current and sub
719 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
721 -makelist : Rewrite the file list $file_list based on current setup
722 -annotate : Output an annotation for each character in the table files;
723 useful for debugging mktables, looking at diffs; but is slow,
724 memory intensive; resulting tables are usable but are slow and
725 very large (and currently fail the Unicode::UCD.t tests).
726 -check A B : Executes $0 only if A and B are the same
731 # Stores the most-recently changed file. If none have changed, can skip the
733 my $most_recent = (stat $0)[9]; # Do this before the chdir!
735 # Change directories now, because need to read 'version' early.
736 if ($use_directory) {
737 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
738 $pod_directory = File::Spec->rel2abs($pod_directory);
740 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
741 $t_path = File::Spec->rel2abs($t_path);
743 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
744 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
745 $pod_directory = File::Spec->abs2rel($pod_directory);
747 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
748 $t_path = File::Spec->abs2rel($t_path);
752 # Get Unicode version into regular and v-string. This is done now because
753 # various tables below get populated based on it. These tables are populated
754 # here to be near the top of the file, and so easily seeable by those needing
756 open my $VERSION, "<", "version"
757 or croak "$0: can't open required file 'version': $!\n";
758 my $string_version = <$VERSION>;
760 chomp $string_version;
761 my $v_version = pack "C*", split /\./, $string_version; # v string
763 # The following are the complete names of properties with property values that
764 # are known to not match any code points in some versions of Unicode, but that
765 # may change in the future so they should be matchable, hence an empty file is
766 # generated for them.
767 my @tables_that_may_be_empty = (
768 'Joining_Type=Left_Joining',
770 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
771 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
772 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
773 if $v_version ge v4.1.0;
774 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
775 if $v_version ge v6.0.0;
776 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
777 if $v_version ge v6.1.0;
779 # The lists below are hashes, so the key is the item in the list, and the
780 # value is the reason why it is in the list. This makes generation of
781 # documentation easier.
783 my %why_suppressed; # No file generated for these.
785 # Files aren't generated for empty extraneous properties. This is arguable.
786 # Extraneous properties generally come about because a property is no longer
787 # used in a newer version of Unicode. If we generated a file without code
788 # points, programs that used to work on that property will still execute
789 # without errors. It just won't ever match (or will always match, with \P{}).
790 # This means that the logic is now likely wrong. I (khw) think its better to
791 # find this out by getting an error message. Just move them to the table
792 # above to change this behavior
793 my %why_suppress_if_empty_warn_if_not = (
795 # It is the only property that has ever officially been removed from the
796 # Standard. The database never contained any code points for it.
797 'Special_Case_Condition' => 'Obsolete',
799 # Apparently never official, but there were code points in some versions of
800 # old-style PropList.txt
801 'Non_Break' => 'Obsolete',
804 # These would normally go in the warn table just above, but they were changed
805 # a long time before this program was written, so warnings about them are
807 if ($v_version gt v3.2.0) {
808 push @tables_that_may_be_empty,
809 'Canonical_Combining_Class=Attached_Below_Left'
812 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
813 # unless explicitly added.
814 if ($v_version ge v5.2.0) {
815 my $unihan = 'Unihan; remove from list if using Unihan';
816 foreach my $table (qw (
820 kCompatibilityVariant
834 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
838 # Enum values for to_output_map() method in the Map_Table package.
839 my $EXTERNAL_MAP = 1;
840 my $INTERNAL_MAP = 2;
841 my $OUTPUT_ADJUSTED = 3;
843 # To override computed values for writing the map tables for these properties.
844 # The default for enum map tables is to write them out, so that the Unicode
845 # .txt files can be removed, but all the data to compute any property value
846 # for any code point is available in a more compact form.
847 my %global_to_output_map = (
848 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
849 # get stuck supporting it if things change. Since it is a STRING
850 # property, it normally would be listed in the pod, but INTERNAL_MAP
852 Unicode_1_Name => $INTERNAL_MAP,
854 Present_In => 0, # Suppress, as easily computed from Age
855 Block => 0, # Suppress, as Blocks.txt is retained.
857 # Suppress, as mapping can be found instead from the
858 # Perl_Decomposition_Mapping file
859 Decomposition_Type => 0,
862 # Properties that this program ignores.
863 my @unimplemented_properties;
865 # With this release, it is automatically handled if the Unihan db is
867 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
869 # There are several types of obsolete properties defined by Unicode. These
870 # must be hand-edited for every new Unicode release.
871 my %why_deprecated; # Generates a deprecated warning message if used.
872 my %why_stabilized; # Documentation only
873 my %why_obsolete; # Documentation only
876 my $simple = 'Perl uses the more complete version of this property';
877 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
879 my $other_properties = 'other properties';
880 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
881 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
884 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
885 'Jamo_Short_Name' => $contributory,
886 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
887 'Other_Alphabetic' => $contributory,
888 'Other_Default_Ignorable_Code_Point' => $contributory,
889 'Other_Grapheme_Extend' => $contributory,
890 'Other_ID_Continue' => $contributory,
891 'Other_ID_Start' => $contributory,
892 'Other_Lowercase' => $contributory,
893 'Other_Math' => $contributory,
894 'Other_Uppercase' => $contributory,
895 'Expands_On_NFC' => $why_no_expand,
896 'Expands_On_NFD' => $why_no_expand,
897 'Expands_On_NFKC' => $why_no_expand,
898 'Expands_On_NFKD' => $why_no_expand,
902 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
903 # contains the same information, but without the algorithmically
904 # determinable Hangul syllables'. This file is not published, so it's
905 # existence is not noted in the comment.
906 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
908 'Indic_Matra_Category' => "Provisional",
909 'Indic_Syllabic_Category' => "Provisional",
911 # Don't suppress ISO_Comment, as otherwise special handling is needed
912 # to differentiate between it and gc=c, which can be written as 'isc',
913 # which is the same characters as ISO_Comment's short name.
915 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
917 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
918 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
919 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
920 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
922 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
925 foreach my $property (
927 # The following are suppressed because they were made contributory
928 # or deprecated by Unicode before Perl ever thought about
937 # The following are suppressed because they have been marked
938 # as deprecated for a sufficient amount of time
940 'Other_Default_Ignorable_Code_Point',
941 'Other_Grapheme_Extend',
948 $why_suppressed{$property} = $why_deprecated{$property};
951 # Customize the message for all the 'Other_' properties
952 foreach my $property (keys %why_deprecated) {
953 next if (my $main_property = $property) !~ s/^Other_//;
954 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
958 if ($v_version ge 4.0.0) {
959 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
960 if ($v_version ge 6.0.0) {
961 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
964 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
965 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
966 if ($v_version ge 6.0.0) {
967 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
971 # Probably obsolete forever
972 if ($v_version ge v4.1.0) {
973 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
975 if ($v_version ge v6.0.0) {
976 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
977 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
980 # This program can create files for enumerated-like properties, such as
981 # 'Numeric_Type'. This file would be the same format as for a string
982 # property, with a mapping from code point to its value, so you could look up,
983 # for example, the script a code point is in. But no one so far wants this
984 # mapping, or they have found another way to get it since this is a new
985 # feature. So no file is generated except if it is in this list.
986 my @output_mapped_properties = split "\n", <<END;
989 # If you are using the Unihan database in a Unicode version before 5.2, you
990 # need to add the properties that you want to extract from it to this table.
991 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
992 # listed, commented out
993 my @cjk_properties = split "\n", <<'END';
994 #cjkAccountingNumeric; kAccountingNumeric
995 #cjkOtherNumeric; kOtherNumeric
996 #cjkPrimaryNumeric; kPrimaryNumeric
997 #cjkCompatibilityVariant; kCompatibilityVariant
999 #cjkIRG_GSource; kIRG_GSource
1000 #cjkIRG_HSource; kIRG_HSource
1001 #cjkIRG_JSource; kIRG_JSource
1002 #cjkIRG_KPSource; kIRG_KPSource
1003 #cjkIRG_KSource; kIRG_KSource
1004 #cjkIRG_TSource; kIRG_TSource
1005 #cjkIRG_USource; kIRG_USource
1006 #cjkIRG_VSource; kIRG_VSource
1007 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1010 # Similarly for the property values. For your convenience, the lines in the
1011 # 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
1012 # '#' marks (for Unicode versions before 5.2)
1013 my @cjk_property_values = split "\n", <<'END';
1014 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1015 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1016 ## @missing: 0000..10FFFF; cjkIICore; <none>
1017 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1018 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1019 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1020 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1021 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1022 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1023 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1024 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1025 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1026 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1027 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1030 # The input files don't list every code point. Those not listed are to be
1031 # defaulted to some value. Below are hard-coded what those values are for
1032 # non-binary properties as of 5.1. Starting in 5.0, there are
1033 # machine-parsable comment lines in the files the give the defaults; so this
1034 # list shouldn't have to be extended. The claim is that all missing entries
1035 # for binary properties will default to 'N'. Unicode tried to change that in
1036 # 5.2, but the beta period produced enough protest that they backed off.
1038 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1039 # be in the form that it expects. The others may be synonyms.
1040 my $CODE_POINT = '<code point>';
1041 my %default_mapping = (
1042 Age => "Unassigned",
1043 # Bidi_Class => Complicated; set in code
1044 Bidi_Mirroring_Glyph => "",
1045 Block => 'No_Block',
1046 Canonical_Combining_Class => 0,
1047 Case_Folding => $CODE_POINT,
1048 Decomposition_Mapping => $CODE_POINT,
1049 Decomposition_Type => 'None',
1050 East_Asian_Width => "Neutral",
1051 FC_NFKC_Closure => $CODE_POINT,
1052 General_Category => 'Cn',
1053 Grapheme_Cluster_Break => 'Other',
1054 Hangul_Syllable_Type => 'NA',
1056 Jamo_Short_Name => "",
1057 Joining_Group => "No_Joining_Group",
1058 # Joining_Type => Complicated; set in code
1059 kIICore => 'N', # Is converted to binary
1060 #Line_Break => Complicated; set in code
1061 Lowercase_Mapping => $CODE_POINT,
1068 Numeric_Type => 'None',
1069 Numeric_Value => 'NaN',
1070 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1071 Sentence_Break => 'Other',
1072 Simple_Case_Folding => $CODE_POINT,
1073 Simple_Lowercase_Mapping => $CODE_POINT,
1074 Simple_Titlecase_Mapping => $CODE_POINT,
1075 Simple_Uppercase_Mapping => $CODE_POINT,
1076 Titlecase_Mapping => $CODE_POINT,
1077 Unicode_1_Name => "",
1078 Unicode_Radical_Stroke => "",
1079 Uppercase_Mapping => $CODE_POINT,
1080 Word_Break => 'Other',
1083 # Below are files that Unicode furnishes, but this program ignores, and why
1084 my %ignored_files = (
1085 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1086 'Index.txt' => 'Alphabetical index of Unicode characters',
1087 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1088 'NamesList.txt' => 'Annotated list of characters',
1089 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1090 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1091 'ReadMe.txt' => 'Documentation',
1092 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1093 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1094 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1095 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1096 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1097 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1100 my %skipped_files; # List of files that we skip
1102 ### End of externally interesting definitions, except for @input_file_objects
1105 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1106 # This file is machine-generated by $0 from the Unicode
1107 # database, Version $string_version. Any changes made here will be lost!
1110 my $INTERNAL_ONLY_HEADER = <<"EOF";
1112 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1113 # This file is for internal use by core Perl only. The format and even the
1114 # name or existence of this file are subject to change without notice. Don't
1118 my $DEVELOPMENT_ONLY=<<"EOF";
1119 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1120 # This file contains information artificially constrained to code points
1121 # present in Unicode release $string_compare_versions.
1122 # IT CANNOT BE RELIED ON. It is for use during development only and should
1123 # not be used for production.
1127 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1128 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1129 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1131 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1132 # two must be 10; if there are 5, the first must not be a 0. Written this way
1133 # to decrease backtracking. The first regex allows the code point to be at
1134 # the end of a word, but to work properly, the word shouldn't end with a valid
1135 # hex character. The second one won't match a code point at the end of a
1136 # word, and doesn't have the run-on issue
1137 my $run_on_code_point_re =
1138 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1139 my $code_point_re = qr/\b$run_on_code_point_re/;
1141 # This matches the beginning of the line in the Unicode db files that give the
1142 # defaults for code points not listed (i.e., missing) in the file. The code
1143 # depends on this ending with a semi-colon, so it can assume it is a valid
1144 # field when the line is split() by semi-colons
1145 my $missing_defaults_prefix =
1146 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1148 # Property types. Unicode has more types, but these are sufficient for our
1150 my $UNKNOWN = -1; # initialized to illegal value
1151 my $NON_STRING = 1; # Either binary or enum
1153 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1154 # tables, additional true and false tables are
1155 # generated so that false is anything matching the
1156 # default value, and true is everything else.
1157 my $ENUM = 4; # Include catalog
1158 my $STRING = 5; # Anything else: string or misc
1160 # Some input files have lines that give default values for code points not
1161 # contained in the file. Sometimes these should be ignored.
1162 my $NO_DEFAULTS = 0; # Must evaluate to false
1163 my $NOT_IGNORED = 1;
1166 # Range types. Each range has a type. Most ranges are type 0, for normal,
1167 # and will appear in the main body of the tables in the output files, but
1168 # there are other types of ranges as well, listed below, that are specially
1169 # handled. There are pseudo-types as well that will never be stored as a
1170 # type, but will affect the calculation of the type.
1172 # 0 is for normal, non-specials
1173 my $MULTI_CP = 1; # Sequence of more than code point
1174 my $HANGUL_SYLLABLE = 2;
1175 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1176 my $NULL = 4; # The map is to the null string; utf8.c can't
1177 # handle these, nor is there an accepted syntax
1178 # for them in \p{} constructs
1179 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1180 # otherwise be $MULTI_CP type are instead type 0
1182 # process_generic_property_file() can accept certain overrides in its input.
1183 # Each of these must begin AND end with $CMD_DELIM.
1184 my $CMD_DELIM = "\a";
1185 my $REPLACE_CMD = 'replace'; # Override the Replace
1186 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1191 # Values for the Replace argument to add_range.
1192 # $NO # Don't replace; add only the code points not
1194 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1195 # the comments at the subroutine definition.
1196 my $UNCONDITIONALLY = 2; # Replace without conditions.
1197 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1199 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1201 my $CROAK = 6; # Die with an error if is already there
1203 # Flags to give property statuses. The phrases are to remind maintainers that
1204 # if the flag is changed, the indefinite article referring to it in the
1205 # documentation may need to be as well.
1207 my $DEPRECATED = 'D';
1208 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1209 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1210 my $DISCOURAGED = 'X';
1211 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1212 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1214 my $a_bold_stricter = "a 'B<$STRICTER>'";
1215 my $A_bold_stricter = "A 'B<$STRICTER>'";
1216 my $STABILIZED = 'S';
1217 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1218 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1220 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1221 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1223 my %status_past_participles = (
1224 $DISCOURAGED => 'discouraged',
1225 $STABILIZED => 'stabilized',
1226 $OBSOLETE => 'obsolete',
1227 $DEPRECATED => 'deprecated',
1230 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1231 # externally documented.
1232 my $ORDINARY = 0; # The normal fate.
1233 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1234 # but there is a file written that can be used to
1235 # reconstruct this table
1236 my $SUPPRESSED = 3; # The file for this table is not written out.
1237 my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
1238 # for Perl's internal use only
1239 my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
1240 # Unicode version that doesn't have it, but we need it
1241 # to be defined, if empty, to have things work.
1242 # Implies no pod entry generated
1244 # The format of the values of the tables:
1245 my $EMPTY_FORMAT = "";
1246 my $BINARY_FORMAT = 'b';
1247 my $DECIMAL_FORMAT = 'd';
1248 my $FLOAT_FORMAT = 'f';
1249 my $INTEGER_FORMAT = 'i';
1250 my $HEX_FORMAT = 'x';
1251 my $RATIONAL_FORMAT = 'r';
1252 my $STRING_FORMAT = 's';
1253 my $ADJUST_FORMAT = 'a';
1254 my $DECOMP_STRING_FORMAT = 'c';
1255 my $STRING_WHITE_SPACE_LIST = 'sw';
1257 my %map_table_formats = (
1258 $BINARY_FORMAT => 'binary',
1259 $DECIMAL_FORMAT => 'single decimal digit',
1260 $FLOAT_FORMAT => 'floating point number',
1261 $INTEGER_FORMAT => 'integer',
1262 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1263 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1264 $STRING_FORMAT => 'string',
1265 $ADJUST_FORMAT => 'some entries need adjustment',
1266 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1267 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1270 # Unicode didn't put such derived files in a separate directory at first.
1271 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1272 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1273 my $AUXILIARY = 'auxiliary';
1275 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1276 # and into UCD.pl for the use of UCD.pm
1277 my %loose_to_file_of; # loosely maps table names to their respective
1279 my %stricter_to_file_of; # same; but for stricter mapping.
1280 my %loose_property_to_file_of; # Maps a loose property name to its map file
1281 my %file_to_swash_name; # Maps the file name to its corresponding key name
1282 # in the hash %utf8::SwashInfo
1283 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1284 # their rational equivalent
1285 my %loose_property_name_of; # Loosely maps (non_string) property names to
1287 my %string_property_loose_to_name; # Same, for string properties.
1288 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1289 # the property name in standard loose form, and
1290 # 'value' is the default value for that property,
1291 # also in standard loose form.
1292 my %loose_to_standard_value; # loosely maps table names to the canonical
1294 my %ambiguous_names; # keys are alias names (in standard form) that
1295 # have more than one possible meaning.
1296 my %prop_aliases; # Keys are standard property name; values are each
1298 my %prop_value_aliases; # Keys of top level are standard property name;
1299 # values are keys to another hash, Each one is
1300 # one of the property's values, in standard form.
1301 # The values are that prop-val's aliases.
1302 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1304 # Most properties are immune to caseless matching, otherwise you would get
1305 # nonsensical results, as properties are a function of a code point, not
1306 # everything that is caselessly equivalent to that code point. For example,
1307 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1308 # be true because 's' and 'S' are equivalent caselessly. However,
1309 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1310 # extend that concept to those very few properties that are like this. Each
1311 # such property will match the full range caselessly. They are hard-coded in
1312 # the program; it's not worth trying to make it general as it's extremely
1313 # unlikely that they will ever change.
1314 my %caseless_equivalent_to;
1316 # These constants names and values were taken from the Unicode standard,
1317 # version 5.1, section 3.12. They are used in conjunction with Hangul
1318 # syllables. The '_string' versions are so generated tables can retain the
1319 # hex format, which is the more familiar value
1320 my $SBase_string = "0xAC00";
1321 my $SBase = CORE::hex $SBase_string;
1322 my $LBase_string = "0x1100";
1323 my $LBase = CORE::hex $LBase_string;
1324 my $VBase_string = "0x1161";
1325 my $VBase = CORE::hex $VBase_string;
1326 my $TBase_string = "0x11A7";
1327 my $TBase = CORE::hex $TBase_string;
1332 my $NCount = $VCount * $TCount;
1334 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1335 # with the above published constants.
1337 my %Jamo_L; # Leading consonants
1338 my %Jamo_V; # Vowels
1339 my %Jamo_T; # Trailing consonants
1341 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1342 # The key is the base name of the code point, and the value is an
1343 # array giving all the ranges that use this base name. Each range
1344 # is actually a hash giving the 'low' and 'high' values of it.
1345 my %names_ending_in_code_point;
1346 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1347 # removed from the names
1348 # Inverse mapping. The list of ranges that have these kinds of
1349 # names. Each element contains the low, high, and base names in an
1351 my @code_points_ending_in_code_point;
1353 # To hold Unicode's normalization test suite
1354 my @normalization_tests;
1356 # Boolean: does this Unicode version have the hangul syllables, and are we
1357 # writing out a table for them?
1358 my $has_hangul_syllables = 0;
1360 # Does this Unicode version have code points whose names end in their
1361 # respective code points, and are we writing out a table for them? 0 for no;
1362 # otherwise points to first property that a table is needed for them, so that
1363 # if multiple tables are needed, we don't create duplicates
1364 my $needing_code_points_ending_in_code_point = 0;
1366 my @backslash_X_tests; # List of tests read in for testing \X
1367 my @unhandled_properties; # Will contain a list of properties found in
1368 # the input that we didn't process.
1369 my @match_properties; # Properties that have match tables, to be
1371 my @map_properties; # Properties that get map files written
1372 my @named_sequences; # NamedSequences.txt contents.
1373 my %potential_files; # Generated list of all .txt files in the directory
1374 # structure so we can warn if something is being
1376 my @files_actually_output; # List of files we generated.
1377 my @more_Names; # Some code point names are compound; this is used
1378 # to store the extra components of them.
1379 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1380 # the minimum before we consider it equivalent to a
1381 # candidate rational
1382 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1384 # These store references to certain commonly used property objects
1393 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1394 my $has_In_conflicts = 0;
1395 my $has_Is_conflicts = 0;
1397 sub internal_file_to_platform ($) {
1398 # Convert our file paths which have '/' separators to those of the
1402 return undef unless defined $file;
1404 return File::Spec->join(split '/', $file);
1407 sub file_exists ($) { # platform independent '-e'. This program internally
1408 # uses slash as a path separator.
1410 return 0 if ! defined $file;
1411 return -e internal_file_to_platform($file);
1415 # Returns the address of the blessed input object.
1416 # It doesn't check for blessedness because that would do a string eval
1417 # every call, and the program is structured so that this is never called
1418 # for a non-blessed object.
1420 no overloading; # If overloaded, numifying below won't work.
1422 # Numifying a ref gives its address.
1423 return pack 'J', $_[0];
1426 # These are used only if $annotate is true.
1427 # The entire range of Unicode characters is examined to populate these
1428 # after all the input has been processed. But most can be skipped, as they
1429 # have the same descriptive phrases, such as being unassigned
1430 my @viacode; # Contains the 1 million character names
1431 my @printable; # boolean: And are those characters printable?
1432 my @annotate_char_type; # Contains a type of those characters, specifically
1433 # for the purposes of annotation.
1434 my $annotate_ranges; # A map of ranges of code points that have the same
1435 # name for the purposes of annotation. They map to the
1436 # upper edge of the range, so that the end point can
1437 # be immediately found. This is used to skip ahead to
1438 # the end of a range, and avoid processing each
1439 # individual code point in it.
1440 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1441 # characters, but excluding those which are
1442 # also noncharacter code points
1444 # The annotation types are an extension of the regular range types, though
1445 # some of the latter are folded into one. Make the new types negative to
1446 # avoid conflicting with the regular types
1447 my $SURROGATE_TYPE = -1;
1448 my $UNASSIGNED_TYPE = -2;
1449 my $PRIVATE_USE_TYPE = -3;
1450 my $NONCHARACTER_TYPE = -4;
1451 my $CONTROL_TYPE = -5;
1452 my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1454 sub populate_char_info ($) {
1455 # Used only with the $annotate option. Populates the arrays with the
1456 # input code point's info that are needed for outputting more detailed
1457 # comments. If calling context wants a return, it is the end point of
1458 # any contiguous range of characters that share essentially the same info
1461 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1463 $viacode[$i] = $perl_charname->value_of($i) || "";
1465 # A character is generally printable if Unicode says it is,
1466 # but below we make sure that most Unicode general category 'C' types
1468 $printable[$i] = $print->contains($i);
1470 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1472 # Only these two regular types are treated specially for annotations
1474 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1475 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1477 # Give a generic name to all code points that don't have a real name.
1478 # We output ranges, if applicable, for these. Also calculate the end
1479 # point of the range.
1481 if (! $viacode[$i]) {
1482 if ($gc-> table('Surrogate')->contains($i)) {
1483 $viacode[$i] = 'Surrogate';
1484 $annotate_char_type[$i] = $SURROGATE_TYPE;
1486 $end = $gc->table('Surrogate')->containing_range($i)->end;
1488 elsif ($gc-> table('Private_use')->contains($i)) {
1489 $viacode[$i] = 'Private Use';
1490 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1492 $end = $gc->table('Private_Use')->containing_range($i)->end;
1494 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1497 $viacode[$i] = 'Noncharacter';
1498 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1500 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1501 containing_range($i)->end;
1503 elsif ($gc-> table('Control')->contains($i)) {
1504 $viacode[$i] = 'Control';
1505 $annotate_char_type[$i] = $CONTROL_TYPE;
1507 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1509 elsif ($gc-> table('Unassigned')->contains($i)) {
1510 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1511 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1514 # Because we name the unassigned by the blocks they are in, it
1515 # can't go past the end of that block, and it also can't go past
1516 # the unassigned range it is in. The special table makes sure
1517 # that the non-characters, which are unassigned, are separated
1519 $end = min($block->containing_range($i)->end,
1520 $unassigned_sans_noncharacters-> containing_range($i)->
1524 Carp::my_carp_bug("Can't figure out how to annotate "
1525 . sprintf("U+%04X", $i)
1526 . ". Proceeding anyway.");
1527 $viacode[$i] = 'UNKNOWN';
1528 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1533 # Here, has a name, but if it's one in which the code point number is
1534 # appended to the name, do that.
1535 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1536 $viacode[$i] .= sprintf("-%04X", $i);
1537 $end = $perl_charname->containing_range($i)->end;
1540 # And here, has a name, but if it's a hangul syllable one, replace it with
1541 # the correct name from the Unicode algorithm
1542 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1544 my $SIndex = $i - $SBase;
1545 my $L = $LBase + $SIndex / $NCount;
1546 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1547 my $T = $TBase + $SIndex % $TCount;
1548 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1549 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1550 $end = $perl_charname->containing_range($i)->end;
1553 return if ! defined wantarray;
1554 return $i if ! defined $end; # If not a range, return the input
1556 # Save this whole range so can find the end point quickly
1557 $annotate_ranges->add_map($i, $end, $end);
1562 # Commented code below should work on Perl 5.8.
1563 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1564 ## the native perl version of it (which is what would operate under miniperl)
1565 ## is extremely slow, as it does a string eval every call.
1566 #my $has_fast_scalar_util = $
\18 !~ /miniperl/
1567 # && defined eval "require Scalar::Util";
1570 # # Returns the address of the blessed input object. Uses the XS version if
1571 # # available. It doesn't check for blessedness because that would do a
1572 # # string eval every call, and the program is structured so that this is
1573 # # never called for a non-blessed object.
1575 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1577 # # Check at least that is a ref.
1578 # my $pkg = ref($_[0]) or return undef;
1580 # # Change to a fake package to defeat any overloaded stringify
1581 # bless $_[0], 'main::Fake';
1583 # # Numifying a ref gives its address.
1584 # my $addr = pack 'J', $_[0];
1586 # # Return to original class
1587 # bless $_[0], $pkg;
1594 return $a if $a >= $b;
1601 return $a if $a <= $b;
1605 sub clarify_number ($) {
1606 # This returns the input number with underscores inserted every 3 digits
1607 # in large (5 digits or more) numbers. Input must be entirely digits, not
1611 my $pos = length($number) - 3;
1612 return $number if $pos <= 1;
1614 substr($number, $pos, 0) = '_';
1623 # These routines give a uniform treatment of messages in this program. They
1624 # are placed in the Carp package to cause the stack trace to not include them,
1625 # although an alternative would be to use another package and set @CARP_NOT
1628 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1630 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1631 # and overload trying to load Scalar:Util under miniperl. See
1632 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1633 undef $overload::VERSION;
1636 my $message = shift || "";
1637 my $nofold = shift || 0;
1640 $message = main::join_lines($message);
1641 $message =~ s/^$0: *//; # Remove initial program name
1642 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1643 $message = "\n$0: $message;";
1645 # Fold the message with program name, semi-colon end punctuation
1646 # (which looks good with the message that carp appends to it), and a
1647 # hanging indent for continuation lines.
1648 $message = main::simple_fold($message, "", 4) unless $nofold;
1649 $message =~ s/\n$//; # Remove the trailing nl so what carp
1650 # appends is to the same line
1653 return $message if defined wantarray; # If a caller just wants the msg
1660 # This is called when it is clear that the problem is caused by a bug in
1663 my $message = shift;
1664 $message =~ s/^$0: *//;
1665 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1670 sub carp_too_few_args {
1672 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1676 my $args_ref = shift;
1679 my_carp_bug("Need at least $count arguments to "
1681 . ". Instead got: '"
1682 . join ', ', @$args_ref
1683 . "'. No action taken.");
1687 sub carp_extra_args {
1688 my $args_ref = shift;
1689 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1691 unless (ref $args_ref) {
1692 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1695 my ($package, $file, $line) = caller;
1696 my $subroutine = (caller 1)[3];
1699 if (ref $args_ref eq 'HASH') {
1700 foreach my $key (keys %$args_ref) {
1701 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1703 $list = join ', ', each %{$args_ref};
1705 elsif (ref $args_ref eq 'ARRAY') {
1706 foreach my $arg (@$args_ref) {
1707 $arg = $UNDEF unless defined $arg;
1709 $list = join ', ', @$args_ref;
1712 my_carp_bug("Can't cope with ref "
1714 . " . argument to 'carp_extra_args'. Not checking arguments.");
1718 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1726 # This program uses the inside-out method for objects, as recommended in
1727 # "Perl Best Practices". This closure aids in generating those. There
1728 # are two routines. setup_package() is called once per package to set
1729 # things up, and then set_access() is called for each hash representing a
1730 # field in the object. These routines arrange for the object to be
1731 # properly destroyed when no longer used, and for standard accessor
1732 # functions to be generated. If you need more complex accessors, just
1733 # write your own and leave those accesses out of the call to set_access().
1734 # More details below.
1736 my %constructor_fields; # fields that are to be used in constructors; see
1739 # The values of this hash will be the package names as keys to other
1740 # hashes containing the name of each field in the package as keys, and
1741 # references to their respective hashes as values.
1745 # Sets up the package, creating standard DESTROY and dump methods
1746 # (unless already defined). The dump method is used in debugging by
1748 # The optional parameters are:
1749 # a) a reference to a hash, that gets populated by later
1750 # set_access() calls with one of the accesses being
1751 # 'constructor'. The caller can then refer to this, but it is
1752 # not otherwise used by these two routines.
1753 # b) a reference to a callback routine to call during destruction
1754 # of the object, before any fields are actually destroyed
1757 my $constructor_ref = delete $args{'Constructor_Fields'};
1758 my $destroy_callback = delete $args{'Destroy_Callback'};
1759 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1762 my $package = (caller)[0];
1764 $package_fields{$package} = \%fields;
1765 $constructor_fields{$package} = $constructor_ref;
1767 unless ($package->can('DESTROY')) {
1768 my $destroy_name = "${package}::DESTROY";
1771 # Use typeglob to give the anonymous subroutine the name we want
1772 *$destroy_name = sub {
1774 my $addr = do { no overloading; pack 'J', $self; };
1776 $self->$destroy_callback if $destroy_callback;
1777 foreach my $field (keys %{$package_fields{$package}}) {
1778 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1779 delete $package_fields{$package}{$field}{$addr};
1785 unless ($package->can('dump')) {
1786 my $dump_name = "${package}::dump";
1790 return dump_inside_out($self, $package_fields{$package}, @_);
1797 # Arrange for the input field to be garbage collected when no longer
1798 # needed. Also, creates standard accessor functions for the field
1799 # based on the optional parameters-- none if none of these parameters:
1800 # 'addable' creates an 'add_NAME()' accessor function.
1801 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1803 # 'settable' creates a 'set_NAME()' accessor function.
1804 # 'constructor' doesn't create an accessor function, but adds the
1805 # field to the hash that was previously passed to
1807 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1808 # 'add' etc. all mean 'addable'.
1809 # The read accessor function will work on both array and scalar
1810 # values. If another accessor in the parameter list is 'a', the read
1811 # access assumes an array. You can also force it to be array access
1812 # by specifying 'readable_array' instead of 'readable'
1814 # A sort-of 'protected' access can be set-up by preceding the addable,
1815 # readable or settable with some initial portion of 'protected_' (but,
1816 # the underscore is required), like 'p_a', 'pro_set', etc. The
1817 # "protection" is only by convention. All that happens is that the
1818 # accessor functions' names begin with an underscore. So instead of
1819 # calling set_foo, the call is _set_foo. (Real protection could be
1820 # accomplished by having a new subroutine, end_package, called at the
1821 # end of each package, and then storing the __LINE__ ranges and
1822 # checking them on every accessor. But that is way overkill.)
1824 # We create anonymous subroutines as the accessors and then use
1825 # typeglobs to assign them to the proper package and name
1827 my $name = shift; # Name of the field
1828 my $field = shift; # Reference to the inside-out hash containing the
1831 my $package = (caller)[0];
1833 if (! exists $package_fields{$package}) {
1834 croak "$0: Must call 'setup_package' before 'set_access'";
1837 # Stash the field so DESTROY can get it.
1838 $package_fields{$package}{$name} = $field;
1840 # Remaining arguments are the accessors. For each...
1841 foreach my $access (@_) {
1842 my $access = lc $access;
1846 # Match the input as far as it goes.
1847 if ($access =~ /^(p[^_]*)_/) {
1849 if (substr('protected_', 0, length $protected)
1853 # Add 1 for the underscore not included in $protected
1854 $access = substr($access, length($protected) + 1);
1862 if (substr('addable', 0, length $access) eq $access) {
1863 my $subname = "${package}::${protected}add_$name";
1866 # add_ accessor. Don't add if already there, which we
1867 # determine using 'eq' for scalars and '==' otherwise.
1870 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1873 my $addr = do { no overloading; pack 'J', $self; };
1874 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1876 return if grep { $value == $_ } @{$field->{$addr}};
1879 return if grep { $value eq $_ } @{$field->{$addr}};
1881 push @{$field->{$addr}}, $value;
1885 elsif (substr('constructor', 0, length $access) eq $access) {
1887 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1890 $constructor_fields{$package}{$name} = $field;
1893 elsif (substr('readable_array', 0, length $access) eq $access) {
1895 # Here has read access. If one of the other parameters for
1896 # access is array, or this one specifies array (by being more
1897 # than just 'readable_'), then create a subroutine that
1898 # assumes the data is an array. Otherwise just a scalar
1899 my $subname = "${package}::${protected}$name";
1900 if (grep { /^a/i } @_
1901 or length($access) > length('readable_'))
1906 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1907 my $addr = do { no overloading; pack 'J', $_[0]; };
1908 if (ref $field->{$addr} ne 'ARRAY') {
1909 my $type = ref $field->{$addr};
1910 $type = 'scalar' unless $type;
1911 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1914 return scalar @{$field->{$addr}} unless wantarray;
1916 # Make a copy; had problems with caller modifying the
1917 # original otherwise
1918 my @return = @{$field->{$addr}};
1924 # Here not an array value, a simpler function.
1928 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1930 return $field->{pack 'J', $_[0]};
1934 elsif (substr('settable', 0, length $access) eq $access) {
1935 my $subname = "${package}::${protected}set_$name";
1940 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1941 Carp::carp_extra_args(\@_) if @_ > 2;
1943 # $self is $_[0]; $value is $_[1]
1945 $field->{pack 'J', $_[0]} = $_[1];
1950 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1959 # All input files use this object, which stores various attributes about them,
1960 # and provides for convenient, uniform handling. The run method wraps the
1961 # processing. It handles all the bookkeeping of opening, reading, and closing
1962 # the file, returning only significant input lines.
1964 # Each object gets a handler which processes the body of the file, and is
1965 # called by run(). Most should use the generic, default handler, which has
1966 # code scrubbed to handle things you might not expect. A handler should
1967 # basically be a while(next_line()) {...} loop.
1969 # You can also set up handlers to
1970 # 1) call before the first line is read for pre processing
1971 # 2) call to adjust each line of the input before the main handler gets them
1972 # 3) call upon EOF before the main handler exits its loop
1973 # 4) call at the end for post processing
1975 # $_ is used to store the input line, and is to be filtered by the
1976 # each_line_handler()s. So, if the format of the line is not in the desired
1977 # format for the main handler, these are used to do that adjusting. They can
1978 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1979 # so the $_ output of one is used as the input to the next. None of the other
1980 # handlers are stackable, but could easily be changed to be so.
1982 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1983 # which insert the parameters as lines to be processed before the next input
1984 # file line is read. This allows the EOF handler to flush buffers, for
1985 # example. The difference between the two routines is that the lines inserted
1986 # by insert_lines() are subjected to the each_line_handler()s. (So if you
1987 # called it from such a handler, you would get infinite recursion.) Lines
1988 # inserted by insert_adjusted_lines() go directly to the main handler without
1989 # any adjustments. If the post-processing handler calls any of these, there
1990 # will be no effect. Some error checking for these conditions could be added,
1991 # but it hasn't been done.
1993 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1994 # to prevent further processing of the line. This routine will output the
1995 # message as a warning once, and then keep a count of the lines that have the
1996 # same message, and output that count at the end of the file's processing.
1997 # This keeps the number of messages down to a manageable amount.
1999 # get_missings() should be called to retrieve any @missing input lines.
2000 # Messages will be raised if this isn't done if the options aren't to ignore
2003 sub trace { return main::trace(@_); }
2006 # Keep track of fields that are to be put into the constructor.
2007 my %constructor_fields;
2009 main::setup_package(Constructor_Fields => \%constructor_fields);
2011 my %file; # Input file name, required
2012 main::set_access('file', \%file, qw{ c r });
2014 my %first_released; # Unicode version file was first released in, required
2015 main::set_access('first_released', \%first_released, qw{ c r });
2017 my %handler; # Subroutine to process the input file, defaults to
2018 # 'process_generic_property_file'
2019 main::set_access('handler', \%handler, qw{ c });
2022 # name of property this file is for. defaults to none, meaning not
2023 # applicable, or is otherwise determinable, for example, from each line.
2024 main::set_access('property', \%property, qw{ c });
2027 # If this is true, the file is optional. If not present, no warning is
2028 # output. If it is present, the string given by this parameter is
2029 # evaluated, and if false the file is not processed.
2030 main::set_access('optional', \%optional, 'c', 'r');
2033 # This is used for debugging, to skip processing of all but a few input
2034 # files. Add 'non_skip => 1' to the constructor for those files you want
2035 # processed when you set the $debug_skip global.
2036 main::set_access('non_skip', \%non_skip, 'c');
2039 # This is used to skip processing of this input file semi-permanently,
2040 # when it evaluates to true. The value should be the reason the file is
2041 # being skipped. It is used for files that we aren't planning to process
2042 # anytime soon, but want to allow to be in the directory and not raise a
2043 # message that we are not handling. Mostly for test files. This is in
2044 # contrast to the non_skip element, which is supposed to be used very
2045 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2046 # pretty much will never look at can be placed in the global
2047 # %ignored_files instead. Ones used here will be added to %skipped files
2048 main::set_access('skip', \%skip, 'c');
2050 my %each_line_handler;
2051 # list of subroutines to look at and filter each non-comment line in the
2052 # file. defaults to none. The subroutines are called in order, each is
2053 # to adjust $_ for the next one, and the final one adjusts it for
2055 main::set_access('each_line_handler', \%each_line_handler, 'c');
2057 my %has_missings_defaults;
2058 # ? Are there lines in the file giving default values for code points
2059 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2060 # the norm, but IGNORED means it has such lines, but the handler doesn't
2061 # use them. Having these three states allows us to catch changes to the
2062 # UCD that this program should track
2063 main::set_access('has_missings_defaults',
2064 \%has_missings_defaults, qw{ c r });
2067 # Subroutine to call before doing anything else in the file. If undef, no
2068 # such handler is called.
2069 main::set_access('pre_handler', \%pre_handler, qw{ c });
2072 # Subroutine to call upon getting an EOF on the input file, but before
2073 # that is returned to the main handler. This is to allow buffers to be
2074 # flushed. The handler is expected to call insert_lines() or
2075 # insert_adjusted() with the buffered material
2076 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2079 # Subroutine to call after all the lines of the file are read in and
2080 # processed. If undef, no such handler is called.
2081 main::set_access('post_handler', \%post_handler, qw{ c });
2083 my %progress_message;
2084 # Message to print to display progress in lieu of the standard one
2085 main::set_access('progress_message', \%progress_message, qw{ c });
2088 # cache open file handle, internal. Is undef if file hasn't been
2089 # processed at all, empty if has;
2090 main::set_access('handle', \%handle);
2093 # cache of lines added virtually to the file, internal
2094 main::set_access('added_lines', \%added_lines);
2097 # cache of errors found, internal
2098 main::set_access('errors', \%errors);
2101 # storage of '@missing' defaults lines
2102 main::set_access('missings', \%missings);
2107 my $self = bless \do{ my $anonymous_scalar }, $class;
2108 my $addr = do { no overloading; pack 'J', $self; };
2111 $handler{$addr} = \&main::process_generic_property_file;
2112 $non_skip{$addr} = 0;
2114 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2115 $handle{$addr} = undef;
2116 $added_lines{$addr} = [ ];
2117 $each_line_handler{$addr} = [ ];
2118 $errors{$addr} = { };
2119 $missings{$addr} = [ ];
2121 # Two positional parameters.
2122 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2123 $file{$addr} = main::internal_file_to_platform(shift);
2124 $first_released{$addr} = shift;
2126 # The rest of the arguments are key => value pairs
2127 # %constructor_fields has been set up earlier to list all possible
2128 # ones. Either set or push, depending on how the default has been set
2131 foreach my $key (keys %args) {
2132 my $argument = $args{$key};
2134 # Note that the fields are the lower case of the constructor keys
2135 my $hash = $constructor_fields{lc $key};
2136 if (! defined $hash) {
2137 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2140 if (ref $hash->{$addr} eq 'ARRAY') {
2141 if (ref $argument eq 'ARRAY') {
2142 foreach my $argument (@{$argument}) {
2143 next if ! defined $argument;
2144 push @{$hash->{$addr}}, $argument;
2148 push @{$hash->{$addr}}, $argument if defined $argument;
2152 $hash->{$addr} = $argument;
2157 # If the file has a property for it, it means that the property is not
2158 # listed in the file's entries. So add a handler to the list of line
2159 # handlers to insert the property name into the lines, to provide a
2160 # uniform interface to the final processing subroutine.
2161 # the final code doesn't have to worry about that.
2162 if ($property{$addr}) {
2163 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2166 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2167 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2170 # If skipping, set to optional, and add to list of ignored files,
2171 # including its reason
2173 $optional{$addr} = 1;
2174 $skipped_files{$file{$addr}} = $skip{$addr}
2183 qw("") => "_operator_stringify",
2184 "." => \&main::_operator_dot,
2185 ".=" => \&main::_operator_dot_equal,
2188 sub _operator_stringify {
2191 return __PACKAGE__ . " object for " . $self->file;
2194 # flag to make sure extracted files are processed early
2195 my $seen_non_extracted_non_age = 0;
2198 # Process the input object $self. This opens and closes the file and
2199 # calls all the handlers for it. Currently, this can only be called
2200 # once per file, as it destroy's the EOF handler
2203 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2205 my $addr = do { no overloading; pack 'J', $self; };
2207 my $file = $file{$addr};
2209 # Don't process if not expecting this file (because released later
2210 # than this Unicode version), and isn't there. This means if someone
2211 # copies it into an earlier version's directory, we will go ahead and
2213 return if $first_released{$addr} gt $v_version && ! -e $file;
2215 # If in debugging mode and this file doesn't have the non-skip
2216 # flag set, and isn't one of the critical files, skip it.
2218 && $first_released{$addr} ne v0
2219 && ! $non_skip{$addr})
2221 print "Skipping $file in debugging\n" if $verbosity;
2225 # File could be optional
2226 if ($optional{$addr}) {
2227 return unless -e $file;
2228 my $result = eval $optional{$addr};
2229 if (! defined $result) {
2230 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2235 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2241 if (! defined $file || ! -e $file) {
2243 # If the file doesn't exist, see if have internal data for it
2244 # (based on first_released being 0).
2245 if ($first_released{$addr} eq v0) {
2246 $handle{$addr} = 'pretend_is_open';
2249 if (! $optional{$addr} # File could be optional
2250 && $v_version ge $first_released{$addr})
2252 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2259 # Here, the file exists. Some platforms may change the case of
2261 if ($seen_non_extracted_non_age) {
2262 if ($file =~ /$EXTRACTED/i) {
2263 Carp::my_carp_bug(main::join_lines(<<END
2264 $file should be processed just after the 'Prop...Alias' files, and before
2265 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2266 have subtle problems
2271 elsif ($EXTRACTED_DIR
2272 && $first_released{$addr} ne v0
2273 && $file !~ /$EXTRACTED/i
2274 && lc($file) ne 'dage.txt')
2276 # We don't set this (by the 'if' above) if we have no
2277 # extracted directory, so if running on an early version,
2278 # this test won't work. Not worth worrying about.
2279 $seen_non_extracted_non_age = 1;
2282 # And mark the file as having being processed, and warn if it
2283 # isn't a file we are expecting. As we process the files,
2284 # they are deleted from the hash, so any that remain at the
2285 # end of the program are files that we didn't process.
2286 my $fkey = File::Spec->rel2abs($file);
2287 my $expecting = delete $potential_files{lc($fkey)};
2289 Carp::my_carp("Was not expecting '$file'.") if
2291 && ! defined $handle{$addr};
2293 # Having deleted from expected files, we can quit if not to do
2294 # anything. Don't print progress unless really want verbosity
2296 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2300 # Open the file, converting the slashes used in this program
2301 # into the proper form for the OS
2303 if (not open $file_handle, "<", $file) {
2304 Carp::my_carp("Can't open $file. Skipping: $!");
2307 $handle{$addr} = $file_handle; # Cache the open file handle
2310 if ($verbosity >= $PROGRESS) {
2311 if ($progress_message{$addr}) {
2312 print "$progress_message{$addr}\n";
2315 # If using a virtual file, say so.
2316 print "Processing ", (-e $file)
2318 : "substitute $file",
2324 # Call any special handler for before the file.
2325 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2327 # Then the main handler
2328 &{$handler{$addr}}($self);
2330 # Then any special post-file handler.
2331 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2333 # If any errors have been accumulated, output the counts (as the first
2334 # error message in each class was output when it was encountered).
2335 if ($errors{$addr}) {
2338 foreach my $error (keys %{$errors{$addr}}) {
2339 $total += $errors{$addr}->{$error};
2340 delete $errors{$addr}->{$error};
2345 = "A total of $total lines had errors in $file. ";
2347 $message .= ($types == 1)
2348 ? '(Only the first one was displayed.)'
2349 : '(Only the first of each type was displayed.)';
2350 Carp::my_carp($message);
2354 if (@{$missings{$addr}}) {
2355 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2358 # If a real file handle, close it.
2359 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2361 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2362 # the file, as opposed to undef
2367 # Sets $_ to be the next logical input line, if any. Returns non-zero
2368 # if such a line exists. 'logical' means that any lines that have
2369 # been added via insert_lines() will be returned in $_ before the file
2373 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2375 my $addr = do { no overloading; pack 'J', $self; };
2377 # Here the file is open (or if the handle is not a ref, is an open
2378 # 'virtual' file). Get the next line; any inserted lines get priority
2379 # over the file itself.
2383 while (1) { # Loop until find non-comment, non-empty line
2384 #local $to_trace = 1 if main::DEBUG;
2385 my $inserted_ref = shift @{$added_lines{$addr}};
2386 if (defined $inserted_ref) {
2387 ($adjusted, $_) = @{$inserted_ref};
2388 trace $adjusted, $_ if main::DEBUG && $to_trace;
2389 return 1 if $adjusted;
2392 last if ! ref $handle{$addr}; # Don't read unless is real file
2393 last if ! defined ($_ = readline $handle{$addr});
2396 trace $_ if main::DEBUG && $to_trace;
2398 # See if this line is the comment line that defines what property
2399 # value that code points that are not listed in the file should
2400 # have. The format or existence of these lines is not guaranteed
2401 # by Unicode since they are comments, but the documentation says
2402 # that this was added for machine-readability, so probably won't
2403 # change. This works starting in Unicode Version 5.0. They look
2406 # @missing: 0000..10FFFF; Not_Reordered
2407 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2408 # @missing: 0000..10FFFF; ; NaN
2410 # Save the line for a later get_missings() call.
2411 if (/$missing_defaults_prefix/) {
2412 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2413 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2415 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2416 my @defaults = split /\s* ; \s*/x, $_;
2418 # The first field is the @missing, which ends in a
2419 # semi-colon, so can safely shift.
2422 # Some of these lines may have empty field placeholders
2423 # which get in the way. An example is:
2424 # @missing: 0000..10FFFF; ; NaN
2425 # Remove them. Process starting from the top so the
2426 # splice doesn't affect things still to be looked at.
2427 for (my $i = @defaults - 1; $i >= 0; $i--) {
2428 next if $defaults[$i] ne "";
2429 splice @defaults, $i, 1;
2432 # What's left should be just the property (maybe) and the
2433 # default. Having only one element means it doesn't have
2437 if (@defaults >= 1) {
2438 if (@defaults == 1) {
2439 $default = $defaults[0];
2442 $property = $defaults[0];
2443 $default = $defaults[1];
2449 || ($default =~ /^</
2450 && $default !~ /^<code *point>$/i
2451 && $default !~ /^<none>$/i
2452 && $default !~ /^<script>$/i))
2454 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2458 # If the property is missing from the line, it should
2459 # be the one for the whole file
2460 $property = $property{$addr} if ! defined $property;
2462 # Change <none> to the null string, which is what it
2463 # really means. If the default is the code point
2464 # itself, set it to <code point>, which is what
2465 # Unicode uses (but sometimes they've forgotten the
2467 if ($default =~ /^<none>$/i) {
2470 elsif ($default =~ /^<code *point>$/i) {
2471 $default = $CODE_POINT;
2473 elsif ($default =~ /^<script>$/i) {
2475 # Special case this one. Currently is from
2476 # ScriptExtensions.txt, and means for all unlisted
2477 # code points, use their Script property values.
2478 # For the code points not listed in that file, the
2479 # default value is 'Unknown'.
2480 $default = "Unknown";
2483 # Store them as a sub-arrays with both components.
2484 push @{$missings{$addr}}, [ $default, $property ];
2488 # There is nothing for the caller to process on this comment
2493 # Remove comments and trailing space, and skip this line if the
2499 # Call any handlers for this line, and skip further processing of
2500 # the line if the handler sets the line to null.
2501 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2506 # Here the line is ok. return success.
2508 } # End of looping through lines.
2510 # If there is an EOF handler, call it (only once) and if it generates
2511 # more lines to process go back in the loop to handle them.
2512 if ($eof_handler{$addr}) {
2513 &{$eof_handler{$addr}}($self);
2514 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2515 goto LINE if $added_lines{$addr};
2518 # Return failure -- no more lines.
2523 # Not currently used, not fully tested.
2525 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2526 # # record. Not callable from an each_line_handler(), nor does it call
2527 # # an each_line_handler() on the line.
2530 # my $addr = do { no overloading; pack 'J', $self; };
2532 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2533 # my ($adjusted, $line) = @{$inserted_ref};
2534 # next if $adjusted;
2536 # # Remove comments and trailing space, and return a non-empty
2539 # $line =~ s/\s+$//;
2540 # return $line if $line ne "";
2543 # return if ! ref $handle{$addr}; # Don't read unless is real file
2544 # while (1) { # Loop until find non-comment, non-empty line
2545 # local $to_trace = 1 if main::DEBUG;
2546 # trace $_ if main::DEBUG && $to_trace;
2547 # return if ! defined (my $line = readline $handle{$addr});
2549 # push @{$added_lines{$addr}}, [ 0, $line ];
2552 # $line =~ s/\s+$//;
2553 # return $line if $line ne "";
2561 # Lines can be inserted so that it looks like they were in the input
2562 # file at the place it was when this routine is called. See also
2563 # insert_adjusted_lines(). Lines inserted via this routine go through
2564 # any each_line_handler()
2568 # Each inserted line is an array, with the first element being 0 to
2569 # indicate that this line hasn't been adjusted, and needs to be
2572 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2576 sub insert_adjusted_lines {
2577 # Lines can be inserted so that it looks like they were in the input
2578 # file at the place it was when this routine is called. See also
2579 # insert_lines(). Lines inserted via this routine are already fully
2580 # adjusted, ready to be processed; each_line_handler()s handlers will
2581 # not be called. This means this is not a completely general
2582 # facility, as only the last each_line_handler on the stack should
2583 # call this. It could be made more general, by passing to each of the
2584 # line_handlers their position on the stack, which they would pass on
2585 # to this routine, and that would replace the boolean first element in
2586 # the anonymous array pushed here, so that the next_line routine could
2587 # use that to call only those handlers whose index is after it on the
2588 # stack. But this is overkill for what is needed now.
2591 trace $_[0] if main::DEBUG && $to_trace;
2593 # Each inserted line is an array, with the first element being 1 to
2594 # indicate that this line has been adjusted
2596 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2601 # Returns the stored up @missings lines' values, and clears the list.
2602 # The values are in an array, consisting of the default in the first
2603 # element, and the property in the 2nd. However, since these lines
2604 # can be stacked up, the return is an array of all these arrays.
2607 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2609 my $addr = do { no overloading; pack 'J', $self; };
2611 # If not accepting a list return, just return the first one.
2612 return shift @{$missings{$addr}} unless wantarray;
2614 my @return = @{$missings{$addr}};
2615 undef @{$missings{$addr}};
2619 sub _insert_property_into_line {
2620 # Add a property field to $_, if this file requires it.
2623 my $addr = do { no overloading; pack 'J', $self; };
2624 my $property = $property{$addr};
2625 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2627 $_ =~ s/(;|$)/; $property$1/;
2632 # Output consistent error messages, using either a generic one, or the
2633 # one given by the optional parameter. To avoid gazillions of the
2634 # same message in case the syntax of a file is way off, this routine
2635 # only outputs the first instance of each message, incrementing a
2636 # count so the totals can be output at the end of the file.
2639 my $message = shift;
2640 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2642 my $addr = do { no overloading; pack 'J', $self; };
2644 $message = 'Unexpected line' unless $message;
2646 # No trailing punctuation so as to fit with our addenda.
2647 $message =~ s/[.:;,]$//;
2649 # If haven't seen this exact message before, output it now. Otherwise
2650 # increment the count of how many times it has occurred
2651 unless ($errors{$addr}->{$message}) {
2652 Carp::my_carp("$message in '$_' in "
2654 . " at line $.. Skipping this line;");
2655 $errors{$addr}->{$message} = 1;
2658 $errors{$addr}->{$message}++;
2661 # Clear the line to prevent any further (meaningful) processing of it.
2668 package Multi_Default;
2670 # Certain properties in early versions of Unicode had more than one possible
2671 # default for code points missing from the files. In these cases, one
2672 # default applies to everything left over after all the others are applied,
2673 # and for each of the others, there is a description of which class of code
2674 # points applies to it. This object helps implement this by storing the
2675 # defaults, and for all but that final default, an eval string that generates
2676 # the class that it applies to.
2681 main::setup_package();
2684 # The defaults structure for the classes
2685 main::set_access('class_defaults', \%class_defaults);
2688 # The default that applies to everything left over.
2689 main::set_access('other_default', \%other_default, 'r');
2693 # The constructor is called with default => eval pairs, terminated by
2694 # the left-over default. e.g.
2695 # Multi_Default->new(
2696 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2698 # 'R' => 'some other expression that evaluates to code points',
2706 my $self = bless \do{my $anonymous_scalar}, $class;
2707 my $addr = do { no overloading; pack 'J', $self; };
2710 my $default = shift;
2712 $class_defaults{$addr}->{$default} = $eval;
2715 $other_default{$addr} = shift;
2720 sub get_next_defaults {
2721 # Iterates and returns the next class of defaults.
2723 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2725 my $addr = do { no overloading; pack 'J', $self; };
2727 return each %{$class_defaults{$addr}};
2733 # An alias is one of the names that a table goes by. This class defines them
2734 # including some attributes. Everything is currently setup in the
2740 main::setup_package();
2743 main::set_access('name', \%name, 'r');
2746 # Should this name match loosely or not.
2747 main::set_access('loose_match', \%loose_match, 'r');
2749 my %make_re_pod_entry;
2750 # Some aliases should not get their own entries in the re section of the
2751 # pod, because they are covered by a wild-card, and some we want to
2752 # discourage use of. Binary
2753 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2756 # Is this documented to be accessible via Unicode::UCD
2757 main::set_access('ucd', \%ucd, 'r', 's');
2760 # Aliases have a status, like deprecated, or even suppressed (which means
2761 # they don't appear in documentation). Enum
2762 main::set_access('status', \%status, 'r');
2765 # Similarly, some aliases should not be considered as usable ones for
2766 # external use, such as file names, or we don't want documentation to
2767 # recommend them. Boolean
2768 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2773 my $self = bless \do { my $anonymous_scalar }, $class;
2774 my $addr = do { no overloading; pack 'J', $self; };
2776 $name{$addr} = shift;
2777 $loose_match{$addr} = shift;
2778 $make_re_pod_entry{$addr} = shift;
2779 $ok_as_filename{$addr} = shift;
2780 $status{$addr} = shift;
2781 $ucd{$addr} = shift;
2783 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2785 # Null names are never ok externally
2786 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2794 # A range is the basic unit for storing code points, and is described in the
2795 # comments at the beginning of the program. Each range has a starting code
2796 # point; an ending code point (not less than the starting one); a value
2797 # that applies to every code point in between the two end-points, inclusive;
2798 # and an enum type that applies to the value. The type is for the user's
2799 # convenience, and has no meaning here, except that a non-zero type is
2800 # considered to not obey the normal Unicode rules for having standard forms.
2802 # The same structure is used for both map and match tables, even though in the
2803 # latter, the value (and hence type) is irrelevant and could be used as a
2804 # comment. In map tables, the value is what all the code points in the range
2805 # map to. Type 0 values have the standardized version of the value stored as
2806 # well, so as to not have to recalculate it a lot.
2808 sub trace { return main::trace(@_); }
2812 main::setup_package();
2815 main::set_access('start', \%start, 'r', 's');
2818 main::set_access('end', \%end, 'r', 's');
2821 main::set_access('value', \%value, 'r');
2824 main::set_access('type', \%type, 'r');
2827 # The value in internal standard form. Defined only if the type is 0.
2828 main::set_access('standard_form', \%standard_form);
2830 # Note that if these fields change, the dump() method should as well
2833 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2836 my $self = bless \do { my $anonymous_scalar }, $class;
2837 my $addr = do { no overloading; pack 'J', $self; };
2839 $start{$addr} = shift;
2840 $end{$addr} = shift;
2844 my $value = delete $args{'Value'}; # Can be 0
2845 $value = "" unless defined $value;
2846 $value{$addr} = $value;
2848 $type{$addr} = delete $args{'Type'} || 0;
2850 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2852 if (! $type{$addr}) {
2853 $standard_form{$addr} = main::standardize($value);
2861 qw("") => "_operator_stringify",
2862 "." => \&main::_operator_dot,
2863 ".=" => \&main::_operator_dot_equal,
2866 sub _operator_stringify {
2868 my $addr = do { no overloading; pack 'J', $self; };
2870 # Output it like '0041..0065 (value)'
2871 my $return = sprintf("%04X", $start{$addr})
2873 . sprintf("%04X", $end{$addr});
2874 my $value = $value{$addr};
2875 my $type = $type{$addr};
2877 $return .= "$value";
2878 $return .= ", Type=$type" if $type != 0;
2885 # The standard form is the value itself if the standard form is
2886 # undefined (that is if the value is special)
2889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2891 my $addr = do { no overloading; pack 'J', $self; };
2893 return $standard_form{$addr} if defined $standard_form{$addr};
2894 return $value{$addr};
2898 # Human, not machine readable. For machine readable, comment out this
2899 # entire routine and let the standard one take effect.
2902 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2904 my $addr = do { no overloading; pack 'J', $self; };
2906 my $return = $indent
2907 . sprintf("%04X", $start{$addr})
2909 . sprintf("%04X", $end{$addr})
2910 . " '$value{$addr}';";
2911 if (! defined $standard_form{$addr}) {
2912 $return .= "(type=$type{$addr})";
2914 elsif ($standard_form{$addr} ne $value{$addr}) {
2915 $return .= "(standard '$standard_form{$addr}')";
2921 package _Range_List_Base;
2923 # Base class for range lists. A range list is simply an ordered list of
2924 # ranges, so that the ranges with the lowest starting numbers are first in it.
2926 # When a new range is added that is adjacent to an existing range that has the
2927 # same value and type, it merges with it to form a larger range.
2929 # Ranges generally do not overlap, except that there can be multiple entries
2930 # of single code point ranges. This is because of NameAliases.txt.
2932 # In this program, there is a standard value such that if two different
2933 # values, have the same standard value, they are considered equivalent. This
2934 # value was chosen so that it gives correct results on Unicode data
2936 # There are a number of methods to manipulate range lists, and some operators
2937 # are overloaded to handle them.
2939 sub trace { return main::trace(@_); }
2945 main::setup_package();
2948 # The list of ranges
2949 main::set_access('ranges', \%ranges, 'readable_array');
2952 # The highest code point in the list. This was originally a method, but
2953 # actual measurements said it was used a lot.
2954 main::set_access('max', \%max, 'r');
2956 my %each_range_iterator;
2957 # Iterator position for each_range()
2958 main::set_access('each_range_iterator', \%each_range_iterator);
2961 # Name of parent this is attached to, if any. Solely for better error
2963 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2965 my %_search_ranges_cache;
2966 # A cache of the previous result from _search_ranges(), for better
2968 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2974 # Optional initialization data for the range list.
2975 my $initialize = delete $args{'Initialize'};
2979 # Use _union() to initialize. _union() returns an object of this
2980 # class, which means that it will call this constructor recursively.
2981 # But it won't have this $initialize parameter so that it won't
2982 # infinitely loop on this.
2983 return _union($class, $initialize, %args) if defined $initialize;
2985 $self = bless \do { my $anonymous_scalar }, $class;
2986 my $addr = do { no overloading; pack 'J', $self; };
2988 # Optional parent object, only for debug info.
2989 $owner_name_of{$addr} = delete $args{'Owner'};
2990 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2992 # Stringify, in case it is an object.
2993 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2995 # This is used only for error messages, and so a colon is added
2996 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2998 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3000 # Max is initialized to a negative value that isn't adjacent to 0,
3004 $_search_ranges_cache{$addr} = 0;
3005 $ranges{$addr} = [];
3012 qw("") => "_operator_stringify",
3013 "." => \&main::_operator_dot,
3014 ".=" => \&main::_operator_dot_equal,
3017 sub _operator_stringify {
3019 my $addr = do { no overloading; pack 'J', $self; };
3021 return "Range_List attached to '$owner_name_of{$addr}'"
3022 if $owner_name_of{$addr};
3023 return "anonymous Range_List " . \$self;
3027 # Returns the union of the input code points. It can be called as
3028 # either a constructor or a method. If called as a method, the result
3029 # will be a new() instance of the calling object, containing the union
3030 # of that object with the other parameter's code points; if called as
3031 # a constructor, the first parameter gives the class that the new object
3032 # should be, and the second parameter gives the code points to go into
3034 # In either case, there are two parameters looked at by this routine;
3035 # any additional parameters are passed to the new() constructor.
3037 # The code points can come in the form of some object that contains
3038 # ranges, and has a conventionally named method to access them; or
3039 # they can be an array of individual code points (as integers); or
3040 # just a single code point.
3042 # If they are ranges, this routine doesn't make any effort to preserve
3043 # the range values and types of one input over the other. Therefore
3044 # this base class should not allow _union to be called from other than
3045 # initialization code, so as to prevent two tables from being added
3046 # together where the range values matter. The general form of this
3047 # routine therefore belongs in a derived class, but it was moved here
3048 # to avoid duplication of code. The failure to overload this in this
3049 # class keeps it safe.
3051 # It does make the effort during initialization to accept tables with
3052 # multiple values for the same code point, and to preserve the order
3053 # of these. If there is only one input range or range set, it doesn't
3054 # sort (as it should already be sorted to the desired order), and will
3055 # accept multiple values per code point. Otherwise it will merge
3056 # multiple values into a single one.
3059 my @args; # Arguments to pass to the constructor
3063 # If a method call, will start the union with the object itself, and
3064 # the class of the new object will be the same as self.
3071 # Add the other required parameter.
3073 # Rest of parameters are passed on to the constructor
3075 # Accumulate all records from both lists.
3077 my $input_count = 0;
3078 for my $arg (@args) {
3079 #local $to_trace = 0 if main::DEBUG;
3080 trace "argument = $arg" if main::DEBUG && $to_trace;
3081 if (! defined $arg) {
3083 if (defined $self) {
3085 $message .= $owner_name_of{pack 'J', $self};
3087 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
3091 $arg = [ $arg ] if ! ref $arg;
3092 my $type = ref $arg;
3093 if ($type eq 'ARRAY') {
3094 foreach my $element (@$arg) {
3095 push @records, Range->new($element, $element);
3099 elsif ($arg->isa('Range')) {
3100 push @records, $arg;
3103 elsif ($arg->can('ranges')) {
3104 push @records, $arg->ranges;
3109 if (defined $self) {
3111 $message .= $owner_name_of{pack 'J', $self};
3113 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3118 # Sort with the range containing the lowest ordinal first, but if
3119 # two ranges start at the same code point, sort with the bigger range
3120 # of the two first, because it takes fewer cycles.
3121 if ($input_count > 1) {
3122 @records = sort { ($a->start <=> $b->start)
3124 # if b is shorter than a, b->end will be
3125 # less than a->end, and we want to select
3126 # a, so want to return -1
3127 ($b->end <=> $a->end)
3131 my $new = $class->new(@_);
3133 # Fold in records so long as they add new information.
3134 for my $set (@records) {
3135 my $start = $set->start;
3136 my $end = $set->end;
3137 my $value = $set->value;
3138 my $type = $set->type;
3139 if ($start > $new->max) {
3140 $new->_add_delete('+', $start, $end, $value, Type => $type);
3142 elsif ($end > $new->max) {
3143 $new->_add_delete('+', $new->max +1, $end, $value,
3146 elsif ($input_count == 1) {
3147 # Here, overlaps existing range, but is from a single input,
3148 # so preserve the multiple values from that input.
3149 $new->_add_delete('+', $start, $end, $value, Type => $type,
3150 Replace => $MULTIPLE_AFTER);
3157 sub range_count { # Return the number of ranges in the range list
3159 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3162 return scalar @{$ranges{pack 'J', $self}};
3166 # Returns the minimum code point currently in the range list, or if
3167 # the range list is empty, 2 beyond the max possible. This is a
3168 # method because used so rarely, that not worth saving between calls,
3169 # and having to worry about changing it as ranges are added and
3173 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3175 my $addr = do { no overloading; pack 'J', $self; };
3177 # If the range list is empty, return a large value that isn't adjacent
3178 # to any that could be in the range list, for simpler tests
3179 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3180 return $ranges{$addr}->[0]->start;
3184 # Boolean: Is argument in the range list? If so returns $i such that:
3185 # range[$i]->end < $codepoint <= range[$i+1]->end
3186 # which is one beyond what you want; this is so that the 0th range
3187 # doesn't return false
3189 my $codepoint = shift;
3190 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3192 my $i = $self->_search_ranges($codepoint);
3193 return 0 unless defined $i;
3195 # The search returns $i, such that
3196 # range[$i-1]->end < $codepoint <= range[$i]->end
3197 # So is in the table if and only iff it is at least the start position
3200 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3204 sub containing_range {
3205 # Returns the range object that contains the code point, undef if none
3208 my $codepoint = shift;
3209 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3211 my $i = $self->contains($codepoint);
3214 # contains() returns 1 beyond where we should look
3216 return $ranges{pack 'J', $self}->[$i-1];
3220 # Returns the value associated with the code point, undef if none
3223 my $codepoint = shift;
3224 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3226 my $range = $self->containing_range($codepoint);
3227 return unless defined $range;
3229 return $range->value;
3233 # Returns the type of the range containing the code point, undef if
3234 # the code point is not in the table
3237 my $codepoint = shift;
3238 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3240 my $range = $self->containing_range($codepoint);
3241 return unless defined $range;
3243 return $range->type;
3246 sub _search_ranges {
3247 # Find the range in the list which contains a code point, or where it
3248 # should go if were to add it. That is, it returns $i, such that:
3249 # range[$i-1]->end < $codepoint <= range[$i]->end
3250 # Returns undef if no such $i is possible (e.g. at end of table), or
3251 # if there is an error.
3254 my $code_point = shift;
3255 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3257 my $addr = do { no overloading; pack 'J', $self; };
3259 return if $code_point > $max{$addr};
3260 my $r = $ranges{$addr}; # The current list of ranges
3261 my $range_list_size = scalar @$r;
3264 use integer; # want integer division
3266 # Use the cached result as the starting guess for this one, because,
3267 # an experiment on 5.1 showed that 90% of the time the cache was the
3268 # same as the result on the next call (and 7% it was one less).
3269 $i = $_search_ranges_cache{$addr};
3270 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3271 # from an intervening deletion
3272 #local $to_trace = 1 if main::DEBUG;
3273 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3274 return $i if $code_point <= $r->[$i]->end
3275 && ($i == 0 || $r->[$i-1]->end < $code_point);
3277 # Here the cache doesn't yield the correct $i. Try adding 1.
3278 if ($i < $range_list_size - 1
3279 && $r->[$i]->end < $code_point &&
3280 $code_point <= $r->[$i+1]->end)
3283 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3284 $_search_ranges_cache{$addr} = $i;
3288 # Here, adding 1 also didn't work. We do a binary search to
3289 # find the correct position, starting with current $i
3291 my $upper = $range_list_size - 1;
3293 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3295 if ($code_point <= $r->[$i]->end) {
3297 # Here we have met the upper constraint. We can quit if we
3298 # also meet the lower one.
3299 last if $i == 0 || $r->[$i-1]->end < $code_point;
3301 $upper = $i; # Still too high.
3306 # Here, $r[$i]->end < $code_point, so look higher up.
3310 # Split search domain in half to try again.
3311 my $temp = ($upper + $lower) / 2;
3313 # No point in continuing unless $i changes for next time
3317 # We can't reach the highest element because of the averaging.
3318 # So if one below the upper edge, force it there and try one
3320 if ($i == $range_list_size - 2) {
3322 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3323 $i = $range_list_size - 1;
3325 # Change $lower as well so if fails next time through,
3326 # taking the average will yield the same $i, and we will
3327 # quit with the error message just below.
3331 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3335 } # End of while loop
3337 if (main::DEBUG && $to_trace) {
3338 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3339 trace "i= [ $i ]", $r->[$i];
3340 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3343 # Here we have found the offset. Cache it as a starting point for the
3345 $_search_ranges_cache{$addr} = $i;
3350 # Add, replace or delete ranges to or from a list. The $type
3351 # parameter gives which:
3352 # '+' => insert or replace a range, returning a list of any changed
3354 # '-' => delete a range, returning a list of any deleted ranges.
3356 # The next three parameters give respectively the start, end, and
3357 # value associated with the range. 'value' should be null unless the
3360 # The range list is kept sorted so that the range with the lowest
3361 # starting position is first in the list, and generally, adjacent
3362 # ranges with the same values are merged into a single larger one (see
3363 # exceptions below).
3365 # There are more parameters; all are key => value pairs:
3366 # Type gives the type of the value. It is only valid for '+'.
3367 # All ranges have types; if this parameter is omitted, 0 is
3368 # assumed. Ranges with type 0 are assumed to obey the
3369 # Unicode rules for casing, etc; ranges with other types are
3370 # not. Otherwise, the type is arbitrary, for the caller's
3371 # convenience, and looked at only by this routine to keep
3372 # adjacent ranges of different types from being merged into
3373 # a single larger range, and when Replace =>
3374 # $IF_NOT_EQUIVALENT is specified (see just below).
3375 # Replace determines what to do if the range list already contains
3376 # ranges which coincide with all or portions of the input
3377 # range. It is only valid for '+':
3378 # => $NO means that the new value is not to replace
3379 # any existing ones, but any empty gaps of the
3380 # range list coinciding with the input range
3381 # will be filled in with the new value.
3382 # => $UNCONDITIONALLY means to replace the existing values with
3383 # this one unconditionally. However, if the
3384 # new and old values are identical, the
3385 # replacement is skipped to save cycles
3386 # => $IF_NOT_EQUIVALENT means to replace the existing values
3387 # (the default) with this one if they are not equivalent.
3388 # Ranges are equivalent if their types are the
3389 # same, and they are the same string; or if
3390 # both are type 0 ranges, if their Unicode
3391 # standard forms are identical. In this last
3392 # case, the routine chooses the more "modern"
3393 # one to use. This is because some of the
3394 # older files are formatted with values that
3395 # are, for example, ALL CAPs, whereas the
3396 # derived files have a more modern style,
3397 # which looks better. By looking for this
3398 # style when the pre-existing and replacement
3399 # standard forms are the same, we can move to
3401 # => $MULTIPLE_BEFORE means that if this range duplicates an
3402 # existing one, but has a different value,
3403 # don't replace the existing one, but insert
3404 # this, one so that the same range can occur
3405 # multiple times. They are stored LIFO, so
3406 # that the final one inserted is the first one
3407 # returned in an ordered search of the table.
3408 # If this is an exact duplicate, including the
3409 # value, the original will be moved to be
3410 # first, before any other duplicate ranges
3411 # with different values.
3412 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3413 # FIFO, so that this one is inserted after all
3414 # others that currently exist. If this is an
3415 # exact duplicate, including value, of an
3416 # existing range, this one is discarded
3417 # (leaving the existing one in its original,
3418 # higher priority position
3419 # => anything else is the same as => $IF_NOT_EQUIVALENT
3421 # "same value" means identical for non-type-0 ranges, and it means
3422 # having the same standard forms for type-0 ranges.
3424 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3427 my $operation = shift; # '+' for add/replace; '-' for delete;
3434 $value = "" if not defined $value; # warning: $value can be "0"
3436 my $replace = delete $args{'Replace'};
3437 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3439 my $type = delete $args{'Type'};
3440 $type = 0 unless defined $type;
3442 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3444 my $addr = do { no overloading; pack 'J', $self; };
3446 if ($operation ne '+' && $operation ne '-') {
3447 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3450 unless (defined $start && defined $end) {
3451 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3454 unless ($end >= $start) {
3455 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3458 if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3459 Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
3461 #local $to_trace = 1 if main::DEBUG;
3463 if ($operation eq '-') {
3464 if ($replace != $IF_NOT_EQUIVALENT) {
3465 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3466 $replace = $IF_NOT_EQUIVALENT;
3469 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3473 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3478 my $r = $ranges{$addr}; # The current list of ranges
3479 my $range_list_size = scalar @$r; # And its size
3480 my $max = $max{$addr}; # The current high code point in
3481 # the list of ranges
3483 # Do a special case requiring fewer machine cycles when the new range
3484 # starts after the current highest point. The Unicode input data is
3485 # structured so this is common.
3486 if ($start > $max) {
3488 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3489 return if $operation eq '-'; # Deleting a non-existing range is a
3492 # If the new range doesn't logically extend the current final one
3493 # in the range list, create a new range at the end of the range
3494 # list. (max cleverly is initialized to a negative number not
3495 # adjacent to 0 if the range list is empty, so even adding a range
3496 # to an empty range list starting at 0 will have this 'if'
3498 if ($start > $max + 1 # non-adjacent means can't extend.
3499 || @{$r}[-1]->value ne $value # values differ, can't extend.
3500 || @{$r}[-1]->type != $type # types differ, can't extend.
3502 push @$r, Range->new($start, $end,
3508 # Here, the new range starts just after the current highest in
3509 # the range list, and they have the same type and value.
3510 # Extend the current range to incorporate the new one.
3511 @{$r}[-1]->set_end($end);
3514 # This becomes the new maximum.
3519 #local $to_trace = 0 if main::DEBUG;
3521 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3523 # Here, the input range isn't after the whole rest of the range list.
3524 # Most likely 'splice' will be needed. The rest of the routine finds
3525 # the needed splice parameters, and if necessary, does the splice.
3526 # First, find the offset parameter needed by the splice function for
3527 # the input range. Note that the input range may span multiple
3528 # existing ones, but we'll worry about that later. For now, just find
3529 # the beginning. If the input range is to be inserted starting in a
3530 # position not currently in the range list, it must (obviously) come
3531 # just after the range below it, and just before the range above it.
3532 # Slightly less obviously, it will occupy the position currently
3533 # occupied by the range that is to come after it. More formally, we
3534 # are looking for the position, $i, in the array of ranges, such that:
3536 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3538 # (The ordered relationships within existing ranges are also shown in
3539 # the equation above). However, if the start of the input range is
3540 # within an existing range, the splice offset should point to that
3541 # existing range's position in the list; that is $i satisfies a
3542 # somewhat different equation, namely:
3544 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3546 # More briefly, $start can come before or after r[$i]->start, and at
3547 # this point, we don't know which it will be. However, these
3548 # two equations share these constraints:
3550 # r[$i-1]->end < $start <= r[$i]->end
3552 # And that is good enough to find $i.
3554 my $i = $self->_search_ranges($start);
3556 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3560 # The search function returns $i such that:
3562 # r[$i-1]->end < $start <= r[$i]->end
3564 # That means that $i points to the first range in the range list
3565 # that could possibly be affected by this operation. We still don't
3566 # know if the start of the input range is within r[$i], or if it
3567 # points to empty space between r[$i-1] and r[$i].
3568 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3570 # Special case the insertion of data that is not to replace any
3572 if ($replace == $NO) { # If $NO, has to be operation '+'
3573 #local $to_trace = 1 if main::DEBUG;
3574 trace "Doesn't replace" if main::DEBUG && $to_trace;
3576 # Here, the new range is to take effect only on those code points
3577 # that aren't already in an existing range. This can be done by
3578 # looking through the existing range list and finding the gaps in
3579 # the ranges that this new range affects, and then calling this
3580 # function recursively on each of those gaps, leaving untouched
3581 # anything already in the list. Gather up a list of the changed
3582 # gaps first so that changes to the internal state as new ranges
3583 # are added won't be a problem.
3586 # First, if the starting point of the input range is outside an
3587 # existing one, there is a gap from there to the beginning of the
3588 # existing range -- add a span to fill the part that this new
3590 if ($start < $r->[$i]->start) {
3591 push @gap_list, Range->new($start,
3593 $r->[$i]->start - 1),
3595 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3598 # Then look through the range list for other gaps until we reach
3599 # the highest range affected by the input one.
3601 for ($j = $i+1; $j < $range_list_size; $j++) {
3602 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3603 last if $end < $r->[$j]->start;
3605 # If there is a gap between when this range starts and the
3606 # previous one ends, add a span to fill it. Note that just
3607 # because there are two ranges doesn't mean there is a
3608 # non-zero gap between them. It could be that they have
3609 # different values or types
3610 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3612 Range->new($r->[$j-1]->end + 1,
3613 $r->[$j]->start - 1,
3615 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3619 # Here, we have either found an existing range in the range list,
3620 # beyond the area affected by the input one, or we fell off the
3621 # end of the loop because the input range affects the whole rest
3622 # of the range list. In either case, $j is 1 higher than the
3623 # highest affected range. If $j == $i, it means that there are no
3624 # affected ranges, that the entire insertion is in the gap between
3625 # r[$i-1], and r[$i], which we already have taken care of before
3627 # On the other hand, if there are affected ranges, it might be
3628 # that there is a gap that needs filling after the final such
3629 # range to the end of the input range
3630 if ($r->[$j-1]->end < $end) {
3631 push @gap_list, Range->new(main::max($start,
3632 $r->[$j-1]->end + 1),
3635 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3638 # Call recursively to fill in all the gaps.
3639 foreach my $gap (@gap_list) {
3640 $self->_add_delete($operation,
3650 # Here, we have taken care of the case where $replace is $NO.
3651 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3652 # If inserting a multiple record, this is where it goes, before the
3653 # first (if any) existing one if inserting LIFO. (If this is to go
3654 # afterwards, FIFO, we below move the pointer to there.) These imply
3655 # an insertion, and no change to any existing ranges. Note that $i
3656 # can be -1 if this new range doesn't actually duplicate any existing,
3657 # and comes at the beginning of the list.
3658 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3660 if ($start != $end) {
3661 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3665 # If the new code point is within a current range ...
3666 if ($end >= $r->[$i]->start) {
3668 # Don't add an exact duplicate, as it isn't really a multiple
3669 my $existing_value = $r->[$i]->value;
3670 my $existing_type = $r->[$i]->type;
3671 return if $value eq $existing_value && $type eq $existing_type;
3673 # If the multiple value is part of an existing range, we want
3674 # to split up that range, so that only the single code point
3675 # is affected. To do this, we first call ourselves
3676 # recursively to delete that code point from the table, having
3677 # preserved its current data above. Then we call ourselves
3678 # recursively again to add the new multiple, which we know by
3679 # the test just above is different than the current code
3680 # point's value, so it will become a range containing a single
3681 # code point: just itself. Finally, we add back in the
3682 # pre-existing code point, which will again be a single code
3683 # point range. Because 'i' likely will have changed as a
3684 # result of these operations, we can't just continue on, but
3685 # do this operation recursively as well. If we are inserting
3686 # LIFO, the pre-existing code point needs to go after the new
3687 # one, so use MULTIPLE_AFTER; and vice versa.
3688 if ($r->[$i]->start != $r->[$i]->end) {
3689 $self->_add_delete('-', $start, $end, "");
3690 $self->_add_delete('+', $start, $end, $value, Type => $type);
3691 return $self->_add_delete('+',
3694 Type => $existing_type,
3695 Replace => ($replace == $MULTIPLE_BEFORE)
3697 : $MULTIPLE_BEFORE);
3701 # If to place this new record after, move to beyond all existing
3702 # ones; but don't add this one if identical to any of them, as it
3703 # isn't really a multiple. This leaves the original order, so
3704 # that the current request is ignored. The reasoning is that the
3705 # previous request that wanted this record to have high priority
3706 # should have precedence.
3707 if ($replace == $MULTIPLE_AFTER) {
3708 while ($i < @$r && $r->[$i]->start == $start) {
3709 return if $value eq $r->[$i]->value
3710 && $type eq $r->[$i]->type;
3715 # If instead we are to place this new record before any
3716 # existing ones, remove any identical ones that come after it.
3717 # This changes the existing order so that the new one is
3718 # first, as is being requested.
3719 for (my $j = $i + 1;
3720 $j < @$r && $r->[$j]->start == $start;
3723 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3725 last; # There should only be one instance, so no
3726 # need to keep looking
3731 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3732 my @return = splice @$r,
3739 if (main::DEBUG && $to_trace) {
3740 trace "After splice:";
3741 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3742 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3743 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3744 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3745 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3746 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3751 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
3752 # leaves delete, insert, and replace either unconditionally or if not
3753 # equivalent. $i still points to the first potential affected range.
3754 # Now find the highest range affected, which will determine the length
3755 # parameter to splice. (The input range can span multiple existing
3756 # ones.) If this isn't a deletion, while we are looking through the
3757 # range list, see also if this is a replacement rather than a clean
3758 # insertion; that is if it will change the values of at least one
3759 # existing range. Start off assuming it is an insert, until find it
3761 my $clean_insert = $operation eq '+';
3762 my $j; # This will point to the highest affected range
3764 # For non-zero types, the standard form is the value itself;
3765 my $standard_form = ($type) ? $value : main::standardize($value);
3767 for ($j = $i; $j < $range_list_size; $j++) {
3768 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3770 # If find a range that it doesn't overlap into, we can stop
3772 last if $end < $r->[$j]->start;
3774 # Here, overlaps the range at $j. If the values don't match,
3775 # and so far we think this is a clean insertion, it becomes a
3776 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3777 if ($clean_insert) {
3778 if ($r->[$j]->standard_form ne $standard_form) {
3780 if ($replace == $CROAK) {
3781 main::croak("The range to add "
3782 . sprintf("%04X", $start)
3784 . sprintf("%04X", $end)
3785 . " with value '$value' overlaps an existing range $r->[$j]");
3790 # Here, the two values are essentially the same. If the
3791 # two are actually identical, replacing wouldn't change
3792 # anything so skip it.
3793 my $pre_existing = $r->[$j]->value;
3794 if ($pre_existing ne $value) {
3796 # Here the new and old standardized values are the
3797 # same, but the non-standardized values aren't. If
3798 # replacing unconditionally, then replace
3799 if( $replace == $UNCONDITIONALLY) {
3804 # Here, are replacing conditionally. Decide to
3805 # replace or not based on which appears to look
3806 # the "nicest". If one is mixed case and the
3807 # other isn't, choose the mixed case one.
3808 my $new_mixed = $value =~ /[A-Z]/
3809 && $value =~ /[a-z]/;
3810 my $old_mixed = $pre_existing =~ /[A-Z]/
3811 && $pre_existing =~ /[a-z]/;
3813 if ($old_mixed != $new_mixed) {
3814 $clean_insert = 0 if $new_mixed;
3815 if (main::DEBUG && $to_trace) {
3816 if ($clean_insert) {
3817 trace "Retaining $pre_existing over $value";
3820 trace "Replacing $pre_existing with $value";
3826 # Here casing wasn't different between the two.
3827 # If one has hyphens or underscores and the
3828 # other doesn't, choose the one with the
3830 my $new_punct = $value =~ /[-_]/;
3831 my $old_punct = $pre_existing =~ /[-_]/;
3833 if ($old_punct != $new_punct) {
3834 $clean_insert = 0 if $new_punct;
3835 if (main::DEBUG && $to_trace) {
3836 if ($clean_insert) {
3837 trace "Retaining $pre_existing over $value";
3840 trace "Replacing $pre_existing with $value";
3843 } # else existing one is just as "good";
3844 # retain it to save cycles.
3850 } # End of loop looking for highest affected range.
3852 # Here, $j points to one beyond the highest range that this insertion
3853 # affects (hence to beyond the range list if that range is the final
3854 # one in the range list).
3856 # The splice length is all the affected ranges. Get it before
3857 # subtracting, for efficiency, so we don't have to later add 1.
3858 my $length = $j - $i;
3860 $j--; # $j now points to the highest affected range.
3861 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3863 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3864 # $j points to the highest affected range. But it can be < $i or even
3865 # -1. These happen only if the insertion is entirely in the gap
3866 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3867 # above exited first time through with $end < $r->[$i]->start. (And
3868 # then we subtracted one from j) This implies also that $start <
3869 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3870 # $start, so the entire input range is in the gap.
3873 # Here the entire input range is in the gap before $i.
3875 if (main::DEBUG && $to_trace) {
3877 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3880 trace "Entire range is before $r->[$i]";
3883 return if $operation ne '+'; # Deletion of a non-existent range is
3888 # Here part of the input range is not in the gap before $i. Thus,
3889 # there is at least one affected one, and $j points to the highest
3892 # At this point, here is the situation:
3893 # This is not an insertion of a multiple, nor of tentative ($NO)
3895 # $i points to the first element in the current range list that
3896 # may be affected by this operation. In fact, we know
3897 # that the range at $i is affected because we are in
3898 # the else branch of this 'if'
3899 # $j points to the highest affected range.
3901 # r[$i-1]->end < $start <= r[$i]->end
3903 # r[$i-1]->end < $start <= $end <= r[$j]->end
3906 # $clean_insert is a boolean which is set true if and only if
3907 # this is a "clean insertion", i.e., not a change nor a
3908 # deletion (multiple was handled above).
3910 # We now have enough information to decide if this call is a no-op
3911 # or not. It is a no-op if this is an insertion of already
3914 if (main::DEBUG && $to_trace && $clean_insert
3916 && $start >= $r->[$i]->start)
3920 return if $clean_insert
3921 && $i == $j # more than one affected range => not no-op
3923 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3924 # Further, $start and/or $end is >= r[$i]->start
3925 # The test below hence guarantees that
3926 # r[$i]->start < $start <= $end <= r[$i]->end
3927 # This means the input range is contained entirely in
3928 # the one at $i, so is a no-op
3929 && $start >= $r->[$i]->start;
3932 # Here, we know that some action will have to be taken. We have
3933 # calculated the offset and length (though adjustments may be needed)
3934 # for the splice. Now start constructing the replacement list.
3936 my $splice_start = $i;
3941 # See if should extend any adjacent ranges.
3942 if ($operation eq '-') { # Don't extend deletions
3943 $extends_below = $extends_above = 0;
3945 else { # Here, should extend any adjacent ranges. See if there are
3947 $extends_below = ($i > 0
3948 # can't extend unless adjacent
3949 && $r->[$i-1]->end == $start -1
3950 # can't extend unless are same standard value
3951 && $r->[$i-1]->standard_form eq $standard_form
3952 # can't extend unless share type
3953 && $r->[$i-1]->type == $type);
3954 $extends_above = ($j+1 < $range_list_size
3955 && $r->[$j+1]->start == $end +1
3956 && $r->[$j+1]->standard_form eq $standard_form
3957 && $r->[$j+1]->type == $type);
3959 if ($extends_below && $extends_above) { # Adds to both
3960 $splice_start--; # start replace at element below
3961 $length += 2; # will replace on both sides
3962 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3964 # The result will fill in any gap, replacing both sides, and
3965 # create one large range.
3966 @replacement = Range->new($r->[$i-1]->start,
3973 # Here we know that the result won't just be the conglomeration of
3974 # a new range with both its adjacent neighbors. But it could
3975 # extend one of them.
3977 if ($extends_below) {
3979 # Here the new element adds to the one below, but not to the
3980 # one above. If inserting, and only to that one range, can
3981 # just change its ending to include the new one.
3982 if ($length == 0 && $clean_insert) {
3983 $r->[$i-1]->set_end($end);
3984 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3988 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3989 $splice_start--; # start replace at element below
3990 $length++; # will replace the element below
3991 $start = $r->[$i-1]->start;
3994 elsif ($extends_above) {
3996 # Here the new element adds to the one above, but not below.
3997 # Mirror the code above
3998 if ($length == 0 && $clean_insert) {
3999 $r->[$j+1]->set_start($start);
4000 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4004 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4005 $length++; # will replace the element above
4006 $end = $r->[$j+1]->end;
4010 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4012 # Finally, here we know there will have to be a splice.
4013 # If the change or delete affects only the highest portion of the
4014 # first affected range, the range will have to be split. The
4015 # splice will remove the whole range, but will replace it by a new
4016 # range containing just the unaffected part. So, in this case,
4017 # add to the replacement list just this unaffected portion.
4018 if (! $extends_below
4019 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4022 Range->new($r->[$i]->start,
4024 Value => $r->[$i]->value,
4025 Type => $r->[$i]->type);
4028 # In the case of an insert or change, but not a delete, we have to
4029 # put in the new stuff; this comes next.
4030 if ($operation eq '+') {
4031 push @replacement, Range->new($start,
4037 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4038 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4040 # And finally, if we're changing or deleting only a portion of the
4041 # highest affected range, it must be split, as the lowest one was.
4042 if (! $extends_above
4043 && $j >= 0 # Remember that j can be -1 if before first
4045 && $end >= $r->[$j]->start
4046 && $end < $r->[$j]->end)
4049 Range->new($end + 1,
4051 Value => $r->[$j]->value,
4052 Type => $r->[$j]->type);
4056 # And do the splice, as calculated above
4057 if (main::DEBUG && $to_trace) {
4058 trace "replacing $length element(s) at $i with ";
4059 foreach my $replacement (@replacement) {
4060 trace " $replacement";
4062 trace "Before splice:";
4063 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4064 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4065 trace "i =[", $i, "]", $r->[$i];
4066 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4067 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4070 my @return = splice @$r, $splice_start, $length, @replacement;
4072 if (main::DEBUG && $to_trace) {
4073 trace "After splice:";
4074 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4075 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4076 trace "i =[", $i, "]", $r->[$i];
4077 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4078 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4079 trace "removed ", @return if @return;
4082 # An actual deletion could have changed the maximum in the list.
4083 # There was no deletion if the splice didn't return something, but
4084 # otherwise recalculate it. This is done too rarely to worry about
4086 if ($operation eq '-' && @return) {
4087 $max{$addr} = $r->[-1]->end;
4092 sub reset_each_range { # reset the iterator for each_range();
4094 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4097 undef $each_range_iterator{pack 'J', $self};
4102 # Iterate over each range in a range list. Results are undefined if
4103 # the range list is changed during the iteration.
4106 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4108 my $addr = do { no overloading; pack 'J', $self; };
4110 return if $self->is_empty;
4112 $each_range_iterator{$addr} = -1
4113 if ! defined $each_range_iterator{$addr};
4114 $each_range_iterator{$addr}++;
4115 return $ranges{$addr}->[$each_range_iterator{$addr}]
4116 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4117 undef $each_range_iterator{$addr};
4121 sub count { # Returns count of code points in range list
4123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4125 my $addr = do { no overloading; pack 'J', $self; };
4128 foreach my $range (@{$ranges{$addr}}) {
4129 $count += $range->end - $range->start + 1;
4134 sub delete_range { # Delete a range
4139 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4141 return $self->_add_delete('-', $start, $end, "");
4144 sub is_empty { # Returns boolean as to if a range list is empty
4146 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4149 return scalar @{$ranges{pack 'J', $self}} == 0;
4153 # Quickly returns a scalar suitable for separating tables into
4154 # buckets, i.e. it is a hash function of the contents of a table, so
4155 # there are relatively few conflicts.
4158 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4160 my $addr = do { no overloading; pack 'J', $self; };
4162 # These are quickly computable. Return looks like 'min..max;count'
4163 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4165 } # End closure for _Range_List_Base
4168 use base '_Range_List_Base';
4170 # A Range_List is a range list for match tables; i.e. the range values are
4171 # not significant. Thus a number of operations can be safely added to it,
4172 # such as inversion, intersection. Note that union is also an unsafe
4173 # operation when range values are cared about, and that method is in the base
4174 # class, not here. But things are set up so that that method is callable only
4175 # during initialization. Only in this derived class, is there an operation
4176 # that combines two tables. A Range_Map can thus be used to initialize a
4177 # Range_List, and its mappings will be in the list, but are not significant to
4180 sub trace { return main::trace(@_); }
4186 '+' => sub { my $self = shift;
4189 return $self->_union($other)
4191 '+=' => sub { my $self = shift;
4193 my $reversed = shift;
4196 Carp::my_carp_bug("Bad news. Can't cope with '"
4200 . "'. undef returned.");
4204 return $self->_union($other)
4206 '&' => sub { my $self = shift;
4209 return $self->_intersect($other, 0);
4216 # Returns a new Range_List that gives all code points not in $self.
4220 my $new = Range_List->new;
4222 # Go through each range in the table, finding the gaps between them
4223 my $max = -1; # Set so no gap before range beginning at 0
4224 for my $range ($self->ranges) {
4225 my $start = $range->start;
4226 my $end = $range->end;
4228 # If there is a gap before this range, the inverse will contain
4230 if ($start > $max + 1) {
4231 $new->add_range($max + 1, $start - 1);
4236 # And finally, add the gap from the end of the table to the max
4237 # possible code point
4238 if ($max < $MAX_UNICODE_CODEPOINT) {
4239 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4245 # Returns a new Range_List with the argument deleted from it. The
4246 # argument can be a single code point, a range, or something that has
4247 # a range, with the _range_list() method on it returning them
4251 my $reversed = shift;
4252 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4255 Carp::my_carp_bug("Can't cope with a "
4257 . " being the second parameter in a '-'. Subtraction ignored.");
4261 my $new = Range_List->new(Initialize => $self);
4263 if (! ref $other) { # Single code point
4264 $new->delete_range($other, $other);
4266 elsif ($other->isa('Range')) {
4267 $new->delete_range($other->start, $other->end);
4269 elsif ($other->can('_range_list')) {
4270 foreach my $range ($other->_range_list->ranges) {
4271 $new->delete_range($range->start, $range->end);
4275 Carp::my_carp_bug("Can't cope with a "
4277 . " argument to '-'. Subtraction ignored."
4286 # Returns either a boolean giving whether the two inputs' range lists
4287 # intersect (overlap), or a new Range_List containing the intersection
4288 # of the two lists. The optional final parameter being true indicates
4289 # to do the check instead of the intersection.
4291 my $a_object = shift;
4292 my $b_object = shift;
4293 my $check_if_overlapping = shift;
4294 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4295 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4297 if (! defined $b_object) {
4299 $message .= $a_object->_owner_name_of if defined $a_object;
4300 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4304 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4305 # Thus the intersection could be much more simply be written:
4306 # return ~(~$a_object + ~$b_object);
4307 # But, this is slower, and when taking the inverse of a large
4308 # range_size_1 table, back when such tables were always stored that
4309 # way, it became prohibitively slow, hence the code was changed to the
4312 if ($b_object->isa('Range')) {
4313 $b_object = Range_List->new(Initialize => $b_object,
4314 Owner => $a_object->_owner_name_of);
4316 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4318 my @a_ranges = $a_object->ranges;
4319 my @b_ranges = $b_object->ranges;
4321 #local $to_trace = 1 if main::DEBUG;
4322 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4324 # Start with the first range in each list
4326 my $range_a = $a_ranges[$a_i];
4328 my $range_b = $b_ranges[$b_i];
4330 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4331 if ! $check_if_overlapping;
4333 # If either list is empty, there is no intersection and no overlap
4334 if (! defined $range_a || ! defined $range_b) {
4335 return $check_if_overlapping ? 0 : $new;
4337 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4339 # Otherwise, must calculate the intersection/overlap. Start with the
4340 # very first code point in each list
4341 my $a = $range_a->start;
4342 my $b = $range_b->start;
4344 # Loop through all the ranges of each list; in each iteration, $a and
4345 # $b are the current code points in their respective lists
4348 # If $a and $b are the same code point, ...
4351 # it means the lists overlap. If just checking for overlap
4352 # know the answer now,
4353 return 1 if $check_if_overlapping;
4355 # The intersection includes this code point plus anything else
4356 # common to both current ranges.
4358 my $end = main::min($range_a->end, $range_b->end);
4359 if (! $check_if_overlapping) {
4360 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4361 $new->add_range($start, $end);
4364 # Skip ahead to the end of the current intersect
4367 # If the current intersect ends at the end of either range (as
4368 # it must for at least one of them), the next possible one
4369 # will be the beginning code point in it's list's next range.
4370 if ($a == $range_a->end) {
4371 $range_a = $a_ranges[++$a_i];
4372 last unless defined $range_a;
4373 $a = $range_a->start;
4375 if ($b == $range_b->end) {
4376 $range_b = $b_ranges[++$b_i];
4377 last unless defined $range_b;
4378 $b = $range_b->start;
4381 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4385 # Not equal, but if the range containing $a encompasses $b,
4386 # change $a to be the middle of the range where it does equal
4387 # $b, so the next iteration will get the intersection
4388 if ($range_a->end >= $b) {
4393 # Here, the current range containing $a is entirely below
4394 # $b. Go try to find a range that could contain $b.
4395 $a_i = $a_object->_search_ranges($b);
4397 # If no range found, quit.
4398 last unless defined $a_i;
4400 # The search returns $a_i, such that
4401 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4402 # Set $a to the beginning of this new range, and repeat.
4403 $range_a = $a_ranges[$a_i];
4404 $a = $range_a->start;
4407 else { # Here, $b < $a.
4409 # Mirror image code to the leg just above
4410 if ($range_b->end >= $a) {
4414 $b_i = $b_object->_search_ranges($a);
4415 last unless defined $b_i;
4416 $range_b = $b_ranges[$b_i];
4417 $b = $range_b->start;
4420 } # End of looping through ranges.
4422 # Intersection fully computed, or now know that there is no overlap
4423 return $check_if_overlapping ? 0 : $new;
4427 # Returns boolean giving whether the two arguments overlap somewhere
4431 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4433 return $self->_intersect($other, 1);
4437 # Add a range to the list.
4442 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4444 return $self->_add_delete('+', $start, $end, "");
4447 sub matches_identically_to {
4448 # Return a boolean as to whether or not two Range_Lists match identical
4449 # sets of code points.
4453 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4455 # These are ordered in increasing real time to figure out (at least
4456 # until a patch changes that and doesn't change this)
4457 return 0 if $self->max != $other->max;
4458 return 0 if $self->min != $other->min;
4459 return 0 if $self->range_count != $other->range_count;
4460 return 0 if $self->count != $other->count;
4462 # Here they could be identical because all the tests above passed.
4463 # The loop below is somewhat simpler since we know they have the same
4464 # number of elements. Compare range by range, until reach the end or
4465 # find something that differs.
4466 my @a_ranges = $self->ranges;
4467 my @b_ranges = $other->ranges;
4468 for my $i (0 .. @a_ranges - 1) {
4469 my $a = $a_ranges[$i];
4470 my $b = $b_ranges[$i];
4471 trace "self $a; other $b" if main::DEBUG && $to_trace;
4472 return 0 if ! defined $b
4473 || $a->start != $b->start
4474 || $a->end != $b->end;
4479 sub is_code_point_usable {
4480 # This used only for making the test script. See if the input
4481 # proposed trial code point is one that Perl will handle. If second
4482 # parameter is 0, it won't select some code points for various
4483 # reasons, noted below.
4486 my $try_hard = shift;
4487 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4489 return 0 if $code < 0; # Never use a negative
4491 # shun null. I'm (khw) not sure why this was done, but NULL would be
4492 # the character very frequently used.
4493 return $try_hard if $code == 0x0000;
4495 # shun non-character code points.
4496 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4497 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4499 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
4500 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4505 sub get_valid_code_point {
4506 # Return a code point that's part of the range list. Returns nothing
4507 # if the table is empty or we can't find a suitable code point. This
4508 # used only for making the test script.
4511 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4513 my $addr = do { no overloading; pack 'J', $self; };
4515 # On first pass, don't choose less desirable code points; if no good
4516 # one is found, repeat, allowing a less desirable one to be selected.
4517 for my $try_hard (0, 1) {
4519 # Look through all the ranges for a usable code point.
4520 for my $set (reverse $self->ranges) {
4522 # Try the edge cases first, starting with the end point of the
4524 my $end = $set->end;
4525 return $end if is_code_point_usable($end, $try_hard);
4527 # End point didn't, work. Start at the beginning and try
4528 # every one until find one that does work.
4529 for my $trial ($set->start .. $end - 1) {
4530 return $trial if is_code_point_usable($trial, $try_hard);
4534 return (); # If none found, give up.
4537 sub get_invalid_code_point {
4538 # Return a code point that's not part of the table. Returns nothing
4539 # if the table covers all code points or a suitable code point can't
4540 # be found. This used only for making the test script.
4543 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4545 # Just find a valid code point of the inverse, if any.
4546 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4548 } # end closure for Range_List
4551 use base '_Range_List_Base';
4553 # A Range_Map is a range list in which the range values (called maps) are
4554 # significant, and hence shouldn't be manipulated by our other code, which
4555 # could be ambiguous or lose things. For example, in taking the union of two
4556 # lists, which share code points, but which have differing values, which one
4557 # has precedence in the union?
4558 # It turns out that these operations aren't really necessary for map tables,
4559 # and so this class was created to make sure they aren't accidentally
4565 # Add a range containing a mapping value to the list
4568 # Rest of parameters passed on
4570 return $self->_add_delete('+', @_);
4574 # Adds entry to a range list which can duplicate an existing entry
4577 my $code_point = shift;
4580 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4581 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4583 return $self->add_map($code_point, $code_point,
4584 $value, Replace => $replace);
4586 } # End of closure for package Range_Map
4588 package _Base_Table;
4590 # A table is the basic data structure that gets written out into a file for
4591 # use by the Perl core. This is the abstract base class implementing the
4592 # common elements from the derived ones. A list of the methods to be
4593 # furnished by an implementing class is just after the constructor.
4595 sub standardize { return main::standardize($_[0]); }
4596 sub trace { return main::trace(@_); }
4600 main::setup_package();
4603 # Object containing the ranges of the table.
4604 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4607 # The full table name.
4608 main::set_access('full_name', \%full_name, 'r');
4611 # The table name, almost always shorter
4612 main::set_access('name', \%name, 'r');
4615 # The shortest of all the aliases for this table, with underscores removed
4616 main::set_access('short_name', \%short_name);
4618 my %nominal_short_name_length;
4619 # The length of short_name before removing underscores
4620 main::set_access('nominal_short_name_length',
4621 \%nominal_short_name_length);
4624 # The complete name, including property.
4625 main::set_access('complete_name', \%complete_name, 'r');
4628 # Parent property this table is attached to.
4629 main::set_access('property', \%property, 'r');
4632 # Ordered list of alias objects of the table's name. The first ones in
4633 # the list are output first in comments
4634 main::set_access('aliases', \%aliases, 'readable_array');
4637 # A comment associated with the table for human readers of the files
4638 main::set_access('comment', \%comment, 's');
4641 # A comment giving a short description of the table's meaning for human
4642 # readers of the files.
4643 main::set_access('description', \%description, 'readable_array');
4646 # A comment giving a short note about the table for human readers of the
4648 main::set_access('note', \%note, 'readable_array');
4651 # Enum; there are a number of possibilities for what happens to this
4652 # table: it could be normal, or suppressed, or not for external use. See
4653 # values at definition for $SUPPRESSED.
4654 main::set_access('fate', \%fate, 'r');
4656 my %find_table_from_alias;
4657 # The parent property passes this pointer to a hash which this class adds
4658 # all its aliases to, so that the parent can quickly take an alias and
4660 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4663 # After this table is made equivalent to another one; we shouldn't go
4664 # changing the contents because that could mean it's no longer equivalent
4665 main::set_access('locked', \%locked, 'r');
4668 # This gives the final path to the file containing the table. Each
4669 # directory in the path is an element in the array
4670 main::set_access('file_path', \%file_path, 'readable_array');
4673 # What is the table's status, normal, $OBSOLETE, etc. Enum
4674 main::set_access('status', \%status, 'r');
4677 # A comment about its being obsolete, or whatever non normal status it has
4678 main::set_access('status_info', \%status_info, 'r');
4680 my %caseless_equivalent;
4681 # The table this is equivalent to under /i matching, if any.
4682 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4685 # Is the table to be output with each range only a single code point?
4686 # This is done to avoid breaking existing code that may have come to rely
4687 # on this behavior in previous versions of this program.)
4688 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4691 # A boolean set iff this table is a Perl extension to the Unicode
4693 main::set_access('perl_extension', \%perl_extension, 'r');
4695 my %output_range_counts;
4696 # A boolean set iff this table is to have comments written in the
4697 # output file that contain the number of code points in the range.
4698 # The constructor can override the global flag of the same name.
4699 main::set_access('output_range_counts', \%output_range_counts, 'r');
4702 # The format of the entries of the table. This is calculated from the
4703 # data in the table (or passed in the constructor). This is an enum e.g.,
4704 # $STRING_FORMAT. It is marked protected as it should not be generally
4705 # used to override calculations.
4706 main::set_access('format', \%format, 'r', 'p_s');
4709 # All arguments are key => value pairs, which you can see below, most
4710 # of which match fields documented above. Otherwise: Re_Pod_Entry,
4711 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4712 # documented in the Alias package
4714 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4718 my $self = bless \do { my $anonymous_scalar }, $class;
4719 my $addr = do { no overloading; pack 'J', $self; };
4723 $name{$addr} = delete $args{'Name'};
4724 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4725 $full_name{$addr} = delete $args{'Full_Name'};
4726 my $complete_name = $complete_name{$addr}
4727 = delete $args{'Complete_Name'};
4728 $format{$addr} = delete $args{'Format'};
4729 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4730 $property{$addr} = delete $args{'_Property'};
4731 $range_list{$addr} = delete $args{'_Range_List'};
4732 $status{$addr} = delete $args{'Status'} || $NORMAL;
4733 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4734 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4735 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4736 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4737 my $ucd = delete $args{'UCD'};
4739 my $description = delete $args{'Description'};
4740 my $ok_as_filename = delete $args{'OK_as_Filename'};
4741 my $loose_match = delete $args{'Fuzzy'};
4742 my $note = delete $args{'Note'};
4743 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4744 my $perl_extension = delete $args{'Perl_Extension'};
4746 # Shouldn't have any left over
4747 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4749 # Can't use || above because conceivably the name could be 0, and
4750 # can't use // operator in case this program gets used in Perl 5.8
4751 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4752 $output_range_counts{$addr} = $output_range_counts if
4753 ! defined $output_range_counts{$addr};
4755 $aliases{$addr} = [ ];
4756 $comment{$addr} = [ ];
4757 $description{$addr} = [ ];
4759 $file_path{$addr} = [ ];
4760 $locked{$addr} = "";
4762 push @{$description{$addr}}, $description if $description;
4763 push @{$note{$addr}}, $note if $note;
4765 if ($fate{$addr} == $PLACEHOLDER) {
4767 # A placeholder table doesn't get documented, is a perl extension,
4768 # and quite likely will be empty
4769 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4770 $perl_extension = 1 if ! defined $perl_extension;
4771 $ucd = 0 if ! defined $ucd;
4772 push @tables_that_may_be_empty, $complete_name{$addr};
4773 $self->add_comment(<<END);
4774 This is a placeholder because it is not in Version $string_version of Unicode,
4775 but is needed by the Perl core to work gracefully. Because it is not in this
4776 version of Unicode, it will not be listed in $pod_file.pod
4779 elsif (exists $why_suppressed{$complete_name}
4780 # Don't suppress if overridden
4781 && ! grep { $_ eq $complete_name{$addr} }
4782 @output_mapped_properties)
4784 $fate{$addr} = $SUPPRESSED;
4786 elsif ($fate{$addr} == $SUPPRESSED
4787 && ! exists $why_suppressed{$property{$addr}->complete_name})
4789 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4790 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4793 # If hasn't set its status already, see if it is on one of the
4794 # lists of properties or tables that have particular statuses; if
4795 # not, is normal. The lists are prioritized so the most serious
4796 # ones are checked first
4797 if (! $status{$addr}) {
4798 if (exists $why_deprecated{$complete_name}) {
4799 $status{$addr} = $DEPRECATED;
4801 elsif (exists $why_stabilized{$complete_name}) {
4802 $status{$addr} = $STABILIZED;
4804 elsif (exists $why_obsolete{$complete_name}) {
4805 $status{$addr} = $OBSOLETE;
4808 # Existence above doesn't necessarily mean there is a message
4809 # associated with it. Use the most serious message.
4810 if ($status{$addr}) {
4811 if ($why_deprecated{$complete_name}) {
4813 = $why_deprecated{$complete_name};
4815 elsif ($why_stabilized{$complete_name}) {
4817 = $why_stabilized{$complete_name};
4819 elsif ($why_obsolete{$complete_name}) {
4821 = $why_obsolete{$complete_name};
4826 $perl_extension{$addr} = $perl_extension || 0;
4828 # Don't list a property by default that is internal only
4829 if ($fate{$addr} > $MAP_PROXIED) {
4830 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4831 $ucd = 0 if ! defined $ucd;
4834 $ucd = 1 if ! defined $ucd;
4837 # By convention what typically gets printed only or first is what's
4838 # first in the list, so put the full name there for good output
4839 # clarity. Other routines rely on the full name being first on the
4841 $self->add_alias($full_name{$addr},
4842 OK_as_Filename => $ok_as_filename,
4843 Fuzzy => $loose_match,
4844 Re_Pod_Entry => $make_re_pod_entry,
4845 Status => $status{$addr},
4849 # Then comes the other name, if meaningfully different.
4850 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4851 $self->add_alias($name{$addr},
4852 OK_as_Filename => $ok_as_filename,
4853 Fuzzy => $loose_match,
4854 Re_Pod_Entry => $make_re_pod_entry,
4855 Status => $status{$addr},
4863 # Here are the methods that are required to be defined by any derived
4866 handle_special_range
4870 # write() knows how to write out normal ranges, but it calls
4871 # handle_special_range() when it encounters a non-normal one.
4872 # append_to_body() is called by it after it has handled all
4873 # ranges to add anything after the main portion of the table.
4874 # And finally, pre_body() is called after all this to build up
4875 # anything that should appear before the main portion of the
4876 # table. Doing it this way allows things in the middle to
4877 # affect what should appear before the main portion of the
4882 Carp::my_carp_bug( __LINE__
4883 . ": Must create method '$sub()' for "
4891 "." => \&main::_operator_dot,
4892 ".=" => \&main::_operator_dot_equal,
4893 '!=' => \&main::_operator_not_equal,
4894 '==' => \&main::_operator_equal,
4898 # Returns the array of ranges associated with this table.
4901 return $range_list{pack 'J', shift}->ranges;
4905 # Add a synonym for this table.
4907 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4910 my $name = shift; # The name to add.
4911 my $pointer = shift; # What the alias hash should point to. For
4912 # map tables, this is the parent property;
4913 # for match tables, it is the table itself.
4916 my $loose_match = delete $args{'Fuzzy'};
4918 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4919 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
4921 my $ok_as_filename = delete $args{'OK_as_Filename'};
4922 $ok_as_filename = 1 unless defined $ok_as_filename;
4924 my $status = delete $args{'Status'};
4925 $status = $NORMAL unless defined $status;
4927 # An internal name does not get documented, unless overridden by the
4929 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
4931 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4933 # Capitalize the first letter of the alias unless it is one of the CJK
4934 # ones which specifically begins with a lower 'k'. Do this because
4935 # Unicode has varied whether they capitalize first letters or not, and
4936 # have later changed their minds and capitalized them, but not the
4937 # other way around. So do it always and avoid changes from release to
4939 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4941 my $addr = do { no overloading; pack 'J', $self; };
4943 # Figure out if should be loosely matched if not already specified.
4944 if (! defined $loose_match) {
4946 # Is a loose_match if isn't null, and doesn't begin with an
4947 # underscore and isn't just a number
4949 && substr($name, 0, 1) ne '_'
4950 && $name !~ qr{^[0-9_.+-/]+$})
4959 # If this alias has already been defined, do nothing.
4960 return if defined $find_table_from_alias{$addr}->{$name};
4962 # That includes if it is standardly equivalent to an existing alias,
4963 # in which case, add this name to the list, so won't have to search
4965 my $standard_name = main::standardize($name);
4966 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4967 $find_table_from_alias{$addr}->{$name}
4968 = $find_table_from_alias{$addr}->{$standard_name};
4972 # Set the index hash for this alias for future quick reference.
4973 $find_table_from_alias{$addr}->{$name} = $pointer;
4974 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4975 local $to_trace = 0 if main::DEBUG;
4976 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4977 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4980 # Put the new alias at the end of the list of aliases unless the final
4981 # element begins with an underscore (meaning it is for internal perl
4982 # use) or is all numeric, in which case, put the new one before that
4983 # one. This floats any all-numeric or underscore-beginning aliases to
4984 # the end. This is done so that they are listed last in output lists,
4985 # to encourage the user to use a better name (either more descriptive
4986 # or not an internal-only one) instead. This ordering is relied on
4987 # implicitly elsewhere in this program, like in short_name()
4988 my $list = $aliases{$addr};
4989 my $insert_position = (@$list == 0
4990 || (substr($list->[-1]->name, 0, 1) ne '_'
4991 && $list->[-1]->name =~ /\D/))
4997 Alias->new($name, $loose_match, $make_re_pod_entry,
4998 $ok_as_filename, $status, $ucd);
5000 # This name may be shorter than any existing ones, so clear the cache
5001 # of the shortest, so will have to be recalculated.
5003 undef $short_name{pack 'J', $self};
5008 # Returns a name suitable for use as the base part of a file name.
5009 # That is, shorter wins. It can return undef if there is no suitable
5010 # name. The name has all non-essential underscores removed.
5012 # The optional second parameter is a reference to a scalar in which
5013 # this routine will store the length the returned name had before the
5014 # underscores were removed, or undef if the return is undef.
5016 # The shortest name can change if new aliases are added. So using
5017 # this should be deferred until after all these are added. The code
5018 # that does that should clear this one's cache.
5019 # Any name with alphabetics is preferred over an all numeric one, even
5023 my $nominal_length_ptr = shift;
5024 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5026 my $addr = do { no overloading; pack 'J', $self; };
5028 # For efficiency, don't recalculate, but this means that adding new
5029 # aliases could change what the shortest is, so the code that does
5030 # that needs to undef this.
5031 if (defined $short_name{$addr}) {
5032 if ($nominal_length_ptr) {
5033 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5035 return $short_name{$addr};
5038 # Look at each alias
5039 foreach my $alias ($self->aliases()) {
5041 # Don't use an alias that isn't ok to use for an external name.
5042 next if ! $alias->ok_as_filename;
5044 my $name = main::Standardize($alias->name);
5045 trace $self, $name if main::DEBUG && $to_trace;
5047 # Take the first one, or a shorter one that isn't numeric. This
5048 # relies on numeric aliases always being last in the array
5049 # returned by aliases(). Any alpha one will have precedence.
5050 if (! defined $short_name{$addr}
5052 && length($name) < length($short_name{$addr})))
5054 # Remove interior underscores.
5055 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5057 $nominal_short_name_length{$addr} = length $name;
5061 # If the short name isn't a nice one, perhaps an equivalent table has
5063 if (! defined $short_name{$addr}
5064 || $short_name{$addr} eq ""
5065 || $short_name{$addr} eq "_")
5068 foreach my $follower ($self->children) { # All equivalents
5069 my $follower_name = $follower->short_name;
5070 next unless defined $follower_name;
5072 # Anything (except undefined) is better than underscore or
5074 if (! defined $return || $return eq "_") {
5075 $return = $follower_name;
5079 # If the new follower name isn't "_" and is shorter than the
5080 # current best one, prefer the new one.
5081 next if $follower_name eq "_";
5082 next if length $follower_name > length $return;
5083 $return = $follower_name;
5085 $short_name{$addr} = $return if defined $return;
5088 # If no suitable external name return undef
5089 if (! defined $short_name{$addr}) {
5090 $$nominal_length_ptr = undef if $nominal_length_ptr;
5094 # Don't allow a null short name.
5095 if ($short_name{$addr} eq "") {
5096 $short_name{$addr} = '_';
5097 $nominal_short_name_length{$addr} = 1;
5100 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5102 if ($nominal_length_ptr) {
5103 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5105 return $short_name{$addr};
5109 # Returns the external name that this table should be known by. This
5110 # is usually the short_name, but not if the short_name is undefined,
5111 # in which case the external_name is arbitrarily set to the
5115 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5117 my $short = $self->short_name;
5118 return $short if defined $short;
5123 sub add_description { # Adds the parameter as a short description.
5126 my $description = shift;
5128 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5131 push @{$description{pack 'J', $self}}, $description;
5136 sub add_note { # Adds the parameter as a short note.
5141 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5144 push @{$note{pack 'J', $self}}, $note;
5149 sub add_comment { # Adds the parameter as a comment.
5151 return unless $debugging_build;
5154 my $comment = shift;
5155 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5160 push @{$comment{pack 'J', $self}}, $comment;
5166 # Return the current comment for this table. If called in list
5167 # context, returns the array of comments. In scalar, returns a string
5168 # of each element joined together with a period ending each.
5171 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5173 my $addr = do { no overloading; pack 'J', $self; };
5174 my @list = @{$comment{$addr}};
5175 return @list if wantarray;
5177 foreach my $sentence (@list) {
5178 $return .= '. ' if $return;
5179 $return .= $sentence;
5182 $return .= '.' if $return;
5187 # Initialize the table with the argument which is any valid
5188 # initialization for range lists.
5191 my $addr = do { no overloading; pack 'J', $self; };
5192 my $initialization = shift;
5193 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5195 # Replace the current range list with a new one of the same exact
5197 my $class = ref $range_list{$addr};
5198 $range_list{$addr} = $class->new(Owner => $self,
5199 Initialize => $initialization);
5205 # The header that is output for the table in the file it is written
5209 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5212 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5218 # Write a representation of the table to its file. It calls several
5219 # functions furnished by sub-classes of this abstract base class to
5220 # handle non-normal ranges, to add stuff before the table, and at its
5221 # end. If the table is to be written so that adjustments are
5222 # required, this does that conversion.
5225 my $use_adjustments = shift; # ? output in adjusted format or not
5226 my $tab_stops = shift; # The number of tab stops over to put any
5228 my $suppress_value = shift; # Optional, if the value associated with
5229 # a range equals this one, don't write
5231 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5233 my $addr = do { no overloading; pack 'J', $self; };
5235 # Start with the header
5236 my @HEADER = $self->header;
5239 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5242 # Things discovered processing the main body of the document may
5243 # affect what gets output before it, therefore pre_body() isn't called
5244 # until after all other processing of the table is done.
5246 # The main body looks like a 'here' document. If annotating, get rid
5247 # of the comments before passing to the caller, as some callers, such
5248 # as charnames.pm, can't cope with them. (Outputting range counts
5249 # also introduces comments, but these don't show up in the tables that
5250 # can't cope with comments, and there aren't that many of them that
5251 # it's worth the extra real time to get rid of them).
5254 # Use the line below in Perls that don't have /r
5255 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5256 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5258 push @OUT, "return <<'END';\n";
5261 if ($range_list{$addr}->is_empty) {
5263 # This is a kludge for empty tables to silence a warning in
5264 # utf8.c, which can't really deal with empty tables, but it can
5265 # deal with a table that matches nothing, as the inverse of 'Any'
5267 push @OUT, "!utf8::Any\n";
5269 elsif ($self->name eq 'N'
5271 # To save disk space and table cache space, avoid putting out
5272 # binary N tables, but instead create a file which just inverts
5273 # the Y table. Since the file will still exist and occupy a
5274 # certain number of blocks, might as well output the whole
5275 # thing if it all will fit in one block. The number of
5276 # ranges below is an approximate number for that.
5277 && ($self->property->type == $BINARY
5278 || $self->property->type == $FORCED_BINARY)
5279 # && $self->property->tables == 2 Can't do this because the
5280 # non-binary properties, like NFDQC aren't specifiable
5282 && $range_list{$addr}->ranges > 15
5283 && ! $annotate) # Under --annotate, want to see everything
5285 push @OUT, "!utf8::" . $self->property->name . "\n";
5288 my $range_size_1 = $range_size_1{$addr};
5289 my $format; # Used only in $annotate option
5290 my $include_name; # Used only in $annotate option
5294 # If annotating each code point, must print 1 per line.
5295 # The variable could point to a subroutine, and we don't want
5296 # to lose that fact, so only set if not set already
5297 $range_size_1 = 1 if ! $range_size_1;
5299 $format = $self->format;
5301 # The name of the character is output only for tables that
5302 # don't already include the name in the output.
5303 my $property = $self->property;
5305 ! ($property == $perl_charname
5306 || $property == main::property_ref('Unicode_1_Name')
5307 || $property == main::property_ref('Name')
5308 || $property == main::property_ref('Name_Alias')
5312 # Values for previous time through the loop. Initialize to
5313 # something that won't be adjacent to the first iteration;
5314 # only $previous_end matters for that.
5316 my $previous_end = -2;
5319 # Values for next time through the portion of the loop that splits
5320 # the range. 0 in $next_start means there is no remaining portion
5327 # Output each range as part of the here document.
5329 for my $set ($range_list{$addr}->ranges) {
5330 if ($set->type != 0) {
5331 $self->handle_special_range($set);
5334 my $start = $set->start;
5335 my $end = $set->end;
5336 my $value = $set->value;
5338 # Don't output ranges whose value is the one to suppress
5339 next RANGE if defined $suppress_value
5340 && $value eq $suppress_value;
5342 { # This bare block encloses the scope where we may need to
5343 # split a range (when outputting adjusteds), and each time
5344 # through we handle the next portion of the original by
5345 # ending the block with a 'redo'. The values to use for
5346 # that next time through are set up just below in the
5347 # scalars whose names begin with '$next_'.
5349 if ($use_adjustments) {
5351 # When converting to use adjustments, we can handle
5352 # only single element ranges. Set up so that this
5353 # time through the loop, we look at the first element,
5354 # and the next time through, we start off with the
5355 # remainder. Thus each time through we look at the
5356 # first element of the range
5357 if ($end != $start) {
5358 $next_start = $start + 1;
5360 $next_value = $value;
5364 # The values for some of these tables are stored as
5365 # hex strings. Convert those to decimal
5366 $value = hex($value)
5367 if $self->default_map eq $CODE_POINT
5368 && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5370 # If this range is adjacent to the previous one, and
5371 # the values in each are integers that are also
5372 # adjacent (differ by 1), then this range really
5373 # extends the previous one that is already in element
5374 # $OUT[-1]. So we pop that element, and pretend that
5375 # the range starts with whatever it started with.
5376 # $offset is incremented by 1 each time so that it
5377 # gives the current offset from the first element in
5378 # the accumulating range, and we keep in $value the
5379 # value of that first element.
5380 if ($start == $previous_end + 1
5381 && $value =~ /^ -? \d+ $/xa
5382 && $previous_value =~ /^ -? \d+ $/xa
5383 && ($value == ($previous_value + ++$offset)))
5386 $start = $previous_start;
5387 $value = $previous_value;
5393 # Save the current values for the next time through
5395 $previous_start = $start;
5396 $previous_end = $end;
5397 $previous_value = $value;
5400 # If there is a range and doesn't need a single point range
5402 if ($start != $end && ! $range_size_1) {
5403 push @OUT, sprintf "%04X\t%04X", $start, $end;
5404 $OUT[-1] .= "\t$value" if $value ne "";
5406 # Add a comment with the size of the range, if
5407 # requested. Expand Tabs to make sure they all start
5408 # in the same column, and then unexpand to use mostly
5410 if (! $output_range_counts{$addr}) {
5414 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5415 my $count = main::clarify_number($end - $start + 1);
5418 my $width = $tab_stops * 8 - 1;
5419 $OUT[-1] = sprintf("%-*s # [%s]\n",
5423 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5427 # Here to output a single code point per line.
5428 # If not to annotate, use the simple formats
5429 elsif (! $annotate) {
5431 # Use any passed in subroutine to output.
5432 if (ref $range_size_1 eq 'CODE') {
5433 for my $i ($start .. $end) {
5434 push @OUT, &{$range_size_1}($i, $value);
5439 # Here, caller is ok with default output.
5440 for (my $i = $start; $i <= $end; $i++) {
5441 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5447 # Here, wants annotation.
5448 for (my $i = $start; $i <= $end; $i++) {
5450 # Get character information if don't have it already
5451 main::populate_char_info($i)
5452 if ! defined $viacode[$i];
5453 my $type = $annotate_char_type[$i];
5455 # Figure out if should output the next code points
5456 # as part of a range or not. If this is not in an
5457 # annotation range, then won't output as a range,
5458 # so returns $i. Otherwise use the end of the
5459 # annotation range, but no further than the
5460 # maximum possible end point of the loop.
5461 my $range_end = main::min(
5462 $annotate_ranges->value_of($i) || $i,
5465 # Use a range if it is a range, and either is one
5466 # of the special annotation ranges, or the range
5467 # is at most 3 long. This last case causes the
5468 # algorithmically named code points to be output
5469 # individually in spans of at most 3, as they are
5470 # the ones whose $type is > 0.
5471 if ($range_end != $i
5472 && ( $type < 0 || $range_end - $i > 2))
5474 # Here is to output a range. We don't allow a
5475 # caller-specified output format--just use the
5477 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5480 my $range_name = $viacode[$i];
5482 # For the code points which end in their hex
5483 # value, we eliminate that from the output
5484 # annotation, and capitalize only the first
5485 # letter of each word.
5486 if ($type == $CP_IN_NAME) {
5487 my $hex = sprintf "%04X", $i;
5488 $range_name =~ s/-$hex$//;
5489 my @words = split " ", $range_name;
5490 for my $word (@words) {
5492 ucfirst(lc($word)) if $word ne 'CJK';
5494 $range_name = join " ", @words;
5496 elsif ($type == $HANGUL_SYLLABLE) {
5497 $range_name = "Hangul Syllable";
5500 $OUT[-1] .= " $range_name" if $range_name;
5502 # Include the number of code points in the
5505 main::clarify_number($range_end - $i + 1);
5506 $OUT[-1] .= " [$count]\n";
5508 # Skip to the end of the range
5511 else { # Not in a range.
5514 # When outputting the names of each character,
5515 # use the character itself if printable
5516 $comment .= "'" . chr($i) . "' "
5519 # To make it more readable, use a minimum
5523 # Determine the annotation
5524 if ($format eq $DECOMP_STRING_FORMAT) {
5526 # This is very specialized, with the type
5527 # of decomposition beginning the line
5528 # enclosed in <...>, and the code points
5529 # that the code point decomposes to
5530 # separated by blanks. Create two
5531 # strings, one of the printable
5532 # characters, and one of their official
5534 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5538 foreach my $to (split " ", $map) {
5539 $to = CORE::hex $to;
5540 $to_name .= " + " if $to_name;
5541 $to_chr .= chr($to);
5542 main::populate_char_info($to)
5543 if ! defined $viacode[$to];
5544 $to_name .= $viacode[$to];
5548 "=> '$to_chr'; $viacode[$i] => $to_name";
5549 $comment_indent = 25; # Determined by
5554 # Assume that any table that has hex
5555 # format is a mapping of one code point to
5557 if ($format eq $HEX_FORMAT) {
5558 my $decimal_value = CORE::hex $value;
5559 main::populate_char_info($decimal_value)
5560 if ! defined $viacode[$decimal_value];
5562 . chr($decimal_value)
5563 . "'; " if $printable[$decimal_value];
5565 $comment .= $viacode[$i] if $include_name
5567 if ($format eq $HEX_FORMAT) {
5568 my $decimal_value = CORE::hex $value;
5570 " => $viacode[$decimal_value]"
5571 if $viacode[$decimal_value];
5574 # If including the name, no need to
5575 # indent, as the name will already be way
5577 $comment_indent = ($include_name) ? 0 : 60;
5580 # Use any passed in routine to output the base
5582 if (ref $range_size_1 eq 'CODE') {
5583 my $base_part=&{$range_size_1}($i, $value);
5585 push @OUT, $base_part;
5588 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5591 # And add the annotation.
5592 $OUT[-1] = sprintf "%-*s\t# %s",
5602 # If we split the range, set up so the next time through
5603 # we get the remainder, and redo.
5605 $start = $next_start;
5607 $value = $next_value;
5612 } # End of loop through all the table's ranges
5615 # Add anything that goes after the main body, but within the here
5617 my $append_to_body = $self->append_to_body;
5618 push @OUT, $append_to_body if $append_to_body;
5620 # And finish the here document.
5623 # Done with the main portion of the body. Can now figure out what
5624 # should appear before it in the file.
5625 my $pre_body = $self->pre_body;
5626 push @HEADER, $pre_body, "\n" if $pre_body;
5628 # All these files should have a .pl suffix added to them.
5629 my @file_with_pl = @{$file_path{$addr}};
5630 $file_with_pl[-1] .= '.pl';
5632 main::write(\@file_with_pl,
5633 $annotate, # utf8 iff annotating
5639 sub set_status { # Set the table's status
5641 my $status = shift; # The status enum value
5642 my $info = shift; # Any message associated with it.
5643 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5645 my $addr = do { no overloading; pack 'J', $self; };
5647 $status{$addr} = $status;
5648 $status_info{$addr} = $info;
5652 sub set_fate { # Set the fate of a table
5656 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5658 my $addr = do { no overloading; pack 'J', $self; };
5660 return if $fate{$addr} == $fate; # If no-op
5662 # Can only change the ordinary fate, except if going to $MAP_PROXIED
5663 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5665 $fate{$addr} = $fate;
5667 # Don't document anything to do with a non-normal fated table
5668 if ($fate != $ORDINARY) {
5669 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5670 foreach my $alias ($self->aliases) {
5671 $alias->set_ucd($put_in_pod);
5673 # MAP_PROXIED doesn't affect the match tables
5674 next if $fate == $MAP_PROXIED;
5675 $alias->set_make_re_pod_entry($put_in_pod);
5679 # Save the reason for suppression for output
5680 if ($fate == $SUPPRESSED && defined $reason) {
5681 $why_suppressed{$complete_name{$addr}} = $reason;
5688 # Don't allow changes to the table from now on. This stores a stack
5689 # trace of where it was called, so that later attempts to modify it
5690 # can immediately show where it got locked.
5693 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5695 my $addr = do { no overloading; pack 'J', $self; };
5697 $locked{$addr} = "";
5699 my $line = (caller(0))[2];
5702 # Accumulate the stack trace
5704 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5706 last unless defined $caller;
5708 $locked{$addr} .= " called from $caller() at line $line\n";
5709 $line = $caller_line;
5711 $locked{$addr} .= " called from main at line $line\n";
5716 sub carp_if_locked {
5717 # Return whether a table is locked or not, and, by the way, complain
5721 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5723 my $addr = do { no overloading; pack 'J', $self; };
5725 return 0 if ! $locked{$addr};
5726 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5730 sub set_file_path { # Set the final directory path for this table
5732 # Rest of parameters passed on
5735 @{$file_path{pack 'J', $self}} = @_;
5739 # Accessors for the range list stored in this table. First for
5748 matches_identically_to
5761 return $self->_range_list->$sub(@_);
5765 # Then for ones that should fail if locked
5775 return if $self->carp_if_locked;
5777 return $self->_range_list->$sub(@_);
5784 use base '_Base_Table';
5786 # A Map Table is a table that contains the mappings from code points to
5787 # values. There are two weird cases:
5788 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5789 # are written in the table's file at the end of the table nonetheless. It
5790 # requires specially constructed code to handle these; utf8.c can not read
5791 # these in, so they should not go in $map_directory. As of this writing,
5792 # the only case that these happen is for named sequences used in
5793 # charnames.pm. But this code doesn't enforce any syntax on these, so
5794 # something else could come along that uses it.
5795 # 2) Specials are anything that doesn't fit syntactically into the body of the
5796 # table. The ranges for these have a map type of non-zero. The code below
5797 # knows about and handles each possible type. In most cases, these are
5798 # written as part of the header.
5800 # A map table deliberately can't be manipulated at will unlike match tables.
5801 # This is because of the ambiguities having to do with what to do with
5802 # overlapping code points. And there just isn't a need for those things;
5803 # what one wants to do is just query, add, replace, or delete mappings, plus
5804 # write the final result.
5805 # However, there is a method to get the list of possible ranges that aren't in
5806 # this table to use for defaulting missing code point mappings. And,
5807 # map_add_or_replace_non_nulls() does allow one to add another table to this
5808 # one, but it is clearly very specialized, and defined that the other's
5809 # non-null values replace this one's if there is any overlap.
5811 sub trace { return main::trace(@_); }
5815 main::setup_package();
5818 # Many input files omit some entries; this gives what the mapping for the
5819 # missing entries should be
5820 main::set_access('default_map', \%default_map, 'r');
5822 my %anomalous_entries;
5823 # Things that go in the body of the table which don't fit the normal
5824 # scheme of things, like having a range. Not much can be done with these
5825 # once there except to output them. This was created to handle named
5827 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5828 main::set_access('anomalous_entries', # Append singular, read plural
5829 \%anomalous_entries,
5833 # Enum as to whether or not to write out this map table, and how:
5835 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5836 # it should not be removed nor its format changed. This
5837 # is done for those files that have traditionally been
5839 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5841 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
5842 # outputting the actual mappings as-is, we adjust things
5843 # to create a much more compact table. Only those few
5844 # tables where the mapping is convertible at least to an
5845 # integer and compacting makes a big difference should
5846 # have this. Hence, the default is to not do this
5847 # unless the table's default mapping is to $CODE_POINT,
5848 # and the range size is not 1.
5849 main::set_access('to_output_map', \%to_output_map, 's');
5857 # Optional initialization data for the table.
5858 my $initialize = delete $args{'Initialize'};
5860 my $default_map = delete $args{'Default_Map'};
5861 my $property = delete $args{'_Property'};
5862 my $full_name = delete $args{'Full_Name'};
5863 my $to_output_map = delete $args{'To_Output_Map'};
5865 # Rest of parameters passed on
5867 my $range_list = Range_Map->new(Owner => $property);
5869 my $self = $class->SUPER::new(
5871 Complete_Name => $full_name,
5872 Full_Name => $full_name,
5873 _Property => $property,
5874 _Range_List => $range_list,
5877 my $addr = do { no overloading; pack 'J', $self; };
5879 $anomalous_entries{$addr} = [];
5880 $default_map{$addr} = $default_map;
5881 $to_output_map{$addr} = $to_output_map;
5883 $self->initialize($initialize) if defined $initialize;
5890 qw("") => "_operator_stringify",
5893 sub _operator_stringify {
5896 my $name = $self->property->full_name;
5897 $name = '""' if $name eq "";
5898 return "Map table for Property '$name'";
5902 # Add a synonym for this table (which means the property itself)
5905 # Rest of parameters passed on.
5907 $self->SUPER::add_alias($name, $self->property, @_);
5912 # Add a range of code points to the list of specially-handled code
5913 # points. $MULTI_CP is assumed if the type of special is not passed
5922 my $type = delete $args{'Type'} || 0;
5923 # Rest of parameters passed on
5925 # Can't change the table if locked.
5926 return if $self->carp_if_locked;
5928 my $addr = do { no overloading; pack 'J', $self; };
5930 $self->_range_list->add_map($lower, $upper,
5937 sub append_to_body {
5938 # Adds to the written HERE document of the table's body any anomalous
5939 # entries in the table..
5942 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5944 my $addr = do { no overloading; pack 'J', $self; };
5946 return "" unless @{$anomalous_entries{$addr}};
5947 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5950 sub map_add_or_replace_non_nulls {
5951 # This adds the mappings in the table $other to $self. Non-null
5952 # mappings from $other override those in $self. It essentially merges
5953 # the two tables, with the second having priority except for null
5958 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5960 return if $self->carp_if_locked;
5962 if (! $other->isa(__PACKAGE__)) {
5963 Carp::my_carp_bug("$other should be a "
5971 my $addr = do { no overloading; pack 'J', $self; };
5972 my $other_addr = do { no overloading; pack 'J', $other; };
5974 local $to_trace = 0 if main::DEBUG;
5976 my $self_range_list = $self->_range_list;
5977 my $other_range_list = $other->_range_list;
5978 foreach my $range ($other_range_list->ranges) {
5979 my $value = $range->value;
5980 next if $value eq "";
5981 $self_range_list->_add_delete('+',
5985 Type => $range->type,
5986 Replace => $UNCONDITIONALLY);
5992 sub set_default_map {
5993 # Define what code points that are missing from the input files should
5998 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6000 my $addr = do { no overloading; pack 'J', $self; };
6002 # Convert the input to the standard equivalent, if any (won't have any
6003 # for $STRING properties)
6004 my $standard = $self->_find_table_from_alias->{$map};
6005 $map = $standard->name if defined $standard;
6007 # Warn if there already is a non-equivalent default map for this
6008 # property. Note that a default map can be a ref, which means that
6009 # what it actually means is delayed until later in the program, and it
6010 # IS permissible to override it here without a message.
6011 my $default_map = $default_map{$addr};
6012 if (defined $default_map
6013 && ! ref($default_map)
6014 && $default_map ne $map
6015 && main::Standardize($map) ne $default_map)
6017 my $property = $self->property;
6018 my $map_table = $property->table($map);
6019 my $default_table = $property->table($default_map);
6020 if (defined $map_table
6021 && defined $default_table
6022 && $map_table != $default_table)
6024 Carp::my_carp("Changing the default mapping for "
6026 . " from $default_map to $map'");
6030 $default_map{$addr} = $map;
6032 # Don't also create any missing table for this map at this point,
6033 # because if we did, it could get done before the main table add is
6034 # done for PropValueAliases.txt; instead the caller will have to make
6035 # sure it exists, if desired.
6040 # Returns boolean: should we write this map table?
6043 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6045 my $addr = do { no overloading; pack 'J', $self; };
6047 # If overridden, use that
6048 return $to_output_map{$addr} if defined $to_output_map{$addr};
6050 my $full_name = $self->full_name;
6051 return $global_to_output_map{$full_name}
6052 if defined $global_to_output_map{$full_name};
6054 # If table says to output, do so; if says to suppress it, do so.
6055 my $fate = $self->fate;
6056 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6057 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6058 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6060 my $type = $self->property->type;
6062 # Don't want to output binary map tables even for debugging.
6063 return 0 if $type == $BINARY;
6065 # But do want to output string ones. All the ones that remain to
6066 # be dealt with (i.e. which haven't explicitly been set to external)
6067 # are for internal Perl use only. The default for those that map to
6068 # $CODE_POINT and haven't been restricted to a single element range
6069 # is to use the adjusted form.
6070 if ($type == $STRING) {
6071 return $INTERNAL_MAP if $self->range_size_1
6072 || $default_map{$addr} ne $CODE_POINT;
6073 return $OUTPUT_ADJUSTED;
6076 # Otherwise is an $ENUM, do output it, for Perl's purposes
6077 return $INTERNAL_MAP;
6081 # Returns a Range_List that is gaps of the current table. That is,
6085 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6087 my $current = Range_List->new(Initialize => $self->_range_list,
6088 Owner => $self->property);
6094 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6096 my $return = $self->SUPER::header();
6098 if ($self->to_output_map >= $INTERNAL_MAP) {
6099 $return .= $INTERNAL_ONLY_HEADER;
6102 my $property_name = $self->property->full_name =~ s/Legacy_//r;
6105 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
6107 # This file is for internal use by core Perl only. It is retained for
6108 # backwards compatibility with applications that may have come to rely on it,
6109 # but its format and even its name or existence are subject to change without
6110 # notice in a future Perl version. Don't use it directly. Instead, its
6111 # contents are now retrievable through a stable API in the Unicode::UCD
6112 # module: Unicode::UCD::prop_invmap('$property_name').
6118 sub set_final_comment {
6119 # Just before output, create the comment that heads the file
6120 # containing this table.
6122 return unless $debugging_build;
6125 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6127 # No sense generating a comment if aren't going to write it out.
6128 return if ! $self->to_output_map;
6130 my $addr = do { no overloading; pack 'J', $self; };
6132 my $property = $self->property;
6134 # Get all the possible names for this property. Don't use any that
6135 # aren't ok for use in a file name, etc. This is perhaps causing that
6136 # flag to do double duty, and may have to be changed in the future to
6137 # have our own flag for just this purpose; but it works now to exclude
6138 # Perl generated synonyms from the lists for properties, where the
6139 # name is always the proper Unicode one.
6140 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6142 my $count = $self->count;
6143 my $default_map = $default_map{$addr};
6145 # The ranges that map to the default aren't output, so subtract that
6146 # to get those actually output. A property with matching tables
6147 # already has the information calculated.
6148 if ($property->type != $STRING) {
6149 $count -= $property->table($default_map)->count;
6151 elsif (defined $default_map) {
6153 # But for $STRING properties, must calculate now. Subtract the
6154 # count from each range that maps to the default.
6155 foreach my $range ($self->_range_list->ranges) {
6156 if ($range->value eq $default_map) {
6157 $count -= $range->end +1 - $range->start;
6163 # Get a string version of $count with underscores in large numbers,
6165 my $string_count = main::clarify_number($count);
6167 my $code_points = ($count == 1)
6168 ? 'single code point'
6169 : "$string_count code points";
6174 if (@property_aliases <= 1) {
6175 $mapping = 'mapping';
6176 $these_mappings = 'this mapping';
6180 $mapping = 'synonymous mappings';
6181 $these_mappings = 'these mappings';
6185 if ($count >= $MAX_UNICODE_CODEPOINTS) {
6186 $cp = "any code point in Unicode Version $string_version";
6190 if ($default_map eq "") {
6191 $map_to = 'the null string';
6193 elsif ($default_map eq $CODE_POINT) {
6197 $map_to = "'$default_map'";
6200 $cp = "the single code point";
6203 $cp = "one of the $code_points";
6205 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6210 my $status = $self->status;
6211 if ($status && $status ne $PLACEHOLDER) {
6212 my $warn = uc $status_past_participles{$status};
6215 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6216 All property or property=value combinations contained in this file are $warn.
6217 See $unicode_reference_url for what this means.
6221 $comment .= "This file returns the $mapping:\n";
6223 my $ucd_accessible_name = "";
6224 my $full_name = $self->property->full_name;
6225 for my $i (0 .. @property_aliases - 1) {
6226 my $name = $property_aliases[$i]->name;
6227 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6228 if ($property_aliases[$i]->ucd) {
6229 if ($name eq $full_name) {
6230 $ucd_accessible_name = $full_name;
6232 elsif (! $ucd_accessible_name) {
6233 $ucd_accessible_name = $name;
6237 $comment .= "\nwhere 'cp' is $cp.";
6238 if ($ucd_accessible_name) {
6239 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6242 # And append any commentary already set from the actual property.
6243 $comment .= "\n\n" . $self->comment if $self->comment;
6244 if ($self->description) {
6245 $comment .= "\n\n" . join " ", $self->description;
6248 $comment .= "\n\n" . join " ", $self->note;
6252 if (! $self->perl_extension) {
6255 For information about what this property really means, see:
6256 $unicode_reference_url
6260 if ($count) { # Format differs for empty table
6261 $comment.= "\nThe format of the ";
6262 if ($self->range_size_1) {
6264 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6265 is in hex; MAPPING is what CODE_POINT maps to.
6270 # There are tables which end up only having one element per
6271 # range, but it is not worth keeping track of for making just
6272 # this comment a little better.
6274 non-comment portions of the main body of lines of this file is:
6275 START\\tSTOP\\tMAPPING where START is the starting code point of the
6276 range, in hex; STOP is the ending point, or if omitted, the range has just one
6277 code point; MAPPING is what each code point between START and STOP maps to.
6279 if ($self->output_range_counts) {
6281 Numbers in comments in [brackets] indicate how many code points are in the
6282 range (omitted when the range is a single code point or if the mapping is to
6288 $self->set_comment(main::join_lines($comment));
6292 my %swash_keys; # Makes sure don't duplicate swash names.
6294 # The remaining variables are temporaries used while writing each table,
6295 # to output special ranges.
6296 my @multi_code_point_maps; # Map is to more than one code point.
6298 sub handle_special_range {
6299 # Called in the middle of write when it finds a range it doesn't know
6304 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6306 my $addr = do { no overloading; pack 'J', $self; };
6308 my $type = $range->type;
6310 my $low = $range->start;
6311 my $high = $range->end;
6312 my $map = $range->value;
6314 # No need to output the range if it maps to the default.
6315 return if $map eq $default_map{$addr};
6317 my $property = $self->property;
6319 # Switch based on the map type...
6320 if ($type == $HANGUL_SYLLABLE) {
6322 # These are entirely algorithmically determinable based on
6323 # some constants furnished by Unicode; for now, just set a
6324 # flag to indicate that have them. After everything is figured
6325 # out, we will output the code that does the algorithm. (Don't
6326 # output them if not needed because we are suppressing this
6328 $has_hangul_syllables = 1 if $property->to_output_map;
6330 elsif ($type == $CP_IN_NAME) {
6332 # Code points whose name ends in their code point are also
6333 # algorithmically determinable, but need information about the map
6334 # to do so. Both the map and its inverse are stored in data
6335 # structures output in the file. They are stored in the mean time
6336 # in global lists The lists will be written out later into Name.pm,
6337 # which is created only if needed. In order to prevent duplicates
6338 # in the list, only add to them for one property, should multiple
6340 if ($needing_code_points_ending_in_code_point == 0) {
6341 $needing_code_points_ending_in_code_point = $property;
6343 if ($property == $needing_code_points_ending_in_code_point) {
6344 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6345 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6347 my $squeezed = $map =~ s/[-\s]+//gr;
6348 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6350 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6353 push @code_points_ending_in_code_point, { low => $low,
6359 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6361 # Multi-code point maps and null string maps have an entry
6362 # for each code point in the range. They use the same
6364 for my $code_point ($low .. $high) {
6366 # The pack() below can't cope with surrogates. XXX This may
6368 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6369 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
6373 # Generate the hash entries for these in the form that
6374 # utf8.c understands.
6378 foreach my $to (split " ", $map) {
6379 if ($to !~ /^$code_point_re$/) {
6380 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6383 $tostr .= sprintf "\\x{%s}", $to;
6384 $to = CORE::hex $to;
6386 $to_name .= " + " if $to_name;
6387 $to_chr .= chr($to);
6388 main::populate_char_info($to)
6389 if ! defined $viacode[$to];
6390 $to_name .= $viacode[$to];
6394 # I (khw) have never waded through this line to
6395 # understand it well enough to comment it.
6396 my $utf8 = sprintf(qq["%s" => "$tostr",],
6397 join("", map { sprintf "\\x%02X", $_ }
6398 unpack("U0C*", pack("U", $code_point))));
6400 # Add a comment so that a human reader can more easily
6401 # see what's going on.
6402 push @multi_code_point_maps,
6403 sprintf("%-45s # U+%04X", $utf8, $code_point);
6405 $multi_code_point_maps[-1] .= " => $map";
6408 main::populate_char_info($code_point)
6409 if ! defined $viacode[$code_point];
6410 $multi_code_point_maps[-1] .= " '"
6412 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6417 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6424 # Returns the string that should be output in the file before the main
6425 # body of this table. It isn't called until the main body is
6426 # calculated, saving a pass. The string includes some hash entries
6427 # identifying the format of the body, and what the single value should
6428 # be for all ranges missing from it. It also includes any code points
6429 # which have map_types that don't go in the main table.
6432 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6434 my $addr = do { no overloading; pack 'J', $self; };
6436 my $name = $self->property->swash_name;
6438 # Currently there is nothing in the pre_body unless a swash is being
6440 return unless defined $name;
6442 if (defined $swash_keys{$name}) {
6443 Carp::my_carp(main::join_lines(<<END
6444 Already created a swash name '$name' for $swash_keys{$name}. This means that
6445 the same name desired for $self shouldn't be used. Bad News. This must be
6446 fixed before production use, but proceeding anyway
6450 $swash_keys{$name} = "$self";
6454 # Here we assume we were called after have gone through the whole
6455 # file. If we actually generated anything for each map type, add its
6456 # respective header and trailer
6457 my $specials_name = "";
6458 if (@multi_code_point_maps) {
6459 $specials_name = "utf8::ToSpec$name";
6462 # Some code points require special handling because their mappings are each to
6463 # multiple code points. These do not appear in the main body, but are defined
6464 # in the hash below.
6466 # Each key is the string of N bytes that together make up the UTF-8 encoding
6467 # for the code point. (i.e. the same as looking at the code point's UTF-8
6468 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
6469 \%$specials_name = (
6471 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6474 my $format = $self->format;
6478 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6479 if ($output_adjusted) {
6480 if ($specials_name) {
6482 # The mappings in the non-hash portion of this file must be modified to get the
6483 # correct values by adding the code point ordinal number to each one that is
6489 # The mappings must be modified to get the correct values by adding the code
6490 # point ordinal number to each one that is numeric.
6497 # The name this swash is to be known by, with the format of the mappings in
6498 # the main body of the table, and what all code points missing from this file
6500 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6502 if ($specials_name) {
6504 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6507 my $default_map = $default_map{$addr};
6509 # For $CODE_POINT default maps and using adjustments, instead the default
6511 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6512 . (($output_adjusted && $default_map eq $CODE_POINT)
6517 if ($default_map eq $CODE_POINT) {
6518 $return .= ' # code point maps to itself';
6520 elsif ($default_map eq "") {
6521 $return .= ' # code point maps to the null string';
6525 $return .= $pre_body;
6531 # Write the table to the file.
6534 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6536 my $addr = do { no overloading; pack 'J', $self; };
6538 # Clear the temporaries
6539 undef @multi_code_point_maps;
6541 # Calculate the format of the table if not already done.
6542 my $format = $self->format;
6543 my $type = $self->property->type;
6544 my $default_map = $self->default_map;
6545 if (! defined $format) {
6546 if ($type == $BINARY) {
6548 # Don't bother checking the values, because we elsewhere
6549 # verify that a binary table has only 2 values.
6550 $format = $BINARY_FORMAT;
6553 my @ranges = $self->_range_list->ranges;
6555 # default an empty table based on its type and default map
6558 # But it turns out that the only one we can say is a
6559 # non-string (besides binary, handled above) is when the
6560 # table is a string and the default map is to a code point
6561 if ($type == $STRING && $default_map eq $CODE_POINT) {
6562 $format = $HEX_FORMAT;
6565 $format = $STRING_FORMAT;
6570 # Start with the most restrictive format, and as we find
6571 # something that doesn't fit with that, change to the next
6572 # most restrictive, and so on.
6573 $format = $DECIMAL_FORMAT;
6574 foreach my $range (@ranges) {
6575 next if $range->type != 0; # Non-normal ranges don't
6576 # affect the main body
6577 my $map = $range->value;
6578 if ($map ne $default_map) {
6579 last if $format eq $STRING_FORMAT; # already at
6582 $format = $INTEGER_FORMAT
6583 if $format eq $DECIMAL_FORMAT
6584 && $map !~ / ^ [0-9] $ /x;
6585 $format = $FLOAT_FORMAT
6586 if $format eq $INTEGER_FORMAT
6587 && $map !~ / ^ -? [0-9]+ $ /x;
6588 $format = $RATIONAL_FORMAT
6589 if $format eq $FLOAT_FORMAT
6590 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6591 $format = $HEX_FORMAT
6592 if ($format eq $RATIONAL_FORMAT
6594 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6595 # Assume a leading zero means hex,
6596 # even if all digits are 0-9
6597 || ($format eq $INTEGER_FORMAT
6598 && $map =~ /^0[0-9A-F]/);
6599 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6600 && $map =~ /[^0-9A-F]/;
6605 } # end of calculating format
6607 if ($default_map eq $CODE_POINT
6608 && $format ne $HEX_FORMAT
6609 && ! defined $self->format) # manual settings are always
6612 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6615 # If the output is to be adjusted, the format of the table that gets
6616 # output is actually 'a' instead of whatever it is stored internally
6618 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6619 if ($output_adjusted) {
6620 $format = $ADJUST_FORMAT;
6623 $self->_set_format($format);
6625 return $self->SUPER::write(
6627 ($self->property == $block)
6628 ? 7 # block file needs more tab stops
6630 $default_map); # don't write defaulteds
6633 # Accessors for the underlying list that should fail if locked.
6643 return if $self->carp_if_locked;
6644 return $self->_range_list->$sub(@_);
6647 } # End closure for Map_Table
6649 package Match_Table;
6650 use base '_Base_Table';
6652 # A Match table is one which is a list of all the code points that have
6653 # the same property and property value, for use in \p{property=value}
6654 # constructs in regular expressions. It adds very little data to the base
6655 # structure, but many methods, as these lists can be combined in many ways to
6657 # There are only a few concepts added:
6658 # 1) Equivalents and Relatedness.
6659 # Two tables can match the identical code points, but have different names.
6660 # This always happens when there is a perl single form extension
6661 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6662 # tables are set to be related, with the Perl extension being a child, and
6663 # the Unicode property being the parent.
6665 # It may be that two tables match the identical code points and we don't
6666 # know if they are related or not. This happens most frequently when the
6667 # Block and Script properties have the exact range. But note that a
6668 # revision to Unicode could add new code points to the script, which would
6669 # now have to be in a different block (as the block was filled, or there
6670 # would have been 'Unknown' script code points in it and they wouldn't have
6671 # been identical). So we can't rely on any two properties from Unicode
6672 # always matching the same code points from release to release, and thus
6673 # these tables are considered coincidentally equivalent--not related. When
6674 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
6675 # 'leader', and the others are 'equivalents'. This concept is useful
6676 # to minimize the number of tables written out. Only one file is used for
6677 # any identical set of code points, with entries in Heavy.pl mapping all
6678 # the involved tables to it.
6680 # Related tables will always be identical; we set them up to be so. Thus
6681 # if the Unicode one is deprecated, the Perl one will be too. Not so for
6682 # unrelated tables. Relatedness makes generating the documentation easier.
6685 # Like equivalents, two tables may be the inverses of each other, the
6686 # intersection between them is null, and the union is every Unicode code
6687 # point. The two tables that occupy a binary property are necessarily like
6688 # this. By specifying one table as the complement of another, we can avoid
6689 # storing it on disk (using the other table and performing a fast
6690 # transform), and some memory and calculations.
6692 # 3) Conflicting. It may be that there will eventually be name clashes, with
6693 # the same name meaning different things. For a while, there actually were
6694 # conflicts, but they have so far been resolved by changing Perl's or
6695 # Unicode's definitions to match the other, but when this code was written,
6696 # it wasn't clear that that was what was going to happen. (Unicode changed
6697 # because of protests during their beta period.) Name clashes are warned
6698 # about during compilation, and the documentation. The generated tables
6699 # are sane, free of name clashes, because the code suppresses the Perl
6700 # version. But manual intervention to decide what the actual behavior
6701 # should be may be required should this happen. The introductory comments
6702 # have more to say about this.
6704 sub standardize { return main::standardize($_[0]); }
6705 sub trace { return main::trace(@_); }
6710 main::setup_package();
6713 # The leader table of this one; initially $self.
6714 main::set_access('leader', \%leader, 'r');
6717 # An array of any tables that have this one as their leader
6718 main::set_access('equivalents', \%equivalents, 'readable_array');
6721 # The parent table to this one, initially $self. This allows us to
6722 # distinguish between equivalent tables that are related (for which this
6723 # is set to), and those which may not be, but share the same output file
6724 # because they match the exact same set of code points in the current
6726 main::set_access('parent', \%parent, 'r');
6729 # An array of any tables that have this one as their parent
6730 main::set_access('children', \%children, 'readable_array');
6733 # Array of any tables that would have the same name as this one with
6734 # a different meaning. This is used for the generated documentation.
6735 main::set_access('conflicting', \%conflicting, 'readable_array');
6738 # Set in the constructor for tables that are expected to match all code
6740 main::set_access('matches_all', \%matches_all, 'r');
6743 # Points to the complement that this table is expressed in terms of; 0 if
6745 main::set_access('complement', \%complement, 'r');
6752 # The property for which this table is a listing of property values.
6753 my $property = delete $args{'_Property'};
6755 my $name = delete $args{'Name'};
6756 my $full_name = delete $args{'Full_Name'};
6757 $full_name = $name if ! defined $full_name;
6760 my $initialize = delete $args{'Initialize'};
6761 my $matches_all = delete $args{'Matches_All'} || 0;
6762 my $format = delete $args{'Format'};
6763 # Rest of parameters passed on.
6765 my $range_list = Range_List->new(Initialize => $initialize,
6766 Owner => $property);
6768 my $complete = $full_name;
6769 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6770 # but this helps debug if it
6772 # The complete name for a match table includes it's property in a
6773 # compound form 'property=table', except if the property is the
6774 # pseudo-property, perl, in which case it is just the single form,
6775 # 'table' (If you change the '=' must also change the ':' in lots of
6776 # places in this program that assume an equal sign)
6777 $complete = $property->full_name . "=$complete" if $property != $perl;
6779 my $self = $class->SUPER::new(%args,
6781 Complete_Name => $complete,
6782 Full_Name => $full_name,
6783 _Property => $property,
6784 _Range_List => $range_list,
6785 Format => $EMPTY_FORMAT,
6787 my $addr = do { no overloading; pack 'J', $self; };
6789 $conflicting{$addr} = [ ];
6790 $equivalents{$addr} = [ ];
6791 $children{$addr} = [ ];
6792 $matches_all{$addr} = $matches_all;
6793 $leader{$addr} = $self;
6794 $parent{$addr} = $self;
6795 $complement{$addr} = 0;
6797 if (defined $format && $format ne $EMPTY_FORMAT) {
6798 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6804 # See this program's beginning comment block about overloading these.
6807 qw("") => "_operator_stringify",
6811 return if $self->carp_if_locked;
6819 return $self->_range_list + $other;
6825 return $self->_range_list & $other;
6830 my $reversed = shift;
6833 Carp::my_carp_bug("Bad news. Can't cope with '"
6837 . "'. undef returned.");
6841 return if $self->carp_if_locked;
6843 my $addr = do { no overloading; pack 'J', $self; };
6847 # Change the range list of this table to be the
6849 $self->_set_range_list($self->_range_list
6852 else { # $other is just a simple value
6853 $self->add_range($other, $other);
6861 return if $self->carp_if_locked;
6862 $self->_set_range_list($self->_range_list & $other);
6865 '-' => sub { my $self = shift;
6867 my $reversed = shift;
6870 Carp::my_carp_bug("Can't cope with a "
6872 . " being the first parameter in a '-'. Subtraction ignored.");
6876 return $self->_range_list - $other;
6878 '~' => sub { my $self = shift;
6879 return ~ $self->_range_list;
6883 sub _operator_stringify {
6886 my $name = $self->complete_name;
6887 return "Table '$name'";
6891 # Returns the range list associated with this table, which will be the
6892 # complement's if it has one.
6896 if (($complement = $self->complement) != 0) {
6897 return ~ $complement->_range_list;
6900 return $self->SUPER::_range_list;
6905 # Add a synonym for this table. See the comments in the base class
6909 # Rest of parameters passed on.
6911 $self->SUPER::add_alias($name, $self, @_);
6915 sub add_conflicting {
6916 # Add the name of some other object to the list of ones that name
6917 # clash with this match table.
6920 my $conflicting_name = shift; # The name of the conflicting object
6921 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6922 my $conflicting_object = shift; # Optional, the conflicting object
6923 # itself. This is used to
6924 # disambiguate the text if the input
6925 # name is identical to any of the
6926 # aliases $self is known by.
6927 # Sometimes the conflicting object is
6928 # merely hypothetical, so this has to
6929 # be an optional parameter.
6930 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6932 my $addr = do { no overloading; pack 'J', $self; };
6934 # Check if the conflicting name is exactly the same as any existing
6935 # alias in this table (as long as there is a real object there to
6936 # disambiguate with).
6937 if (defined $conflicting_object) {
6938 foreach my $alias ($self->aliases) {
6939 if ($alias->name eq $conflicting_name) {
6941 # Here, there is an exact match. This results in
6942 # ambiguous comments, so disambiguate by changing the
6943 # conflicting name to its object's complete equivalent.
6944 $conflicting_name = $conflicting_object->complete_name;
6950 # Convert to the \p{...} final name
6951 $conflicting_name = "\\$p" . "{$conflicting_name}";
6954 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6956 push @{$conflicting{$addr}}, $conflicting_name;
6961 sub is_set_equivalent_to {
6962 # Return boolean of whether or not the other object is a table of this
6963 # type and has been marked equivalent to this one.
6967 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6969 return 0 if ! defined $other; # Can happen for incomplete early
6971 unless ($other->isa(__PACKAGE__)) {
6972 my $ref_other = ref $other;
6973 my $ref_self = ref $self;
6974 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6978 # Two tables are equivalent if they have the same leader.
6980 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6984 sub set_equivalent_to {
6985 # Set $self equivalent to the parameter table.
6986 # The required Related => 'x' parameter is a boolean indicating
6987 # whether these tables are related or not. If related, $other becomes
6988 # the 'parent' of $self; if unrelated it becomes the 'leader'
6990 # Related tables share all characteristics except names; equivalents
6991 # not quite so many.
6992 # If they are related, one must be a perl extension. This is because
6993 # we can't guarantee that Unicode won't change one or the other in a
6994 # later release even if they are identical now.
7000 my $related = delete $args{'Related'};
7002 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7004 return if ! defined $other; # Keep on going; happens in some early
7007 if (! defined $related) {
7008 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7012 # If already are equivalent, no need to re-do it; if subroutine
7013 # returns null, it found an error, also do nothing
7014 my $are_equivalent = $self->is_set_equivalent_to($other);
7015 return if ! defined $are_equivalent || $are_equivalent;
7017 my $addr = do { no overloading; pack 'J', $self; };
7018 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7021 if ($current_leader->perl_extension) {
7022 if ($other->perl_extension) {
7023 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7026 } elsif ($self->property != $other->property # Depending on
7032 && ! $other->perl_extension)
7034 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
7039 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7040 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
7044 my $leader = do { no overloading; pack 'J', $current_leader; };
7045 my $other_addr = do { no overloading; pack 'J', $other; };
7047 # Any tables that are equivalent to or children of this table must now
7048 # instead be equivalent to or (children) to the new leader (parent),
7049 # still equivalent. The equivalency includes their matches_all info,
7050 # and for related tables, their fate and status.
7051 # All related tables are of necessity equivalent, but the converse
7052 # isn't necessarily true
7053 my $status = $other->status;
7054 my $status_info = $other->status_info;
7055 my $fate = $other->fate;
7056 my $matches_all = $matches_all{other_addr};
7057 my $caseless_equivalent = $other->caseless_equivalent;
7058 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7059 next if $table == $other;
7060 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7062 my $table_addr = do { no overloading; pack 'J', $table; };
7063 $leader{$table_addr} = $other;
7064 $matches_all{$table_addr} = $matches_all;
7065 $self->_set_range_list($other->_range_list);
7066 push @{$equivalents{$other_addr}}, $table;
7068 $parent{$table_addr} = $other;
7069 push @{$children{$other_addr}}, $table;
7070 $table->set_status($status, $status_info);
7072 # This reason currently doesn't get exposed outside; otherwise
7073 # would have to look up the parent's reason and use it instead.
7074 $table->set_fate($fate, "Parent's fate");
7076 $self->set_caseless_equivalent($caseless_equivalent);
7080 # Now that we've declared these to be equivalent, any changes to one
7081 # of the tables would invalidate that equivalency.
7087 sub set_complement {
7088 # Set $self to be the complement of the parameter table. $self is
7089 # locked, as what it contains should all come from the other table.
7095 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7097 if ($other->complement != 0) {
7098 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7101 my $addr = do { no overloading; pack 'J', $self; };
7102 $complement{$addr} = $other;
7107 sub add_range { # Add a range to the list for this table.
7109 # Rest of parameters passed on
7111 return if $self->carp_if_locked;
7112 return $self->_range_list->add_range(@_);
7117 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7119 # All match tables are to be used only by the Perl core.
7120 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7123 sub pre_body { # Does nothing for match tables.
7127 sub append_to_body { # Does nothing for match tables.
7135 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7137 $self->SUPER::set_fate($fate, $reason);
7139 # All children share this fate
7140 foreach my $child ($self->children) {
7141 $child->set_fate($fate, $reason);
7148 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7150 return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7153 sub set_final_comment {
7154 # This creates a comment for the file that is to hold the match table
7155 # $self. It is somewhat convoluted to make the English read nicely,
7156 # but, heh, it's just a comment.
7157 # This should be called only with the leader match table of all the
7158 # ones that share the same file. It lists all such tables, ordered so
7159 # that related ones are together.
7161 return unless $debugging_build;
7163 my $leader = shift; # Should only be called on the leader table of
7164 # an equivalent group
7165 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7167 my $addr = do { no overloading; pack 'J', $leader; };
7169 if ($leader{$addr} != $leader) {
7170 Carp::my_carp_bug(<<END
7171 set_final_comment() must be called on a leader table, which $leader is not.
7172 It is equivalent to $leader{$addr}. No comment created
7178 # Get the number of code points matched by each of the tables in this
7179 # file, and add underscores for clarity.
7180 my $count = $leader->count;
7181 my $string_count = main::clarify_number($count);
7183 my $loose_count = 0; # how many aliases loosely matched
7184 my $compound_name = ""; # ? Are any names compound?, and if so, an
7186 my $properties_with_compound_names = 0; # count of these
7189 my %flags; # The status flags used in the file
7190 my $total_entries = 0; # number of entries written in the comment
7191 my $matches_comment = ""; # The portion of the comment about the
7193 my @global_comments; # List of all the tables' comments that are
7194 # there before this routine was called.
7195 my $has_ucd_alias = 0; # If there is an alias that is accessible via
7196 # Unicode::UCD. If not, then don't say it is
7199 # Get list of all the parent tables that are equivalent to this one
7200 # (including itself).
7201 my @parents = grep { $parent{main::objaddr $_} == $_ }
7202 main::uniques($leader, @{$equivalents{$addr}});
7203 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
7206 for my $parent (@parents) {
7208 my $property = $parent->property;
7210 # Special case 'N' tables in properties with two match tables when
7211 # the other is a 'Y' one. These are likely to be binary tables,
7212 # but not necessarily. In either case, \P{} will match the
7213 # complement of \p{}, and so if something is a synonym of \p, the
7214 # complement of that something will be the synonym of \P. This
7215 # would be true of any property with just two match tables, not
7216 # just those whose values are Y and N; but that would require a
7217 # little extra work, and there are none such so far in Unicode.
7218 my $perl_p = 'p'; # which is it? \p{} or \P{}
7219 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
7221 if (scalar $property->tables == 2
7222 && $parent == $property->table('N')
7223 && defined (my $yes = $property->table('Y')))
7225 my $yes_addr = do { no overloading; pack 'J', $yes; };
7227 = grep { $_->property == $perl }
7230 $parent{$yes_addr}->children);
7232 # But these synonyms are \P{} ,not \p{}
7236 my @description; # Will hold the table description
7237 my @note; # Will hold the table notes.
7238 my @conflicting; # Will hold the table conflicts.
7240 # Look at the parent, any yes synonyms, and all the children
7241 my $parent_addr = do { no overloading; pack 'J', $parent; };
7242 for my $table ($parent,
7244 @{$children{$parent_addr}})
7246 my $table_addr = do { no overloading; pack 'J', $table; };
7247 my $table_property = $table->property;
7249 # Tables are separated by a blank line to create a grouping.
7250 $matches_comment .= "\n" if $matches_comment;
7252 # The table is named based on the property and value
7253 # combination it is for, like script=greek. But there may be
7254 # a number of synonyms for each side, like 'sc' for 'script',
7255 # and 'grek' for 'greek'. Any combination of these is a valid
7256 # name for this table. In this case, there are three more,
7257 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
7258 # listing all possible combinations in the comment, we make
7259 # sure that each synonym occurs at least once, and add
7260 # commentary that the other combinations are possible.
7261 # Because regular expressions don't recognize things like
7262 # \p{jsn=}, only look at non-null right-hand-sides
7263 my @property_aliases = $table_property->aliases;
7264 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7266 # The alias lists above are already ordered in the order we
7267 # want to output them. To ensure that each synonym is listed,
7268 # we must use the max of the two numbers. But if there are no
7269 # legal synonyms (nothing in @table_aliases), then we don't
7271 my $listed_combos = (@table_aliases)
7272 ? main::max(scalar @table_aliases,
7273 scalar @property_aliases)
7275 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7278 my $property_had_compound_name = 0;
7280 for my $i (0 .. $listed_combos - 1) {
7283 # The current alias for the property is the next one on
7284 # the list, or if beyond the end, start over. Similarly
7285 # for the table (\p{prop=table})
7286 my $property_alias = $property_aliases
7287 [$i % @property_aliases]->name;
7288 my $table_alias_object = $table_aliases
7289 [$i % @table_aliases];
7290 my $table_alias = $table_alias_object->name;
7291 my $loose_match = $table_alias_object->loose_match;
7292 $has_ucd_alias |= $table_alias_object->ucd;
7294 if ($table_alias !~ /\D/) { # Clarify large numbers.
7295 $table_alias = main::clarify_number($table_alias)
7298 # Add a comment for this alias combination
7299 my $current_match_comment;
7300 if ($table_property == $perl) {
7301 $current_match_comment = "\\$perl_p"
7305 $current_match_comment
7306 = "\\p{$property_alias=$table_alias}";
7307 $property_had_compound_name = 1;
7310 # Flag any abnormal status for this table.
7311 my $flag = $property->status
7313 || $table_alias_object->status;
7314 if ($flag && $flag ne $PLACEHOLDER) {
7315 $flags{$flag} = $status_past_participles{$flag};
7320 # Pretty up the comment. Note the \b; it says don't make
7321 # this line a continuation.
7322 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7325 $current_match_comment);
7326 } # End of generating the entries for this table.
7328 # Save these for output after this group of related tables.
7329 push @description, $table->description;
7330 push @note, $table->note;
7331 push @conflicting, $table->conflicting;
7333 # And this for output after all the tables.
7334 push @global_comments, $table->comment;
7336 # Compute an alternate compound name using the final property
7337 # synonym and the first table synonym with a colon instead of
7338 # the equal sign used elsewhere.
7339 if ($property_had_compound_name) {
7340 $properties_with_compound_names ++;
7341 if (! $compound_name || @property_aliases > 1) {
7342 $compound_name = $property_aliases[-1]->name
7344 . $table_aliases[0]->name;
7347 } # End of looping through all children of this table
7349 # Here have assembled in $matches_comment all the related tables
7350 # to the current parent (preceded by the same info for all the
7351 # previous parents). Put out information that applies to all of
7352 # the current family.
7355 # But output the conflicting information now, as it applies to
7357 my $conflicting = join ", ", @conflicting;
7359 $matches_comment .= <<END;
7361 Note that contrary to what you might expect, the above is NOT the same as
7363 $matches_comment .= "any of: " if @conflicting > 1;
7364 $matches_comment .= "$conflicting\n";
7368 $matches_comment .= "\n Meaning: "
7369 . join('; ', @description)
7373 $matches_comment .= "\n Note: "
7374 . join("\n ", @note)
7377 } # End of looping through all tables
7385 $code_points = 'single code point';
7389 $code_points = "$string_count code points";
7394 if ($total_entries == 1) {
7397 $any_of_these = 'this'
7400 $synonyms = " any of the following regular expression constructs";
7401 $entries = 'entries';
7402 $any_of_these = 'any of these'
7406 if ($has_ucd_alias) {
7407 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7409 if ($has_unrelated) {
7411 This file is for tables that are not necessarily related: To conserve
7412 resources, every table that matches the identical set of code points in this
7413 version of Unicode uses this file. Each one is listed in a separate group
7414 below. It could be that the tables will match the same set of code points in
7415 other Unicode releases, or it could be purely coincidence that they happen to
7416 be the same in Unicode $string_version, and hence may not in other versions.
7422 foreach my $flag (sort keys %flags) {
7424 '$flag' below means that this form is $flags{$flag}.
7425 Consult $pod_file.pod
7431 if ($total_entries == 0) {
7432 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7434 This file returns the $code_points in Unicode Version $string_version for
7435 $leader, but it is inaccessible through Perl regular expressions, as
7436 "\\p{prop=}" is not recognized.
7441 This file returns the $code_points in Unicode Version $string_version that
7445 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7446 including if adding or subtracting white space, underscore, and hyphen
7447 characters matters or doesn't matter, and other permissible syntactic
7448 variants. Upper/lower case distinctions never matter.
7452 if ($compound_name) {
7455 A colon can be substituted for the equals sign, and
7457 if ($properties_with_compound_names > 1) {
7459 within each group above,
7462 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7464 # Note the \b below, it says don't make that line a continuation.
7466 anything to the left of the equals (or colon) can be combined with anything to
7467 the right. Thus, for example,
7473 # And append any comment(s) from the actual tables. They are all
7474 # gathered here, so may not read all that well.
7475 if (@global_comments) {
7476 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7479 if ($count) { # The format differs if no code points, and needs no
7480 # explanation in that case
7483 The format of the lines of this file is:
7486 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7487 STOP is the ending point, or if omitted, the range has just one code point.
7489 if ($leader->output_range_counts) {
7491 Numbers in comments in [brackets] indicate how many code points are in the
7497 $leader->set_comment(main::join_lines($comment));
7501 # Accessors for the underlying list
7503 get_valid_code_point
7504 get_invalid_code_point
7512 return $self->_range_list->$sub(@_);
7515 } # End closure for Match_Table
7519 # The Property class represents a Unicode property, or the $perl
7520 # pseudo-property. It contains a map table initialized empty at construction
7521 # time, and for properties accessible through regular expressions, various
7522 # match tables, created through the add_match_table() method, and referenced
7523 # by the table('NAME') or tables() methods, the latter returning a list of all
7524 # of the match tables. Otherwise table operations implicitly are for the map
7527 # Most of the data in the property is actually about its map table, so it
7528 # mostly just uses that table's accessors for most methods. The two could
7529 # have been combined into one object, but for clarity because of their
7530 # differing semantics, they have been kept separate. It could be argued that
7531 # the 'file' and 'directory' fields should be kept with the map table.
7533 # Each property has a type. This can be set in the constructor, or in the
7534 # set_type accessor, but mostly it is figured out by the data. Every property
7535 # starts with unknown type, overridden by a parameter to the constructor, or
7536 # as match tables are added, or ranges added to the map table, the data is
7537 # inspected, and the type changed. After the table is mostly or entirely
7538 # filled, compute_type() should be called to finalize they analysis.
7540 # There are very few operations defined. One can safely remove a range from
7541 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7542 # table to this one, replacing any in the intersection of the two.
7544 sub standardize { return main::standardize($_[0]); }
7545 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7549 # This hash will contain as keys, all the aliases of all properties, and
7550 # as values, pointers to their respective property objects. This allows
7551 # quick look-up of a property from any of its names.
7552 my %alias_to_property_of;
7554 sub dump_alias_to_property_of {
7557 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7562 # This is a package subroutine, not called as a method.
7563 # If the single parameter is a literal '*' it returns a list of all
7564 # defined properties.
7565 # Otherwise, the single parameter is a name, and it returns a pointer
7566 # to the corresponding property object, or undef if none.
7568 # Properties can have several different names. The 'standard' form of
7569 # each of them is stored in %alias_to_property_of as they are defined.
7570 # But it's possible that this subroutine will be called with some
7571 # variant, so if the initial lookup fails, it is repeated with the
7572 # standardized form of the input name. If found, besides returning the
7573 # result, the input name is added to the list so future calls won't
7574 # have to do the conversion again.
7578 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7580 if (! defined $name) {
7581 Carp::my_carp_bug("Undefined input property. No action taken.");
7585 return main::uniques(values %alias_to_property_of) if $name eq '*';
7587 # Return cached result if have it.
7588 my $result = $alias_to_property_of{$name};
7589 return $result if defined $result;
7591 # Convert the input to standard form.
7592 my $standard_name = standardize($name);
7594 $result = $alias_to_property_of{$standard_name};
7595 return unless defined $result; # Don't cache undefs
7597 # Cache the result before returning it.
7598 $alias_to_property_of{$name} = $result;
7603 main::setup_package();
7606 # A pointer to the map table object for this property
7607 main::set_access('map', \%map);
7610 # The property's full name. This is a duplicate of the copy kept in the
7611 # map table, but is needed because stringify needs it during
7612 # construction of the map table, and then would have a chicken before egg
7614 main::set_access('full_name', \%full_name, 'r');
7617 # This hash will contain as keys, all the aliases of any match tables
7618 # attached to this property, and as values, the pointers to their
7619 # respective tables. This allows quick look-up of a table from any of its
7621 main::set_access('table_ref', \%table_ref);
7624 # The type of the property, $ENUM, $BINARY, etc
7625 main::set_access('type', \%type, 'r');
7628 # The filename where the map table will go (if actually written).
7629 # Normally defaulted, but can be overridden.
7630 main::set_access('file', \%file, 'r', 's');
7633 # The directory where the map table will go (if actually written).
7634 # Normally defaulted, but can be overridden.
7635 main::set_access('directory', \%directory, 's');
7637 my %pseudo_map_type;
7638 # This is used to affect the calculation of the map types for all the
7639 # ranges in the table. It should be set to one of the values that signify
7640 # to alter the calculation.
7641 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7643 my %has_only_code_point_maps;
7644 # A boolean used to help in computing the type of data in the map table.
7645 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7648 # A list of the first few distinct mappings this property has. This is
7649 # used to disambiguate between binary and enum property types, so don't
7650 # have to keep more than three.
7651 main::set_access('unique_maps', \%unique_maps);
7653 my %pre_declared_maps;
7654 # A boolean that gives whether the input data should declare all the
7655 # tables used, or not. If the former, unknown ones raise a warning.
7656 main::set_access('pre_declared_maps',
7657 \%pre_declared_maps, 'r', 's');
7660 # The only required parameter is the positionally first, name. All
7661 # other parameters are key => value pairs. See the documentation just
7662 # above for the meanings of the ones not passed directly on to the map
7663 # table constructor.
7666 my $name = shift || "";
7668 my $self = property_ref($name);
7669 if (defined $self) {
7670 my $options_string = join ", ", @_;
7671 $options_string = ". Ignoring options $options_string" if $options_string;
7672 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7678 $self = bless \do { my $anonymous_scalar }, $class;
7679 my $addr = do { no overloading; pack 'J', $self; };
7681 $directory{$addr} = delete $args{'Directory'};
7682 $file{$addr} = delete $args{'File'};
7683 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7684 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7685 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7686 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7687 # Starting in this release, property
7688 # values should be defined for all
7689 # properties, except those overriding this
7690 // $v_version ge v5.1.0;
7692 # Rest of parameters passed on.
7694 $has_only_code_point_maps{$addr} = 1;
7695 $table_ref{$addr} = { };
7696 $unique_maps{$addr} = { };
7698 $map{$addr} = Map_Table->new($name,
7699 Full_Name => $full_name{$addr},
7700 _Alias_Hash => \%alias_to_property_of,
7706 # See this program's beginning comment block about overloading the copy
7707 # constructor. Few operations are defined on properties, but a couple are
7708 # useful. It is safe to take the inverse of a property, and to remove a
7709 # single code point from it.
7712 qw("") => "_operator_stringify",
7713 "." => \&main::_operator_dot,
7714 ".=" => \&main::_operator_dot_equal,
7715 '==' => \&main::_operator_equal,
7716 '!=' => \&main::_operator_not_equal,
7717 '=' => sub { return shift },
7718 '-=' => "_minus_and_equal",
7721 sub _operator_stringify {
7722 return "Property '" . shift->full_name . "'";
7725 sub _minus_and_equal {
7726 # Remove a single code point from the map table of a property.
7730 my $reversed = shift;
7731 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7734 Carp::my_carp_bug("Can't cope with a "
7736 . " argument to '-='. Subtraction ignored.");
7739 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
7740 Carp::my_carp_bug("Can't cope with a "
7742 . " being the first parameter in a '-='. Subtraction ignored.");
7747 $map{pack 'J', $self}->delete_range($other, $other);
7752 sub add_match_table {
7753 # Add a new match table for this property, with name given by the
7754 # parameter. It returns a pointer to the table.
7760 my $addr = do { no overloading; pack 'J', $self; };
7762 my $table = $table_ref{$addr}{$name};
7763 my $standard_name = main::standardize($name);
7765 || (defined ($table = $table_ref{$addr}{$standard_name})))
7767 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7768 $table_ref{$addr}{$name} = $table;
7773 # See if this is a perl extension, if not passed in.
7774 my $perl_extension = delete $args{'Perl_Extension'};
7776 = $self->perl_extension if ! defined $perl_extension;
7778 $table = Match_Table->new(
7780 Perl_Extension => $perl_extension,
7781 _Alias_Hash => $table_ref{$addr},
7784 # gets property's fate and status by default
7785 Fate => $self->fate,
7786 Status => $self->status,
7787 _Status_Info => $self->status_info,
7789 return unless defined $table;
7792 # Save the names for quick look up
7793 $table_ref{$addr}{$standard_name} = $table;
7794 $table_ref{$addr}{$name} = $table;
7796 # Perhaps we can figure out the type of this property based on the
7797 # fact of adding this match table. First, string properties don't
7798 # have match tables; second, a binary property can't have 3 match
7800 if ($type{$addr} == $UNKNOWN) {
7801 $type{$addr} = $NON_STRING;
7803 elsif ($type{$addr} == $STRING) {
7804 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7805 $type{$addr} = $NON_STRING;
7807 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7808 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7809 && $type{$addr} == $BINARY)
7811 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7812 $type{$addr} = $ENUM;
7819 sub delete_match_table {
7820 # Delete the table referred to by $2 from the property $1.
7823 my $table_to_remove = shift;
7824 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7826 my $addr = do { no overloading; pack 'J', $self; };
7828 # Remove all names that refer to it.
7829 foreach my $key (keys %{$table_ref{$addr}}) {
7830 delete $table_ref{$addr}{$key}
7831 if $table_ref{$addr}{$key} == $table_to_remove;
7834 $table_to_remove->DESTROY;
7839 # Return a pointer to the match table (with name given by the
7840 # parameter) associated with this property; undef if none.
7844 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7846 my $addr = do { no overloading; pack 'J', $self; };
7848 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7850 # If quick look-up failed, try again using the standard form of the
7851 # input name. If that succeeds, cache the result before returning so
7852 # won't have to standardize this input name again.
7853 my $standard_name = main::standardize($name);
7854 return unless defined $table_ref{$addr}{$standard_name};
7856 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7857 return $table_ref{$addr}{$name};
7861 # Return a list of pointers to all the match tables attached to this
7865 return main::uniques(values %{$table_ref{pack 'J', shift}});
7869 # Returns the directory the map table for this property should be
7870 # output in. If a specific directory has been specified, that has
7871 # priority; 'undef' is returned if the type isn't defined;
7872 # or $map_directory for everything else.
7874 my $addr = do { no overloading; pack 'J', shift; };
7876 return $directory{$addr} if defined $directory{$addr};
7877 return undef if $type{$addr} == $UNKNOWN;
7878 return $map_directory;
7882 # Return the name that is used to both:
7883 # 1) Name the file that the map table is written to.
7884 # 2) The name of swash related stuff inside that file.
7885 # The reason for this is that the Perl core historically has used
7886 # certain names that aren't the same as the Unicode property names.
7887 # To continue using these, $file is hard-coded in this file for those,
7888 # but otherwise the standard name is used. This is different from the
7889 # external_name, so that the rest of the files, like in lib can use
7890 # the standard name always, without regard to historical precedent.
7893 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7895 my $addr = do { no overloading; pack 'J', $self; };
7897 # Swash names are used only on regular map tables; otherwise there
7898 # should be no access to the property map table from other parts of
7900 return if $map{$addr}->fate != $ORDINARY;
7902 return $file{$addr} if defined $file{$addr};
7903 return $map{$addr}->external_name;
7906 sub to_create_match_tables {
7907 # Returns a boolean as to whether or not match tables should be
7908 # created for this property.
7911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7913 # The whole point of this pseudo property is match tables.
7914 return 1 if $self == $perl;
7916 my $addr = do { no overloading; pack 'J', $self; };
7918 # Don't generate tables of code points that match the property values
7919 # of a string property. Such a list would most likely have many
7920 # property values, each with just one or very few code points mapping
7922 return 0 if $type{$addr} == $STRING;
7924 # Don't generate anything for unimplemented properties.
7925 return 0 if grep { $self->complete_name eq $_ }
7926 @unimplemented_properties;
7931 sub property_add_or_replace_non_nulls {
7932 # This adds the mappings in the property $other to $self. Non-null
7933 # mappings from $other override those in $self. It essentially merges
7934 # the two properties, with the second having priority except for null
7939 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7941 if (! $other->isa(__PACKAGE__)) {
7942 Carp::my_carp_bug("$other should be a "
7951 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7955 # Certain tables are not generally written out to files, but
7956 # Unicode::UCD has the intelligence to know that the file for $self
7957 # can be used to reconstruct those tables. This routine just changes
7958 # things so that UCD pod entries for those suppressed tables are
7959 # generated, so the fact that a proxy is used is invisible to the
7964 foreach my $property_name (@_) {
7965 my $ref = property_ref($property_name);
7966 next if $ref->to_output_map;
7967 $ref->set_fate($MAP_PROXIED);
7972 # Set the type of the property. Mostly this is figured out by the
7973 # data in the table. But this is used to set it explicitly. The
7974 # reason it is not a standard accessor is that when setting a binary
7975 # property, we need to make sure that all the true/false aliases are
7976 # present, as they were omitted in early Unicode releases.
7980 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7984 && $type != $FORCED_BINARY
7985 && $type != $STRING)
7987 Carp::my_carp("Unrecognized type '$type'. Type not set");
7991 { no overloading; $type{pack 'J', $self} = $type; }
7992 return if $type != $BINARY && $type != $FORCED_BINARY;
7994 my $yes = $self->table('Y');
7995 $yes = $self->table('Yes') if ! defined $yes;
7996 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7999 # Add aliases in order wanted, duplicates will be ignored. We use a
8000 # binary property present in all releases for its ordered lists of
8001 # true/false aliases. Note, that could run into problems in
8002 # outputting things in that we don't distinguish between the name and
8003 # full name of these. Hopefully, if the table was already created
8004 # before this code is executed, it was done with these set properly.
8005 my $bm = property_ref("Bidi_Mirrored");
8006 foreach my $alias ($bm->table("Y")->aliases) {
8007 $yes->add_alias($alias->name);
8009 my $no = $self->table('N');
8010 $no = $self->table('No') if ! defined $no;
8011 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8012 foreach my $alias ($bm->table("N")->aliases) {
8013 $no->add_alias($alias->name);
8020 # Add a map to the property's map table. This also keeps
8021 # track of the maps so that the property type can be determined from
8025 my $start = shift; # First code point in range
8026 my $end = shift; # Final code point in range
8027 my $map = shift; # What the range maps to.
8028 # Rest of parameters passed on.
8030 my $addr = do { no overloading; pack 'J', $self; };
8032 # If haven't the type of the property, gather information to figure it
8034 if ($type{$addr} == $UNKNOWN) {
8036 # If the map contains an interior blank or dash, or most other
8037 # nonword characters, it will be a string property. This
8038 # heuristic may actually miss some string properties. If so, they
8039 # may need to have explicit set_types called for them. This
8040 # happens in the Unihan properties.
8041 if ($map =~ / (?<= . ) [ -] (?= . ) /x
8042 || $map =~ / [^\w.\/\ -] /x)
8044 $self->set_type($STRING);
8046 # $unique_maps is used for disambiguating between ENUM and
8047 # BINARY later; since we know the property is not going to be
8048 # one of those, no point in keeping the data around
8049 undef $unique_maps{$addr};
8053 # Not necessarily a string. The final decision has to be
8054 # deferred until all the data are in. We keep track of if all
8055 # the values are code points for that eventual decision.
8056 $has_only_code_point_maps{$addr} &=
8057 $map =~ / ^ $code_point_re $/x;
8059 # For the purposes of disambiguating between binary and other
8060 # enumerations at the end, we keep track of the first three
8061 # distinct property values. Once we get to three, we know
8062 # it's not going to be binary, so no need to track more.
8063 if (scalar keys %{$unique_maps{$addr}} < 3) {
8064 $unique_maps{$addr}{main::standardize($map)} = 1;
8069 # Add the mapping by calling our map table's method
8070 return $map{$addr}->add_map($start, $end, $map, @_);
8074 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
8075 # should be called after the property is mostly filled with its maps.
8076 # We have been keeping track of what the property values have been,
8077 # and now have the necessary information to figure out the type.
8080 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8082 my $addr = do { no overloading; pack 'J', $self; };
8084 my $type = $type{$addr};
8086 # If already have figured these out, no need to do so again, but we do
8087 # a double check on ENUMS to make sure that a string property hasn't
8088 # improperly been classified as an ENUM, so continue on with those.
8089 return if $type == $STRING
8091 || $type == $FORCED_BINARY;
8093 # If every map is to a code point, is a string property.
8094 if ($type == $UNKNOWN
8095 && ($has_only_code_point_maps{$addr}
8096 || (defined $map{$addr}->default_map
8097 && $map{$addr}->default_map eq "")))
8099 $self->set_type($STRING);
8103 # Otherwise, it is to some sort of enumeration. (The case where
8104 # it is a Unicode miscellaneous property, and treated like a
8105 # string in this program is handled in add_map()). Distinguish
8106 # between binary and some other enumeration type. Of course, if
8107 # there are more than two values, it's not binary. But more
8108 # subtle is the test that the default mapping is defined means it
8109 # isn't binary. This in fact may change in the future if Unicode
8110 # changes the way its data is structured. But so far, no binary
8111 # properties ever have @missing lines for them, so the default map
8112 # isn't defined for them. The few properties that are two-valued
8113 # and aren't considered binary have the default map defined
8114 # starting in Unicode 5.0, when the @missing lines appeared; and
8115 # this program has special code to put in a default map for them
8116 # for earlier than 5.0 releases.
8118 || scalar keys %{$unique_maps{$addr}} > 2
8119 || defined $self->default_map)
8121 my $tables = $self->tables;
8122 my $count = $self->count;
8123 if ($verbosity && $count > 500 && $tables/$count > .1) {
8124 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
8126 $self->set_type($ENUM);
8129 $self->set_type($BINARY);
8132 undef $unique_maps{$addr}; # Garbage collect
8139 my $reason = shift; # Ignored unless suppressing
8140 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8142 my $addr = do { no overloading; pack 'J', $self; };
8143 if ($fate == $SUPPRESSED) {
8144 $why_suppressed{$self->complete_name} = $reason;
8147 # Each table shares the property's fate, except that MAP_PROXIED
8148 # doesn't affect match tables
8149 $map{$addr}->set_fate($fate, $reason);
8150 if ($fate != $MAP_PROXIED) {
8151 foreach my $table ($map{$addr}, $self->tables) {
8152 $table->set_fate($fate, $reason);
8159 # Most of the accessors for a property actually apply to its map table.
8160 # Setup up accessor functions for those, referring to %map
8209 # 'property' above is for symmetry, so that one can take
8210 # the property of a property and get itself, and so don't
8211 # have to distinguish between properties and tables in
8219 return $map{pack 'J', $self}->$sub(@_);
8229 # Returns lines of the input joined together, so that they can be folded
8231 # This causes continuation lines to be joined together into one long line
8232 # for folding. A continuation line is any line that doesn't begin with a
8233 # space or "\b" (the latter is stripped from the output). This is so
8234 # lines can be be in a HERE document so as to fit nicely in the terminal
8235 # width, but be joined together in one long line, and then folded with
8236 # indents, '#' prefixes, etc, properly handled.
8237 # A blank separates the joined lines except if there is a break; an extra
8238 # blank is inserted after a period ending a line.
8240 # Initialize the return with the first line.
8241 my ($return, @lines) = split "\n", shift;
8243 # If the first line is null, it was an empty line, add the \n back in
8244 $return = "\n" if $return eq "";
8246 # Now join the remainder of the physical lines.
8247 for my $line (@lines) {
8249 # An empty line means wanted a blank line, so add two \n's to get that
8250 # effect, and go to the next line.
8251 if (length $line == 0) {
8256 # Look at the last character of what we have so far.
8257 my $previous_char = substr($return, -1, 1);
8259 # And at the next char to be output.
8260 my $next_char = substr($line, 0, 1);
8262 if ($previous_char ne "\n") {
8264 # Here didn't end wth a nl. If the next char a blank or \b, it
8265 # means that here there is a break anyway. So add a nl to the
8267 if ($next_char eq " " || $next_char eq "\b") {
8268 $previous_char = "\n";
8269 $return .= $previous_char;
8272 # Add an extra space after periods.
8273 $return .= " " if $previous_char eq '.';
8276 # Here $previous_char is still the latest character to be output. If
8277 # it isn't a nl, it means that the next line is to be a continuation
8278 # line, with a blank inserted between them.
8279 $return .= " " if $previous_char ne "\n";
8282 substr($line, 0, 1) = "" if $next_char eq "\b";
8284 # And append this next line.
8291 sub simple_fold($;$$$) {
8292 # Returns a string of the input (string or an array of strings) folded
8293 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8295 # This is tailored for the kind of text written by this program,
8296 # especially the pod file, which can have very long names with
8297 # underscores in the middle, or words like AbcDefgHij.... We allow
8298 # breaking in the middle of such constructs if the line won't fit
8299 # otherwise. The break in such cases will come either just after an
8300 # underscore, or just before one of the Capital letters.
8302 local $to_trace = 0 if main::DEBUG;
8305 my $prefix = shift; # Optional string to prepend to each output
8307 $prefix = "" unless defined $prefix;
8309 my $hanging_indent = shift; # Optional number of spaces to indent
8310 # continuation lines
8311 $hanging_indent = 0 unless $hanging_indent;
8313 my $right_margin = shift; # Optional number of spaces to narrow the
8315 $right_margin = 0 unless defined $right_margin;
8317 # Call carp with the 'nofold' option to avoid it from trying to call us
8319 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8321 # The space available doesn't include what's automatically prepended
8322 # to each line, or what's reserved on the right.
8323 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8324 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8326 if (DEBUG && $hanging_indent >= $max) {
8327 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8328 $hanging_indent = 0;
8331 # First, split into the current physical lines.
8333 if (ref $line) { # Better be an array, because not bothering to
8335 foreach my $line (@{$line}) {
8336 push @line, split /\n/, $line;
8340 @line = split /\n/, $line;
8343 #local $to_trace = 1 if main::DEBUG;
8344 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8346 # Look at each current physical line.
8347 for (my $i = 0; $i < @line; $i++) {
8348 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8349 #local $to_trace = 1 if main::DEBUG;
8350 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8352 # Remove prefix, because will be added back anyway, don't want
8354 $line[$i] =~ s/^$prefix//;
8356 # Remove trailing space
8357 $line[$i] =~ s/\s+\Z//;
8359 # If the line is too long, fold it.
8360 if (length $line[$i] > $max) {
8363 # Here needs to fold. Save the leading space in the line for
8365 $line[$i] =~ /^ ( \s* )/x;
8366 my $leading_space = $1;
8367 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8369 # If character at final permissible position is white space,
8370 # fold there, which will delete that white space
8371 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8372 $remainder = substr($line[$i], $max);
8373 $line[$i] = substr($line[$i], 0, $max - 1);
8377 # Otherwise fold at an acceptable break char closest to
8378 # the max length. Look at just the maximal initial
8379 # segment of the line
8380 my $segment = substr($line[$i], 0, $max - 1);
8382 /^ ( .{$hanging_indent} # Don't look before the
8384 \ * # Don't look in leading
8385 # blanks past the indent
8386 [^ ] .* # Find the right-most
8387 (?: # acceptable break:
8388 [ \s = ] # space or equal
8389 | - (?! [.0-9] ) # or non-unary minus.
8390 ) # $1 includes the character
8393 # Split into the initial part that fits, and remaining
8395 $remainder = substr($line[$i], length $1);
8397 trace $line[$i] if DEBUG && $to_trace;
8398 trace $remainder if DEBUG && $to_trace;
8401 # If didn't find a good breaking spot, see if there is a
8402 # not-so-good breaking spot. These are just after
8403 # underscores or where the case changes from lower to
8404 # upper. Use \a as a soft hyphen, but give up
8405 # and don't break the line if there is actually a \a
8406 # already in the input. We use an ascii character for the
8407 # soft-hyphen to avoid any attempt by miniperl to try to
8408 # access the files that this program is creating.
8409 elsif ($segment !~ /\a/
8410 && ($segment =~ s/_/_\a/g
8411 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8413 # Here were able to find at least one place to insert
8414 # our substitute soft hyphen. Find the right-most one
8415 # and replace it by a real hyphen.
8416 trace $segment if DEBUG && $to_trace;
8418 rindex($segment, "\a"),
8421 # Then remove the soft hyphen substitutes.
8422 $segment =~ s/\a//g;
8423 trace $segment if DEBUG && $to_trace;
8425 # And split into the initial part that fits, and
8426 # remainder of the line
8427 my $pos = rindex($segment, '-');
8428 $remainder = substr($line[$i], $pos);
8429 trace $remainder if DEBUG && $to_trace;
8430 $line[$i] = substr($segment, 0, $pos + 1);
8434 # Here we know if we can fold or not. If we can, $remainder
8435 # is what remains to be processed in the next iteration.
8436 if (defined $remainder) {
8437 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8439 # Insert the folded remainder of the line as a new element
8440 # of the array. (It may still be too long, but we will
8441 # deal with that next time through the loop.) Omit any
8442 # leading space in the remainder.
8443 $remainder =~ s/^\s+//;
8444 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8446 # But then indent by whichever is larger of:
8447 # 1) the leading space on the input line;
8448 # 2) the hanging indent.
8449 # This preserves indentation in the original line.
8450 my $lead = ($leading_space)
8451 ? length $leading_space
8453 $lead = max($lead, $hanging_indent);
8454 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8458 # Ready to output the line. Get rid of any trailing space
8459 # And prefix by the required $prefix passed in.
8460 $line[$i] =~ s/\s+$//;
8461 $line[$i] = "$prefix$line[$i]\n";
8462 } # End of looping through all the lines.
8464 return join "", @line;
8467 sub property_ref { # Returns a reference to a property object.
8468 return Property::property_ref(@_);
8471 sub force_unlink ($) {
8472 my $filename = shift;
8473 return unless file_exists($filename);
8474 return if CORE::unlink($filename);
8476 # We might need write permission
8477 chmod 0777, $filename;
8478 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8483 # Given a filename and references to arrays of lines, write the lines of
8484 # each array to the file
8485 # Filename can be given as an arrayref of directory names
8487 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8490 my $use_utf8 = shift;
8492 # Get into a single string if an array, and get rid of, in Unix terms, any
8494 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8495 $file = File::Spec->canonpath($file);
8497 # If has directories, make sure that they all exist
8498 (undef, my $directories, undef) = File::Spec->splitpath($file);
8499 File::Path::mkpath($directories) if $directories && ! -d $directories;
8501 push @files_actually_output, $file;
8503 force_unlink ($file);
8506 if (not open $OUT, ">", $file) {
8507 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8511 binmode $OUT, ":utf8" if $use_utf8;
8513 while (defined (my $lines_ref = shift)) {
8514 unless (@$lines_ref) {
8515 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8518 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8520 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8522 print "$file written.\n" if $verbosity >= $VERBOSE;
8528 sub Standardize($) {
8529 # This converts the input name string into a standardized equivalent to
8533 unless (defined $name) {
8534 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8538 # Remove any leading or trailing white space
8542 # Convert interior white space and hyphens into underscores.
8543 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8545 # Capitalize the letter following an underscore, and convert a sequence of
8546 # multiple underscores to a single one
8547 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8549 # And capitalize the first letter, but not for the special cjk ones.
8550 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8554 sub standardize ($) {
8555 # Returns a lower-cased standardized name, without underscores. This form
8556 # is chosen so that it can distinguish between any real versus superficial
8557 # Unicode name differences. It relies on the fact that Unicode doesn't
8558 # have interior underscores, white space, nor dashes in any
8559 # stricter-matched name. It should not be used on Unicode code point
8560 # names (the Name property), as they mostly, but not always follow these
8563 my $name = Standardize(shift);
8564 return if !defined $name;
8566 $name =~ s/ (?<= .) _ (?= . ) //xg;
8570 sub utf8_heavy_name ($$) {
8571 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8572 # perhaps this function should be placed somewhere, like Heavy.pl so that
8573 # utf8_heavy can use it directly without duplicating code that can get
8578 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8580 my $property = $table->property;
8581 $property = ($property == $perl)
8582 ? "" # 'perl' is never explicitly stated
8583 : standardize($property->name) . '=';
8584 if ($alias->loose_match) {
8585 return $property . standardize($alias->name);
8588 return lc ($property . $alias->name);
8596 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8599 $main::simple_dumper_nesting = 0;
8602 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8603 # the real thing as we have to run under miniperl.
8605 # It is designed so that on input it is at the beginning of a line,
8606 # and the final thing output in any call is a trailing ",\n".
8610 $indent = "" if ! $debugging_build || ! defined $indent;
8612 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8614 # nesting level is localized, so that as the call stack pops, it goes
8615 # back to the prior value.
8616 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8617 undef %already_output if $main::simple_dumper_nesting == 0;
8618 $main::simple_dumper_nesting++;
8619 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8621 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8623 # Determine the indent for recursive calls.
8624 my $next_indent = $indent . $indent_increment;
8629 # Dump of scalar: just output it in quotes if not a number. To do
8630 # so we must escape certain characters, and therefore need to
8631 # operate on a copy to avoid changing the original
8633 $copy = $UNDEF unless defined $copy;
8635 # Quote non-integers (integers also have optional leading '-')
8636 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8638 # Escape apostrophe and backslash
8639 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8642 $output = "$indent$copy,\n";
8646 # Keep track of cycles in the input, and refuse to infinitely loop
8647 my $addr = do { no overloading; pack 'J', $item; };
8648 if (defined $already_output{$addr}) {
8649 return "${indent}ALREADY OUTPUT: $item\n";
8651 $already_output{$addr} = $item;
8653 if (ref $item eq 'ARRAY') {
8656 if ($main::simple_dumper_nesting > 1) {
8658 $using_brackets = 1;
8661 $using_brackets = 0;
8664 # If the array is empty, put the closing bracket on the same
8665 # line. Otherwise, recursively add each array element
8671 for (my $i = 0; $i < @$item; $i++) {
8673 # Indent array elements one level
8674 $output .= &simple_dumper($item->[$i], $next_indent);
8675 next if ! $debugging_build;
8676 $output =~ s/\n$//; # Remove any trailing nl so
8677 $output .= " # [$i]\n"; # as to add a comment giving
8680 $output .= $indent; # Indent closing ']' to orig level
8682 $output .= ']' if $using_brackets;
8685 elsif (ref $item eq 'HASH') {
8690 # No surrounding braces at top level
8692 if ($main::simple_dumper_nesting > 1) {
8695 $body_indent = $next_indent;
8696 $next_indent .= $indent_increment;
8701 $body_indent = $indent;
8705 # Output hashes sorted alphabetically instead of apparently
8706 # random. Use caseless alphabetic sort
8707 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8709 if ($is_first_line) {
8713 $output .= "$body_indent";
8716 # The key must be a scalar, but this recursive call quotes
8718 $output .= &simple_dumper($key);
8720 # And change the trailing comma and nl to the hash fat
8721 # comma for clarity, and so the value can be on the same
8723 $output =~ s/,\n$/ => /;
8725 # Recursively call to get the value's dump.
8726 my $next = &simple_dumper($item->{$key}, $next_indent);
8728 # If the value is all on one line, remove its indent, so
8729 # will follow the => immediately. If it takes more than
8730 # one line, start it on a new line.
8731 if ($next !~ /\n.*\n/) {
8740 $output .= "$indent},\n" if $using_braces;
8742 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8743 $output = $indent . ref($item) . "\n";
8744 # XXX see if blessed
8746 elsif ($item->can('dump')) {
8748 # By convention in this program, objects furnish a 'dump'
8749 # method. Since not doing any output at this level, just pass
8750 # on the input indent
8751 $output = $item->dump($indent);
8754 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8761 sub dump_inside_out {
8762 # Dump inside-out hashes in an object's state by converting them to a
8763 # regular hash and then calling simple_dumper on that.
8766 my $fields_ref = shift;
8767 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8769 my $addr = do { no overloading; pack 'J', $object; };
8772 foreach my $key (keys %$fields_ref) {
8773 $hash{$key} = $fields_ref->{$key}{$addr};
8776 return simple_dumper(\%hash, @_);
8780 # Overloaded '.' method that is common to all packages. It uses the
8781 # package's stringify method.
8785 my $reversed = shift;
8786 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8788 $other = "" unless defined $other;
8790 foreach my $which (\$self, \$other) {
8791 next unless ref $$which;
8792 if ($$which->can('_operator_stringify')) {
8793 $$which = $$which->_operator_stringify;
8796 my $ref = ref $$which;
8797 my $addr = do { no overloading; pack 'J', $$which; };
8798 $$which = "$ref ($addr)";
8806 sub _operator_dot_equal {
8807 # Overloaded '.=' method that is common to all packages.
8811 my $reversed = shift;
8812 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8814 $other = "" unless defined $other;
8817 return $other .= "$self";
8820 return "$self" . "$other";
8824 sub _operator_equal {
8825 # Generic overloaded '==' routine. To be equal, they must be the exact
8831 return 0 unless defined $other;
8832 return 0 unless ref $other;
8834 return $self == $other;
8837 sub _operator_not_equal {
8841 return ! _operator_equal($self, $other);
8844 sub process_PropertyAliases($) {
8845 # This reads in the PropertyAliases.txt file, which contains almost all
8846 # the character properties in Unicode and their equivalent aliases:
8847 # scf ; Simple_Case_Folding ; sfc
8849 # Field 0 is the preferred short name for the property.
8850 # Field 1 is the full name.
8851 # Any succeeding ones are other accepted names.
8854 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8856 # This whole file was non-existent in early releases, so use our own
8858 $file->insert_lines(get_old_property_aliases())
8859 if ! -e 'PropertyAliases.txt';
8861 # Add any cjk properties that may have been defined.
8862 $file->insert_lines(@cjk_properties);
8864 while ($file->next_line) {
8866 my @data = split /\s*;\s*/;
8868 my $full = $data[1];
8870 my $this = Property->new($data[0], Full_Name => $full);
8872 # Start looking for more aliases after these two.
8873 for my $i (2 .. @data - 1) {
8874 $this->add_alias($data[$i]);
8881 sub finish_property_setup {
8882 # Finishes setting up after PropertyAliases.
8885 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8887 # This entry was missing from this file in earlier Unicode versions
8888 if (-e 'Jamo.txt') {
8889 my $jsn = property_ref('JSN');
8890 if (! defined $jsn) {
8891 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8895 # These are used so much, that we set globals for them.
8896 $gc = property_ref('General_Category');
8897 $block = property_ref('Block');
8898 $script = property_ref('Script');
8900 # Perl adds this alias.
8901 $gc->add_alias('Category');
8903 # Unicode::Normalize expects this file with this name and directory.
8904 my $ccc = property_ref('Canonical_Combining_Class');
8906 $ccc->set_file('CombiningClass');
8907 $ccc->set_directory(File::Spec->curdir());
8910 # These two properties aren't actually used in the core, but unfortunately
8911 # the names just above that are in the core interfere with these, so
8912 # choose different names. These aren't a problem unless the map tables
8913 # for these files get written out.
8914 my $lowercase = property_ref('Lowercase');
8915 $lowercase->set_file('IsLower') if defined $lowercase;
8916 my $uppercase = property_ref('Uppercase');
8917 $uppercase->set_file('IsUpper') if defined $uppercase;
8919 # Set up the hard-coded default mappings, but only on properties defined
8921 foreach my $property (keys %default_mapping) {
8922 my $property_object = property_ref($property);
8923 next if ! defined $property_object;
8924 my $default_map = $default_mapping{$property};
8925 $property_object->set_default_map($default_map);
8927 # A map of <code point> implies the property is string.
8928 if ($property_object->type == $UNKNOWN
8929 && $default_map eq $CODE_POINT)
8931 $property_object->set_type($STRING);
8935 # The following use the Multi_Default class to create objects for
8938 # Bidi class has a complicated default, but the derived file takes care of
8939 # the complications, leaving just 'L'.
8940 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8941 property_ref('Bidi_Class')->set_default_map('L');
8946 # The derived file was introduced in 3.1.1. The values below are
8947 # taken from table 3-8, TUS 3.0
8949 'my $default = Range_List->new;
8950 $default->add_range(0x0590, 0x05FF);
8951 $default->add_range(0xFB1D, 0xFB4F);'
8954 # The defaults apply only to unassigned characters
8955 $default_R .= '$gc->table("Unassigned") & $default;';
8957 if ($v_version lt v3.0.0) {
8958 $default = Multi_Default->new(R => $default_R, 'L');
8962 # AL apparently not introduced until 3.0: TUS 2.x references are
8963 # not on-line to check it out
8965 'my $default = Range_List->new;
8966 $default->add_range(0x0600, 0x07BF);
8967 $default->add_range(0xFB50, 0xFDFF);
8968 $default->add_range(0xFE70, 0xFEFF);'
8971 # Non-character code points introduced in this release; aren't AL
8972 if ($v_version ge 3.1.0) {
8973 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8975 $default_AL .= '$gc->table("Unassigned") & $default';
8976 $default = Multi_Default->new(AL => $default_AL,
8980 property_ref('Bidi_Class')->set_default_map($default);
8983 # Joining type has a complicated default, but the derived file takes care
8984 # of the complications, leaving just 'U' (or Non_Joining), except the file
8986 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8987 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8988 property_ref('Joining_Type')->set_default_map('Non_Joining');
8992 # Otherwise, there are not one, but two possibilities for the
8993 # missing defaults: T and U.
8994 # The missing defaults that evaluate to T are given by:
8995 # T = Mn + Cf - ZWNJ - ZWJ
8996 # where Mn and Cf are the general category values. In other words,
8997 # any non-spacing mark or any format control character, except
8998 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8999 # WIDTH JOINER (joining type C).
9000 my $default = Multi_Default->new(
9001 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9003 property_ref('Joining_Type')->set_default_map($default);
9007 # Line break has a complicated default in early releases. It is 'Unknown'
9008 # for non-assigned code points; 'AL' for assigned.
9009 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9010 my $lb = property_ref('Line_Break');
9011 if ($v_version gt 3.2.0) {
9012 $lb->set_default_map('Unknown');
9015 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9017 $lb->set_default_map($default);
9020 # If has the URS property, make sure that the standard aliases are in
9021 # it, since not in the input tables in some versions.
9022 my $urs = property_ref('Unicode_Radical_Stroke');
9024 $urs->add_alias('cjkRSUnicode');
9025 $urs->add_alias('kRSUnicode');
9029 # For backwards compatibility with applications that may read the mapping
9030 # file directly (it was documented in 5.12 and 5.14 as being thusly
9031 # usable), keep it from being adjusted. (range_size_1 is
9032 # used to force the traditional format.)
9033 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9034 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9035 $nfkc_cf->set_range_size_1(1);
9037 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9038 $bmg->set_to_output_map($EXTERNAL_MAP);
9039 $bmg->set_range_size_1(1);
9042 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9047 sub get_old_property_aliases() {
9048 # Returns what would be in PropertyAliases.txt if it existed in very old
9049 # versions of Unicode. It was derived from the one in 3.2, and pared
9050 # down based on the data that was actually in the older releases.
9051 # An attempt was made to use the existence of files to mean inclusion or
9052 # not of various aliases, but if this was not sufficient, using version
9053 # numbers was resorted to.
9057 # These are to be used in all versions (though some are constructed by
9058 # this program if missing)
9059 push @return, split /\n/, <<'END';
9061 Bidi_M ; Bidi_Mirrored
9063 ccc ; Canonical_Combining_Class
9064 dm ; Decomposition_Mapping
9065 dt ; Decomposition_Type
9066 gc ; General_Category
9068 lc ; Lowercase_Mapping
9070 na1 ; Unicode_1_Name
9073 sfc ; Simple_Case_Folding
9074 slc ; Simple_Lowercase_Mapping
9075 stc ; Simple_Titlecase_Mapping
9076 suc ; Simple_Uppercase_Mapping
9077 tc ; Titlecase_Mapping
9078 uc ; Uppercase_Mapping
9081 if (-e 'Blocks.txt') {
9082 push @return, "blk ; Block\n";
9084 if (-e 'ArabicShaping.txt') {
9085 push @return, split /\n/, <<'END';
9090 if (-e 'PropList.txt') {
9092 # This first set is in the original old-style proplist.
9093 push @return, split /\n/, <<'END';
9095 Bidi_C ; Bidi_Control
9103 Join_C ; Join_Control
9105 QMark ; Quotation_Mark
9106 Term ; Terminal_Punctuation
9107 WSpace ; White_Space
9109 # The next sets were added later
9110 if ($v_version ge v3.0.0) {
9111 push @return, split /\n/, <<'END';
9116 if ($v_version ge v3.0.1) {
9117 push @return, split /\n/, <<'END';
9118 NChar ; Noncharacter_Code_Point
9121 # The next sets were added in the new-style
9122 if ($v_version ge v3.1.0) {
9123 push @return, split /\n/, <<'END';
9124 OAlpha ; Other_Alphabetic
9125 OLower ; Other_Lowercase
9127 OUpper ; Other_Uppercase
9130 if ($v_version ge v3.1.1) {
9131 push @return, "AHex ; ASCII_Hex_Digit\n";
9134 if (-e 'EastAsianWidth.txt') {
9135 push @return, "ea ; East_Asian_Width\n";
9137 if (-e 'CompositionExclusions.txt') {
9138 push @return, "CE ; Composition_Exclusion\n";
9140 if (-e 'LineBreak.txt') {
9141 push @return, "lb ; Line_Break\n";
9143 if (-e 'BidiMirroring.txt') {
9144 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
9146 if (-e 'Scripts.txt') {
9147 push @return, "sc ; Script\n";
9149 if (-e 'DNormalizationProps.txt') {
9150 push @return, split /\n/, <<'END';
9151 Comp_Ex ; Full_Composition_Exclusion
9152 FC_NFKC ; FC_NFKC_Closure
9153 NFC_QC ; NFC_Quick_Check
9154 NFD_QC ; NFD_Quick_Check
9155 NFKC_QC ; NFKC_Quick_Check
9156 NFKD_QC ; NFKD_Quick_Check
9157 XO_NFC ; Expands_On_NFC
9158 XO_NFD ; Expands_On_NFD
9159 XO_NFKC ; Expands_On_NFKC
9160 XO_NFKD ; Expands_On_NFKD
9163 if (-e 'DCoreProperties.txt') {
9164 push @return, split /\n/, <<'END';
9169 # These can also appear in some versions of PropList.txt
9170 push @return, "Lower ; Lowercase\n"
9171 unless grep { $_ =~ /^Lower\b/} @return;
9172 push @return, "Upper ; Uppercase\n"
9173 unless grep { $_ =~ /^Upper\b/} @return;
9176 # This flag requires the DAge.txt file to be copied into the directory.
9177 if (DEBUG && $compare_versions) {
9178 push @return, 'age ; Age';
9184 sub process_PropValueAliases {
9185 # This file contains values that properties look like:
9186 # bc ; AL ; Arabic_Letter
9187 # blk; n/a ; Greek_And_Coptic ; Greek
9189 # Field 0 is the property.
9190 # Field 1 is the short name of a property value or 'n/a' if no
9191 # short name exists;
9192 # Field 2 is the full property value name;
9193 # Any other fields are more synonyms for the property value.
9194 # Purely numeric property values are omitted from the file; as are some
9195 # others, fewer and fewer in later releases
9197 # Entries for the ccc property have an extra field before the
9199 # ccc; 0; NR ; Not_Reordered
9200 # It is the numeric value that the names are synonyms for.
9202 # There are comment entries for values missing from this file:
9203 # # @missing: 0000..10FFFF; ISO_Comment; <none>
9204 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9207 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9209 # This whole file was non-existent in early releases, so use our own
9210 # internal one if necessary.
9211 if (! -e 'PropValueAliases.txt') {
9212 $file->insert_lines(get_old_property_value_aliases());
9215 # Add any explicit cjk values
9216 $file->insert_lines(@cjk_property_values);
9218 # This line is used only for testing the code that checks for name
9219 # conflicts. There is a script Inherited, and when this line is executed
9220 # it causes there to be a name conflict with the 'Inherited' that this
9221 # program generates for this block property value
9222 #$file->insert_lines('blk; n/a; Herited');
9225 # Process each line of the file ...
9226 while ($file->next_line) {
9228 # Fix typo in input file
9229 s/CCC133/CCC132/g if $v_version eq v6.1.0;
9231 my ($property, @data) = split /\s*;\s*/;
9233 # The ccc property has an extra field at the beginning, which is the
9234 # numeric value. Move it to be after the other two, mnemonic, fields,
9235 # so that those will be used as the property value's names, and the
9236 # number will be an extra alias. (Rightmost splice removes field 1-2,
9237 # returning them in a slice; left splice inserts that before anything,
9238 # thus shifting the former field 0 to after them.)
9239 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9241 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
9242 # there is no short name, use the full one in element 1
9243 if ($data[0] eq "n/a") {
9244 $data[0] = $data[1];
9246 elsif ($data[0] ne $data[1]
9247 && standardize($data[0]) eq standardize($data[1])
9248 && $data[1] !~ /[[:upper:]]/)
9250 # Also, there is a bug in the file in which "n/a" is omitted, and
9251 # the two fields are identical except for case, and the full name
9252 # is all lower case. Copy the "short" name unto the full one to
9253 # give it some upper case.
9255 $data[1] = $data[0];
9258 # Earlier releases had the pseudo property 'qc' that should expand to
9259 # the ones that replace it below.
9260 if ($property eq 'qc') {
9261 if (lc $data[0] eq 'y') {
9262 $file->insert_lines('NFC_QC; Y ; Yes',
9268 elsif (lc $data[0] eq 'n') {
9269 $file->insert_lines('NFC_QC; N ; No',
9275 elsif (lc $data[0] eq 'm') {
9276 $file->insert_lines('NFC_QC; M ; Maybe',
9277 'NFKC_QC; M ; Maybe',
9281 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9286 # The first field is the short name, 2nd is the full one.
9287 my $property_object = property_ref($property);
9288 my $table = $property_object->add_match_table($data[0],
9289 Full_Name => $data[1]);
9291 # Start looking for more aliases after these two.
9292 for my $i (2 .. @data - 1) {
9293 $table->add_alias($data[$i]);
9295 } # End of looping through the file
9297 # As noted in the comments early in the program, it generates tables for
9298 # the default values for all releases, even those for which the concept
9299 # didn't exist at the time. Here we add those if missing.
9300 my $age = property_ref('age');
9301 if (defined $age && ! defined $age->table('Unassigned')) {
9302 $age->add_match_table('Unassigned');
9304 $block->add_match_table('No_Block') if -e 'Blocks.txt'
9305 && ! defined $block->table('No_Block');
9308 # Now set the default mappings of the properties from the file. This is
9309 # done after the loop because a number of properties have only @missings
9310 # entries in the file, and may not show up until the end.
9311 my @defaults = $file->get_missings;
9312 foreach my $default_ref (@defaults) {
9313 my $default = $default_ref->[0];
9314 my $property = property_ref($default_ref->[1]);
9315 $property->set_default_map($default);
9320 sub get_old_property_value_aliases () {
9321 # Returns what would be in PropValueAliases.txt if it existed in very old
9322 # versions of Unicode. It was derived from the one in 3.2, and pared
9323 # down. An attempt was made to use the existence of files to mean
9324 # inclusion or not of various aliases, but if this was not sufficient,
9325 # using version numbers was resorted to.
9327 my @return = split /\n/, <<'END';
9328 bc ; AN ; Arabic_Number
9329 bc ; B ; Paragraph_Separator
9330 bc ; CS ; Common_Separator
9331 bc ; EN ; European_Number
9332 bc ; ES ; European_Separator
9333 bc ; ET ; European_Terminator
9334 bc ; L ; Left_To_Right
9335 bc ; ON ; Other_Neutral
9336 bc ; R ; Right_To_Left
9337 bc ; WS ; White_Space
9339 # The standard combining classes are very much different in v1, so only use
9340 # ones that look right (not checked thoroughly)
9341 ccc; 0; NR ; Not_Reordered
9342 ccc; 1; OV ; Overlay
9344 ccc; 8; KV ; Kana_Voicing
9346 ccc; 202; ATBL ; Attached_Below_Left
9347 ccc; 216; ATAR ; Attached_Above_Right
9348 ccc; 218; BL ; Below_Left
9350 ccc; 222; BR ; Below_Right
9352 ccc; 228; AL ; Above_Left
9354 ccc; 232; AR ; Above_Right
9355 ccc; 234; DA ; Double_Above
9357 dt ; can ; canonical
9371 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9373 gc ; Cn ; Unassigned
9374 gc ; Co ; Private_Use
9375 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9376 gc ; LC ; Cased_Letter # Ll | Lt | Lu
9377 gc ; Ll ; Lowercase_Letter
9378 gc ; Lm ; Modifier_Letter
9379 gc ; Lo ; Other_Letter
9380 gc ; Lu ; Uppercase_Letter
9381 gc ; M ; Mark # Mc | Me | Mn
9382 gc ; Mc ; Spacing_Mark
9383 gc ; Mn ; Nonspacing_Mark
9384 gc ; N ; Number # Nd | Nl | No
9385 gc ; Nd ; Decimal_Number
9386 gc ; No ; Other_Number
9387 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9388 gc ; Pd ; Dash_Punctuation
9389 gc ; Pe ; Close_Punctuation
9390 gc ; Po ; Other_Punctuation
9391 gc ; Ps ; Open_Punctuation
9392 gc ; S ; Symbol # Sc | Sk | Sm | So
9393 gc ; Sc ; Currency_Symbol
9394 gc ; Sm ; Math_Symbol
9395 gc ; So ; Other_Symbol
9396 gc ; Z ; Separator # Zl | Zp | Zs
9397 gc ; Zl ; Line_Separator
9398 gc ; Zp ; Paragraph_Separator
9399 gc ; Zs ; Space_Separator
9407 if (-e 'ArabicShaping.txt') {
9408 push @return, split /\n/, <<'END';
9415 jg ; n/a ; NO_JOINING_GROUP
9423 jt ; C ; Join_Causing
9424 jt ; D ; Dual_Joining
9425 jt ; L ; Left_Joining
9426 jt ; R ; Right_Joining
9427 jt ; U ; Non_Joining
9428 jt ; T ; Transparent
9430 if ($v_version ge v3.0.0) {
9431 push @return, split /\n/, <<'END';
9435 jg ; n/a ; DALATH_RISH
9438 jg ; n/a ; FINAL_SEMKATH
9441 jg ; n/a ; HAMZA_ON_HEH_GOAL
9448 jg ; n/a ; KNOTTED_HEH
9455 jg ; n/a ; REVERSED_PE
9459 jg ; n/a ; SWASH_KAF
9461 jg ; n/a ; TEH_MARBUTA
9464 jg ; n/a ; YEH_BARREE
9465 jg ; n/a ; YEH_WITH_TAIL
9474 if (-e 'EastAsianWidth.txt') {
9475 push @return, split /\n/, <<'END';
9485 if (-e 'LineBreak.txt') {
9486 push @return, split /\n/, <<'END';
9488 lb ; AL ; Alphabetic
9489 lb ; B2 ; Break_Both
9490 lb ; BA ; Break_After
9491 lb ; BB ; Break_Before
9492 lb ; BK ; Mandatory_Break
9493 lb ; CB ; Contingent_Break
9494 lb ; CL ; Close_Punctuation
9495 lb ; CM ; Combining_Mark
9496 lb ; CR ; Carriage_Return
9497 lb ; EX ; Exclamation
9500 lb ; ID ; Ideographic
9501 lb ; IN ; Inseperable
9502 lb ; IS ; Infix_Numeric
9504 lb ; NS ; Nonstarter
9506 lb ; OP ; Open_Punctuation
9507 lb ; PO ; Postfix_Numeric
9508 lb ; PR ; Prefix_Numeric
9510 lb ; SA ; Complex_Context
9513 lb ; SY ; Break_Symbols
9519 if (-e 'DNormalizationProps.txt') {
9520 push @return, split /\n/, <<'END';
9527 if (-e 'Scripts.txt') {
9528 push @return, split /\n/, <<'END';
9530 sc ; Armn ; Armenian
9532 sc ; Bopo ; Bopomofo
9533 sc ; Cans ; Canadian_Aboriginal
9534 sc ; Cher ; Cherokee
9535 sc ; Cyrl ; Cyrillic
9536 sc ; Deva ; Devanagari
9538 sc ; Ethi ; Ethiopic
9539 sc ; Geor ; Georgian
9542 sc ; Gujr ; Gujarati
9543 sc ; Guru ; Gurmukhi
9547 sc ; Hira ; Hiragana
9548 sc ; Ital ; Old_Italic
9549 sc ; Kana ; Katakana
9554 sc ; Mlym ; Malayalam
9555 sc ; Mong ; Mongolian
9559 sc ; Qaai ; Inherited
9573 if ($v_version ge v2.0.0) {
9574 push @return, split /\n/, <<'END';
9578 dt ; vert ; vertical
9583 gc ; Lt ; Titlecase_Letter
9584 gc ; Me ; Enclosing_Mark
9585 gc ; Nl ; Letter_Number
9586 gc ; Pc ; Connector_Punctuation
9587 gc ; Sk ; Modifier_Symbol
9590 if ($v_version ge v2.1.2) {
9591 push @return, "bc ; S ; Segment_Separator\n";
9593 if ($v_version ge v2.1.5) {
9594 push @return, split /\n/, <<'END';
9595 gc ; Pf ; Final_Punctuation
9596 gc ; Pi ; Initial_Punctuation
9599 if ($v_version ge v2.1.8) {
9600 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9603 if ($v_version ge v3.0.0) {
9604 push @return, split /\n/, <<'END';
9605 bc ; AL ; Arabic_Letter
9606 bc ; BN ; Boundary_Neutral
9607 bc ; LRE ; Left_To_Right_Embedding
9608 bc ; LRO ; Left_To_Right_Override
9609 bc ; NSM ; Nonspacing_Mark
9610 bc ; PDF ; Pop_Directional_Format
9611 bc ; RLE ; Right_To_Left_Embedding
9612 bc ; RLO ; Right_To_Left_Override
9614 ccc; 233; DB ; Double_Below
9618 if ($v_version ge v3.1.0) {
9619 push @return, "ccc; 226; R ; Right\n";
9625 sub process_NormalizationsTest {
9627 # Each line looks like:
9628 # source code point; NFC; NFD; NFKC; NFKD
9630 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
9633 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9635 # Process each line of the file ...
9636 while ($file->next_line) {
9640 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
9642 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
9643 $$var = pack "U0U*", map { hex } split " ", $$var;
9644 $$var =~ s/(\\)/$1$1/g;
9647 push @normalization_tests,
9648 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
9649 } # End of looping through the file
9652 sub output_perl_charnames_line ($$) {
9654 # Output the entries in Perl_charnames specially, using 5 digits instead
9655 # of four. This makes the entries a constant length, and simplifies
9656 # charnames.pm which this table is for. Unicode can have 6 digit
9657 # ordinals, but they are all private use or noncharacters which do not
9658 # have names, so won't be in this table.
9660 return sprintf "%05X\t%s\n", $_[0], $_[1];
9664 # This is used to store the range list of all the code points usable when
9665 # the little used $compare_versions feature is enabled.
9666 my $compare_versions_range_list;
9668 # These are constants to the $property_info hash in this subroutine, to
9669 # avoid using a quoted-string which might have a typo.
9671 my $DEFAULT_MAP = 'default_map';
9672 my $DEFAULT_TABLE = 'default_table';
9673 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9674 my $MISSINGS = 'missings';
9676 sub process_generic_property_file {
9677 # This processes a file containing property mappings and puts them
9678 # into internal map tables. It should be used to handle any property
9679 # files that have mappings from a code point or range thereof to
9680 # something else. This means almost all the UCD .txt files.
9681 # each_line_handlers() should be set to adjust the lines of these
9682 # files, if necessary, to what this routine understands:
9687 # the fields are: "codepoint-range ; property; map"
9689 # meaning the codepoints in the range all have the value 'map' under
9691 # Beginning and trailing white space in each field are not significant.
9692 # Note there is not a trailing semi-colon in the above. A trailing
9693 # semi-colon means the map is a null-string. An omitted map, as
9694 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9695 # table syntax. (This could have been hidden from this routine by
9696 # doing it in the $file object, but that would require parsing of the
9697 # line there, so would have to parse it twice, or change the interface
9698 # to pass this an array. So not done.)
9700 # The map field may begin with a sequence of commands that apply to
9701 # this range. Each such command begins and ends with $CMD_DELIM.
9702 # These are used to indicate, for example, that the mapping for a
9703 # range has a non-default type.
9705 # This loops through the file, calling it's next_line() method, and
9706 # then taking the map and adding it to the property's table.
9707 # Complications arise because any number of properties can be in the
9708 # file, in any order, interspersed in any way. The first time a
9709 # property is seen, it gets information about that property and
9710 # caches it for quick retrieval later. It also normalizes the maps
9711 # so that only one of many synonyms is stored. The Unicode input
9712 # files do use some multiple synonyms.
9715 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9717 my %property_info; # To keep track of what properties
9718 # have already had entries in the
9719 # current file, and info about each,
9720 # so don't have to recompute.
9721 my $property_name; # property currently being worked on
9722 my $property_type; # and its type
9723 my $previous_property_name = ""; # name from last time through loop
9724 my $property_object; # pointer to the current property's
9726 my $property_addr; # the address of that object
9727 my $default_map; # the string that code points missing
9728 # from the file map to
9729 my $default_table; # For non-string properties, a
9730 # reference to the match table that
9731 # will contain the list of code
9732 # points that map to $default_map.
9734 # Get the next real non-comment line
9736 while ($file->next_line) {
9738 # Default replacement type; means that if parts of the range have
9739 # already been stored in our tables, the new map overrides them if
9740 # they differ more than cosmetically
9741 my $replace = $IF_NOT_EQUIVALENT;
9742 my $map_type; # Default type for the map of this range
9744 #local $to_trace = 1 if main::DEBUG;
9745 trace $_ if main::DEBUG && $to_trace;
9747 # Split the line into components
9748 my ($range, $property_name, $map, @remainder)
9749 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9751 # If more or less on the line than we are expecting, warn and skip
9754 $file->carp_bad_line('Extra fields');
9757 elsif ( ! defined $property_name) {
9758 $file->carp_bad_line('Missing property');
9762 # Examine the range.
9763 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9765 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9769 my $high = (defined $2) ? hex $2 : $low;
9771 # For the very specialized case of comparing two Unicode
9773 if (DEBUG && $compare_versions) {
9774 if ($property_name eq 'Age') {
9776 # Only allow code points at least as old as the version
9778 my $age = pack "C*", split(/\./, $map); # v string
9779 next LINE if $age gt $compare_versions;
9783 # Again, we throw out code points younger than those of
9784 # the specified version. By now, the Age property is
9785 # populated. We use the intersection of each input range
9786 # with this property to find what code points in it are
9787 # valid. To do the intersection, we have to convert the
9788 # Age property map to a Range_list. We only have to do
9790 if (! defined $compare_versions_range_list) {
9791 my $age = property_ref('Age');
9792 if (! -e 'DAge.txt') {
9793 croak "Need to have 'DAge.txt' file to do version comparison";
9795 elsif ($age->count == 0) {
9796 croak "The 'Age' table is empty, but its file exists";
9798 $compare_versions_range_list
9799 = Range_List->new(Initialize => $age);
9802 # An undefined map is always 'Y'
9803 $map = 'Y' if ! defined $map;
9805 # Calculate the intersection of the input range with the
9806 # code points that are known in the specified version
9807 my @ranges = ($compare_versions_range_list
9808 & Range->new($low, $high))->ranges;
9810 # If the intersection is empty, throw away this range
9811 next LINE unless @ranges;
9813 # Only examine the first range this time through the loop.
9814 my $this_range = shift @ranges;
9816 # Put any remaining ranges in the queue to be processed
9817 # later. Note that there is unnecessary work here, as we
9818 # will do the intersection again for each of these ranges
9819 # during some future iteration of the LINE loop, but this
9820 # code is not used in production. The later intersections
9821 # are guaranteed to not splinter, so this will not become
9823 my $line = join ';', $property_name, $map;
9824 foreach my $range (@ranges) {
9825 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9831 # And process the first range, like any other.
9832 $low = $this_range->start;
9833 $high = $this_range->end;
9835 } # End of $compare_versions
9837 # If changing to a new property, get the things constant per
9839 if ($previous_property_name ne $property_name) {
9841 $property_object = property_ref($property_name);
9842 if (! defined $property_object) {
9843 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9846 { no overloading; $property_addr = pack 'J', $property_object; }
9848 # Defer changing names until have a line that is acceptable
9849 # (the 'next' statement above means is unacceptable)
9850 $previous_property_name = $property_name;
9852 # If not the first time for this property, retrieve info about
9854 if (defined ($property_info{$property_addr}{$TYPE})) {
9855 $property_type = $property_info{$property_addr}{$TYPE};
9856 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
9858 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
9860 = $property_info{$property_addr}{$DEFAULT_TABLE};
9864 # Here, is the first time for this property. Set up the
9866 $property_type = $property_info{$property_addr}{$TYPE}
9867 = $property_object->type;
9869 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
9870 = $property_object->pseudo_map_type;
9872 # The Unicode files are set up so that if the map is not
9873 # defined, it is a binary property
9874 if (! defined $map && $property_type != $BINARY) {
9875 if ($property_type != $UNKNOWN
9876 && $property_type != $NON_STRING)
9878 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9881 $property_object->set_type($BINARY);
9883 = $property_info{$property_addr}{$TYPE}
9888 # Get any @missings default for this property. This
9889 # should precede the first entry for the property in the
9890 # input file, and is located in a comment that has been
9891 # stored by the Input_file class until we access it here.
9892 # It's possible that there is more than one such line
9893 # waiting for us; collect them all, and parse
9894 my @missings_list = $file->get_missings
9895 if $file->has_missings_defaults;
9896 foreach my $default_ref (@missings_list) {
9897 my $default = $default_ref->[0];
9898 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9900 # For string properties, the default is just what the
9901 # file says, but non-string properties should already
9902 # have set up a table for the default property value;
9903 # use the table for these, so can resolve synonyms
9904 # later to a single standard one.
9905 if ($property_type == $STRING
9906 || $property_type == $UNKNOWN)
9908 $property_info{$addr}{$MISSINGS} = $default;
9911 $property_info{$addr}{$MISSINGS}
9912 = $property_object->table($default);
9916 # Finished storing all the @missings defaults in the input
9917 # file so far. Get the one for the current property.
9918 my $missings = $property_info{$property_addr}{$MISSINGS};
9920 # But we likely have separately stored what the default
9921 # should be. (This is to accommodate versions of the
9922 # standard where the @missings lines are absent or
9923 # incomplete.) Hopefully the two will match. But check
9925 $default_map = $property_object->default_map;
9927 # If the map is a ref, it means that the default won't be
9928 # processed until later, so undef it, so next few lines
9929 # will redefine it to something that nothing will match
9930 undef $default_map if ref $default_map;
9932 # Create a $default_map if don't have one; maybe a dummy
9933 # that won't match anything.
9934 if (! defined $default_map) {
9936 # Use any @missings line in the file.
9937 if (defined $missings) {
9938 if (ref $missings) {
9939 $default_map = $missings->full_name;
9940 $default_table = $missings;
9943 $default_map = $missings;
9946 # And store it with the property for outside use.
9947 $property_object->set_default_map($default_map);
9951 # Neither an @missings nor a default map. Create
9952 # a dummy one, so won't have to test definedness
9954 $default_map = '_Perl This will never be in a file
9959 # Here, we have $default_map defined, possibly in terms of
9960 # $missings, but maybe not, and possibly is a dummy one.
9961 if (defined $missings) {
9963 # Make sure there is no conflict between the two.
9964 # $missings has priority.
9965 if (ref $missings) {
9967 = $property_object->table($default_map);
9968 if (! defined $default_table
9969 || $default_table != $missings)
9971 if (! defined $default_table) {
9972 $default_table = $UNDEF;
9974 $file->carp_bad_line(<<END
9975 The \@missings line for $property_name in $file says that missings default to
9976 $missings, but we expect it to be $default_table. $missings used.
9979 $default_table = $missings;
9980 $default_map = $missings->full_name;
9982 $property_info{$property_addr}{$DEFAULT_TABLE}
9985 elsif ($default_map ne $missings) {
9986 $file->carp_bad_line(<<END
9987 The \@missings line for $property_name in $file says that missings default to
9988 $missings, but we expect it to be $default_map. $missings used.
9991 $default_map = $missings;
9995 $property_info{$property_addr}{$DEFAULT_MAP}
9998 # If haven't done so already, find the table corresponding
9999 # to this map for non-string properties.
10000 if (! defined $default_table
10001 && $property_type != $STRING
10002 && $property_type != $UNKNOWN)
10004 $default_table = $property_info{$property_addr}
10006 = $property_object->table($default_map);
10008 } # End of is first time for this property
10009 } # End of switching properties.
10011 # Ready to process the line.
10012 # The Unicode files are set up so that if the map is not defined,
10013 # it is a binary property with value 'Y'
10014 if (! defined $map) {
10019 # If the map begins with a special command to us (enclosed in
10020 # delimiters), extract the command(s).
10021 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10023 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
10026 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
10030 $file->carp_bad_line("Unknown command line: '$1'");
10036 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10039 # Here, we have a map to a particular code point, and the
10040 # default map is to a code point itself. If the range
10041 # includes the particular code point, change that portion of
10042 # the range to the default. This makes sure that in the final
10043 # table only the non-defaults are listed.
10044 my $decimal_map = hex $map;
10045 if ($low <= $decimal_map && $decimal_map <= $high) {
10047 # If the range includes stuff before or after the map
10048 # we're changing, split it and process the split-off parts
10050 if ($low < $decimal_map) {
10051 $file->insert_adjusted_lines(
10052 sprintf("%04X..%04X; %s; %s",
10058 if ($high > $decimal_map) {
10059 $file->insert_adjusted_lines(
10060 sprintf("%04X..%04X; %s; %s",
10066 $low = $high = $decimal_map;
10067 $map = $CODE_POINT;
10071 # If we can tell that this is a synonym for the default map, use
10072 # the default one instead.
10073 if ($property_type != $STRING
10074 && $property_type != $UNKNOWN)
10076 my $table = $property_object->table($map);
10077 if (defined $table && $table == $default_table) {
10078 $map = $default_map;
10082 # And figure out the map type if not known.
10083 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10084 if ($map eq "") { # Nulls are always $NULL map type
10086 } # Otherwise, non-strings, and those that don't allow
10087 # $MULTI_CP, and those that aren't multiple code points are
10090 (($property_type != $STRING && $property_type != $UNKNOWN)
10091 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10092 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
10097 $map_type = $MULTI_CP;
10101 $property_object->add_map($low, $high,
10104 Replace => $replace);
10105 } # End of loop through file's lines
10111 { # Closure for UnicodeData.txt handling
10113 # This file was the first one in the UCD; its design leads to some
10114 # awkwardness in processing. Here is a sample line:
10115 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10116 # The fields in order are:
10117 my $i = 0; # The code point is in field 0, and is shifted off.
10118 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
10119 my $CATEGORY = $i++; # category (e.g. "Lu")
10120 my $CCC = $i++; # Canonical combining class (e.g. "230")
10121 my $BIDI = $i++; # directional class (e.g. "L")
10122 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
10123 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
10124 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10125 # Dual-use in this program; see below
10126 my $NUMERIC = $i++; # numeric value
10127 my $MIRRORED = $i++; # ? mirrored
10128 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10129 my $COMMENT = $i++; # iso comment
10130 my $UPPER = $i++; # simple uppercase mapping
10131 my $LOWER = $i++; # simple lowercase mapping
10132 my $TITLE = $i++; # simple titlecase mapping
10133 my $input_field_count = $i;
10135 # This routine in addition outputs these extra fields:
10137 my $DECOMP_TYPE = $i++; # Decomposition type
10139 # These fields are modifications of ones above, and are usually
10140 # suppressed; they must come last, as for speed, the loop upper bound is
10141 # normally set to ignore them
10142 my $NAME = $i++; # This is the strict name field, not the one that
10144 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
10145 # by Unicode::Normalize
10146 my $last_field = $i - 1;
10148 # All these are read into an array for each line, with the indices defined
10149 # above. The empty fields in the example line above indicate that the
10150 # value is defaulted. The handler called for each line of the input
10151 # changes these to their defaults.
10153 # Here are the official names of the properties, in a parallel array:
10155 $field_names[$BIDI] = 'Bidi_Class';
10156 $field_names[$CATEGORY] = 'General_Category';
10157 $field_names[$CCC] = 'Canonical_Combining_Class';
10158 $field_names[$CHARNAME] = 'Perl_Charnames';
10159 $field_names[$COMMENT] = 'ISO_Comment';
10160 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10161 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10162 $field_names[$LOWER] = 'Lowercase_Mapping';
10163 $field_names[$MIRRORED] = 'Bidi_Mirrored';
10164 $field_names[$NAME] = 'Name';
10165 $field_names[$NUMERIC] = 'Numeric_Value';
10166 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10167 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10168 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10169 $field_names[$TITLE] = 'Titlecase_Mapping';
10170 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10171 $field_names[$UPPER] = 'Uppercase_Mapping';
10173 # Some of these need a little more explanation:
10174 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10175 # property, but is used in calculating the Numeric_Type. Perl however,
10176 # creates a file from this field, so a Perl property is created from it.
10177 # Similarly, the Other_Digit field is used only for calculating the
10178 # Numeric_Type, and so it can be safely re-used as the place to store
10179 # the value for Numeric_Type; hence it is referred to as
10180 # $NUMERIC_TYPE_OTHER_DIGIT.
10181 # The input field named $PERL_DECOMPOSITION is a combination of both the
10182 # decomposition mapping and its type. Perl creates a file containing
10183 # exactly this field, so it is used for that. The two properties are
10184 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10185 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
10186 # output it), as Perl doesn't use it directly.
10187 # The input field named here $CHARNAME is used to construct the
10188 # Perl_Charnames property, which is a combination of the Name property
10189 # (which the input field contains), and the Unicode_1_Name property, and
10190 # others from other files. Since, the strict Name property is not used
10191 # by Perl, this field is used for the table that Perl does use. The
10192 # strict Name property table is usually suppressed (unless the lists are
10193 # changed to output it), so it is accumulated in a separate field,
10194 # $NAME, which to save time is discarded unless the table is actually to
10197 # This file is processed like most in this program. Control is passed to
10198 # process_generic_property_file() which calls filter_UnicodeData_line()
10199 # for each input line. This filter converts the input into line(s) that
10200 # process_generic_property_file() understands. There is also a setup
10201 # routine called before any of the file is processed, and a handler for
10202 # EOF processing, all in this closure.
10204 # A huge speed-up occurred at the cost of some added complexity when these
10205 # routines were altered to buffer the outputs into ranges. Almost all the
10206 # lines of the input file apply to just one code point, and for most
10207 # properties, the map for the next code point up is the same as the
10208 # current one. So instead of creating a line for each property for each
10209 # input line, filter_UnicodeData_line() remembers what the previous map
10210 # of a property was, and doesn't generate a line to pass on until it has
10211 # to, as when the map changes; and that passed-on line encompasses the
10212 # whole contiguous range of code points that have the same map for that
10213 # property. This means a slight amount of extra setup, and having to
10214 # flush these buffers on EOF, testing if the maps have changed, plus
10215 # remembering state information in the closure. But it means a lot less
10216 # real time in not having to change the data base for each property on
10219 # Another complication is that there are already a few ranges designated
10220 # in the input. There are two lines for each, with the same maps except
10221 # the code point and name on each line. This was actually the hardest
10222 # thing to design around. The code points in those ranges may actually
10223 # have real maps not given by these two lines. These maps will either
10224 # be algorithmically determinable, or be in the extracted files furnished
10225 # with the UCD. In the event of conflicts between these extracted files,
10226 # and this one, Unicode says that this one prevails. But it shouldn't
10227 # prevail for conflicts that occur in these ranges. The data from the
10228 # extracted files prevails in those cases. So, this program is structured
10229 # so that those files are processed first, storing maps. Then the other
10230 # files are processed, generally overwriting what the extracted files
10231 # stored. But just the range lines in this input file are processed
10232 # without overwriting. This is accomplished by adding a special string to
10233 # the lines output to tell process_generic_property_file() to turn off the
10234 # overwriting for just this one line.
10235 # A similar mechanism is used to tell it that the map is of a non-default
10238 sub setup_UnicodeData { # Called before any lines of the input are read
10240 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10242 # Create a new property specially located that is a combination of the
10243 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10244 # Name_Alias properties. (The final duplicates elements of the
10245 # first.) A comment for it will later be constructed based on the
10246 # actual properties present and used
10247 $perl_charname = Property->new('Perl_Charnames',
10249 Directory => File::Spec->curdir(),
10251 Fate => $INTERNAL_ONLY,
10252 Perl_Extension => 1,
10253 Range_Size_1 => \&output_perl_charnames_line,
10256 $perl_charname->set_proxy_for('Name');
10258 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10259 Directory => File::Spec->curdir(),
10260 File => 'Decomposition',
10261 Format => $DECOMP_STRING_FORMAT,
10262 Fate => $INTERNAL_ONLY,
10263 Perl_Extension => 1,
10264 Default_Map => $CODE_POINT,
10266 # normalize.pm can't cope with these
10267 Output_Range_Counts => 0,
10269 # This is a specially formatted table
10270 # explicitly for normalize.pm, which
10271 # is expecting a particular format,
10272 # which means that mappings containing
10273 # multiple code points are in the main
10274 # body of the table
10275 Map_Type => $COMPUTE_NO_MULTI_CP,
10277 To_Output_Map => $INTERNAL_MAP,
10279 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10280 $Perl_decomp->add_comment(join_lines(<<END
10281 This mapping is a combination of the Unicode 'Decomposition_Type' and
10282 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
10283 identical to the official Unicode 'Decomposition_Mapping' property except for
10285 1) It omits the algorithmically determinable Hangul syllable decompositions,
10286 which normalize.pm handles algorithmically.
10287 2) It contains the decomposition type as well. Non-canonical decompositions
10288 begin with a word in angle brackets, like <super>, which denotes the
10289 compatible decomposition type. If the map does not begin with the <angle
10290 brackets>, the decomposition is canonical.
10294 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10296 Perl_Extension => 1,
10297 Directory => $map_directory,
10299 To_Output_Map => $OUTPUT_ADJUSTED,
10301 $Decimal_Digit->add_comment(join_lines(<<END
10302 This file gives the mapping of all code points which represent a single
10303 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10304 points, and the mapping of each non-initial element of each range is actually
10305 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10306 These code points are those that have Numeric_Type=Decimal; not special
10307 things, like subscripts nor Roman numerals.
10311 # These properties are not used for generating anything else, and are
10312 # usually not output. By making them last in the list, we can just
10313 # change the high end of the loop downwards to avoid the work of
10314 # generating a table(s) that is/are just going to get thrown away.
10315 if (! property_ref('Decomposition_Mapping')->to_output_map
10316 && ! property_ref('Name')->to_output_map)
10318 $last_field = min($NAME, $DECOMP_MAP) - 1;
10319 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10320 $last_field = $DECOMP_MAP;
10321 } elsif (property_ref('Name')->to_output_map) {
10322 $last_field = $NAME;
10327 my $first_time = 1; # ? Is this the first line of the file
10328 my $in_range = 0; # ? Are we in one of the file's ranges
10329 my $previous_cp; # hex code point of previous line
10330 my $decimal_previous_cp = -1; # And its decimal equivalent
10331 my @start; # For each field, the current starting
10332 # code point in hex for the range
10333 # being accumulated.
10334 my @fields; # The input fields;
10335 my @previous_fields; # And those from the previous call
10337 sub filter_UnicodeData_line {
10338 # Handle a single input line from UnicodeData.txt; see comments above
10339 # Conceptually this takes a single line from the file containing N
10340 # properties, and converts it into N lines with one property per line,
10341 # which is what the final handler expects. But there are
10342 # complications due to the quirkiness of the input file, and to save
10343 # time, it accumulates ranges where the property values don't change
10344 # and only emits lines when necessary. This is about an order of
10345 # magnitude fewer lines emitted.
10348 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10350 # $_ contains the input line.
10351 # -1 in split means retain trailing null fields
10352 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10354 #local $to_trace = 1 if main::DEBUG;
10355 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10356 if (@fields > $input_field_count) {
10357 $file->carp_bad_line('Extra fields');
10362 my $decimal_cp = hex $cp;
10364 # We have to output all the buffered ranges when the next code point
10365 # is not exactly one after the previous one, which means there is a
10366 # gap in the ranges.
10367 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10369 # The decomposition mapping field requires special handling. It looks
10372 # <compat> 0032 0020
10375 # The decomposition type is enclosed in <brackets>; if missing, it
10376 # means the type is canonical. There are two decomposition mapping
10377 # tables: the one for use by Perl's normalize.pm has a special format
10378 # which is this field intact; the other, for general use is of
10379 # standard format. In either case we have to find the decomposition
10380 # type. Empty fields have None as their type, and map to the code
10382 if ($fields[$PERL_DECOMPOSITION] eq "") {
10383 $fields[$DECOMP_TYPE] = 'None';
10384 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10387 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10388 =~ / < ( .+? ) > \s* ( .+ ) /x;
10389 if (! defined $fields[$DECOMP_TYPE]) {
10390 $fields[$DECOMP_TYPE] = 'Canonical';
10391 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10394 $fields[$DECOMP_MAP] = $map;
10398 # The 3 numeric fields also require special handling. The 2 digit
10399 # fields must be either empty or match the number field. This means
10400 # that if it is empty, they must be as well, and the numeric type is
10401 # None, and the numeric value is 'Nan'.
10402 # The decimal digit field must be empty or match the other digit
10403 # field. If the decimal digit field is non-empty, the code point is
10404 # a decimal digit, and the other two fields will have the same value.
10405 # If it is empty, but the other digit field is non-empty, the code
10406 # point is an 'other digit', and the number field will have the same
10407 # value as the other digit field. If the other digit field is empty,
10408 # but the number field is non-empty, the code point is a generic
10410 if ($fields[$NUMERIC] eq "") {
10411 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10412 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10414 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10416 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10417 $fields[$NUMERIC] = 'NaN';
10420 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10421 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10422 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10423 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
10424 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10426 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10427 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10428 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10431 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10433 # Rationals require extra effort.
10434 register_fraction($fields[$NUMERIC])
10435 if $fields[$NUMERIC] =~ qr{/};
10439 # For the properties that have empty fields in the file, and which
10440 # mean something different from empty, change them to that default.
10441 # Certain fields just haven't been empty so far in any Unicode
10442 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10443 # $CATEGORY. This leaves just the two fields, and so we hard-code in
10444 # the defaults; which are very unlikely to ever change.
10445 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10446 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10448 # UAX44 says that if title is empty, it is the same as whatever upper
10450 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10452 # There are a few pairs of lines like:
10453 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10454 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10455 # that define ranges. These should be processed after the fields are
10456 # adjusted above, as they may override some of them; but mostly what
10457 # is left is to possibly adjust the $CHARNAME field. The names of all the
10458 # paired lines start with a '<', but this is also true of '<control>,
10459 # which isn't one of these special ones.
10460 if ($fields[$CHARNAME] eq '<control>') {
10462 # Some code points in this file have the pseudo-name
10463 # '<control>', but the official name for such ones is the null
10465 $fields[$NAME] = $fields[$CHARNAME] = "";
10467 # We had better not be in between range lines.
10469 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10473 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10475 # Here is a non-range line. We had better not be in between range
10478 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10481 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10483 # These are code points whose names end in their code points,
10484 # which means the names are algorithmically derivable from the
10485 # code points. To shorten the output Name file, the algorithm
10486 # for deriving these is placed in the file instead of each
10487 # code point, so they have map type $CP_IN_NAME
10488 $fields[$CHARNAME] = $CMD_DELIM
10493 . $fields[$CHARNAME];
10495 $fields[$NAME] = $fields[$CHARNAME];
10497 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10498 $fields[$CHARNAME] = $fields[$NAME] = $1;
10500 # Here we are at the beginning of a range pair.
10502 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
10506 # Because the properties in the range do not overwrite any already
10507 # in the db, we must flush the buffers of what's already there, so
10508 # they get handled in the normal scheme.
10512 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10513 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
10517 else { # Here, we are at the last line of a range pair.
10520 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
10526 $fields[$NAME] = $fields[$CHARNAME];
10528 # Check that the input is valid: that the closing of the range is
10529 # the same as the beginning.
10530 foreach my $i (0 .. $last_field) {
10531 next if $fields[$i] eq $previous_fields[$i];
10532 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10535 # The processing differs depending on the type of range,
10536 # determined by its $CHARNAME
10537 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10539 # Check that the data looks right.
10540 if ($decimal_previous_cp != $SBase) {
10541 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10543 if ($decimal_cp != $SBase + $SCount - 1) {
10544 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10547 # The Hangul syllable range has a somewhat complicated name
10548 # generation algorithm. Each code point in it has a canonical
10549 # decomposition also computable by an algorithm. The
10550 # perl decomposition map table built from these is used only
10551 # by normalize.pm, which has the algorithm built in it, so the
10552 # decomposition maps are not needed, and are large, so are
10553 # omitted from it. If the full decomposition map table is to
10554 # be output, the decompositions are generated for it, in the
10555 # EOF handling code for this input file.
10557 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10559 # This range is stored in our internal structure with its
10560 # own map type, different from all others.
10561 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10567 . $fields[$CHARNAME];
10569 elsif ($fields[$CHARNAME] =~ /^CJK/) {
10571 # The name for these contains the code point itself, and all
10572 # are defined to have the same base name, regardless of what
10573 # is in the file. They are stored in our internal structure
10574 # with a map type of $CP_IN_NAME
10575 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10581 . 'CJK UNIFIED IDEOGRAPH';
10584 elsif ($fields[$CATEGORY] eq 'Co'
10585 || $fields[$CATEGORY] eq 'Cs')
10587 # The names of all the code points in these ranges are set to
10588 # null, as there are no names for the private use and
10589 # surrogate code points.
10591 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10594 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
10597 # The first line of the range caused everything else to be output,
10598 # and then its values were stored as the beginning values for the
10599 # next set of ranges, which this one ends. Now, for each value,
10600 # add a command to tell the handler that these values should not
10601 # replace any existing ones in our database.
10602 foreach my $i (0 .. $last_field) {
10603 $previous_fields[$i] = $CMD_DELIM
10608 . $previous_fields[$i];
10611 # And change things so it looks like the entire range has been
10612 # gone through with this being the final part of it. Adding the
10613 # command above to each field will cause this range to be flushed
10614 # during the next iteration, as it guaranteed that the stored
10615 # field won't match whatever value the next one has.
10616 $previous_cp = $cp;
10617 $decimal_previous_cp = $decimal_cp;
10619 # We are now set up for the next iteration; so skip the remaining
10620 # code in this subroutine that does the same thing, but doesn't
10621 # know about these ranges.
10627 # On the very first line, we fake it so the code below thinks there is
10628 # nothing to output, and initialize so that when it does get output it
10629 # uses the first line's values for the lowest part of the range.
10630 # (One could avoid this by using peek(), but then one would need to
10631 # know the adjustments done above and do the same ones in the setup
10632 # routine; not worth it)
10635 @previous_fields = @fields;
10636 @start = ($cp) x scalar @fields;
10637 $decimal_previous_cp = $decimal_cp - 1;
10640 # For each field, output the stored up ranges that this code point
10641 # doesn't fit in. Earlier we figured out if all ranges should be
10642 # terminated because of changing the replace or map type styles, or if
10643 # there is a gap between this new code point and the previous one, and
10644 # that is stored in $force_output. But even if those aren't true, we
10645 # need to output the range if this new code point's value for the
10646 # given property doesn't match the stored range's.
10647 #local $to_trace = 1 if main::DEBUG;
10648 foreach my $i (0 .. $last_field) {
10649 my $field = $fields[$i];
10650 if ($force_output || $field ne $previous_fields[$i]) {
10652 # Flush the buffer of stored values.
10653 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10655 # Start a new range with this code point and its value
10657 $previous_fields[$i] = $field;
10661 # Set the values for the next time.
10662 $previous_cp = $cp;
10663 $decimal_previous_cp = $decimal_cp;
10665 # The input line has generated whatever adjusted lines are needed, and
10666 # should not be looked at further.
10671 sub EOF_UnicodeData {
10672 # Called upon EOF to flush the buffers, and create the Hangul
10673 # decomposition mappings if needed.
10676 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10678 # Flush the buffers.
10679 foreach my $i (0 .. $last_field) {
10680 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10683 if (-e 'Jamo.txt') {
10685 # The algorithm is published by Unicode, based on values in
10686 # Jamo.txt, (which should have been processed before this
10687 # subroutine), and the results left in %Jamo
10689 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10693 # If the full decomposition map table is being output, insert
10694 # into it the Hangul syllable mappings. This is to avoid having
10695 # to publish a subroutine in it to compute them. (which would
10696 # essentially be this code.) This uses the algorithm published by
10698 if (property_ref('Decomposition_Mapping')->to_output_map) {
10699 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10701 my $SIndex = $S - $SBase;
10702 my $L = $LBase + $SIndex / $NCount;
10703 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10704 my $T = $TBase + $SIndex % $TCount;
10706 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10707 my $decomposition = sprintf("%04X %04X", $L, $V);
10708 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10709 $file->insert_adjusted_lines(
10710 sprintf("%04X; Decomposition_Mapping; %s",
10720 sub filter_v1_ucd {
10721 # Fix UCD lines in version 1. This is probably overkill, but this
10722 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10723 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10724 # removed. This program retains them
10725 # 2) didn't include ranges, which it should have, and which are now
10726 # added in @corrected_lines below. It was hand populated by
10727 # taking the data from Version 2, verified by analyzing
10729 # 3) There is a syntax error in the entry for U+09F8 which could
10730 # cause problems for utf8_heavy, and so is changed. It's
10731 # numeric value was simply a minus sign, without any number.
10732 # (Eventually Unicode changed the code point to non-numeric.)
10733 # 4) The decomposition types often don't match later versions
10734 # exactly, and the whole syntax of that field is different; so
10735 # the syntax is changed as well as the types to their later
10736 # terminology. Otherwise normalize.pm would be very unhappy
10737 # 5) Many ccc classes are different. These are left intact.
10738 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10739 # fields. These are unchanged because it doesn't really cause
10740 # problems for Perl.
10741 # 7) A number of code points, such as controls, don't have their
10742 # Unicode Version 1 Names in this file. These are unchanged.
10744 my @corrected_lines = split /\n/, <<'END';
10745 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10746 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10747 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10748 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10749 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10750 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10754 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10756 #local $to_trace = 1 if main::DEBUG;
10757 trace $_ if main::DEBUG && $to_trace;
10759 # -1 => retain trailing null fields
10760 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10762 # At the first place that is wrong in the input, insert all the
10763 # corrections, replacing the wrong line.
10764 if ($code_point eq '4E00') {
10765 my @copy = @corrected_lines;
10767 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10769 $file->insert_lines(@copy);
10773 if ($fields[$NUMERIC] eq '-') {
10774 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10777 if ($fields[$PERL_DECOMPOSITION] ne "") {
10779 # Several entries have this change to superscript 2 or 3 in the
10780 # middle. Convert these to the modern version, which is to use
10781 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10782 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10783 # 'HHHH HHHH 00B3 HHHH'.
10784 # It turns out that all of these that don't have another
10785 # decomposition defined at the beginning of the line have the
10786 # <square> decomposition in later releases.
10787 if ($code_point ne '00B2' && $code_point ne '00B3') {
10788 if ($fields[$PERL_DECOMPOSITION]
10789 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10791 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10792 $fields[$PERL_DECOMPOSITION] = '<square> '
10793 . $fields[$PERL_DECOMPOSITION];
10798 # If is like '<+circled> 0052 <-circled>', convert to
10800 $fields[$PERL_DECOMPOSITION] =~
10801 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10803 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10804 $fields[$PERL_DECOMPOSITION] =~
10805 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10806 or $fields[$PERL_DECOMPOSITION] =~
10807 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10808 or $fields[$PERL_DECOMPOSITION] =~
10809 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10810 or $fields[$PERL_DECOMPOSITION] =~
10811 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10813 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10814 $fields[$PERL_DECOMPOSITION] =~
10815 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10817 # Change names to modern form.
10818 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10819 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10820 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10821 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10823 # One entry has weird braces
10824 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10827 $_ = join ';', $code_point, @fields;
10828 trace $_ if main::DEBUG && $to_trace;
10832 sub filter_v2_1_5_ucd {
10833 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10834 # columns swapped; These all had mirrored be 'N'. So if the numeric
10835 # column appears to be N, swap it back.
10837 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10838 if ($fields[$NUMERIC] eq 'N') {
10839 $fields[$NUMERIC] = $fields[$MIRRORED];
10840 $fields[$MIRRORED] = 'N';
10841 $_ = join ';', $code_point, @fields;
10846 sub filter_v6_ucd {
10848 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10849 # accepted that yet to allow for some deprecation cycles.
10851 return if $_ !~ /^(?:0007|1F514|070F);/;
10853 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10854 if ($code_point eq '0007') {
10855 $fields[$CHARNAME] = "";
10857 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10858 # http://www.unicode.org/versions/corrigendum8.html
10859 $fields[$BIDI] = "AL";
10861 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
10862 $fields[$CHARNAME] = "";
10865 $_ = join ';', $code_point, @fields;
10869 } # End closure for UnicodeData
10871 sub process_GCB_test {
10874 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10876 while ($file->next_line) {
10877 push @backslash_X_tests, $_;
10883 sub process_NamedSequences {
10884 # NamedSequences.txt entries are just added to an array. Because these
10885 # don't look like the other tables, they have their own handler.
10887 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10889 # This just adds the sequence to an array for later handling
10892 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10894 while ($file->next_line) {
10895 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10897 $file->carp_bad_line(
10898 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10902 # Note single \t in keeping with special output format of
10903 # Perl_charnames. But it turns out that the code points don't have to
10904 # be 5 digits long, like the rest, based on the internal workings of
10905 # charnames.pm. This could be easily changed for consistency.
10906 push @named_sequences, "$sequence\t$name";
10915 sub filter_early_ea_lb {
10916 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10917 # third field be the name of the code point, which can be ignored in
10918 # most cases. But it can be meaningful if it marks a range:
10919 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10920 # 3400;W;<CJK Ideograph Extension A, First>
10922 # We need to see the First in the example above to know it's a range.
10923 # They did not use the later range syntaxes. This routine changes it
10924 # to use the modern syntax.
10925 # $1 is the Input_file object.
10927 my @fields = split /\s*;\s*/;
10928 if ($fields[2] =~ /^<.*, First>/) {
10929 $first_range = $fields[0];
10932 elsif ($fields[2] =~ /^<.*, Last>/) {
10933 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10936 undef $first_range;
10937 $_ = "$fields[0]; $fields[1]";
10944 sub filter_old_style_arabic_shaping {
10945 # Early versions used a different term for the later one.
10947 my @fields = split /\s*;\s*/;
10948 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10949 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10950 $_ = join ';', @fields;
10954 sub filter_arabic_shaping_line {
10955 # ArabicShaping.txt has entries that look like:
10956 # 062A; TEH; D; BEH
10957 # The field containing 'TEH' is not used. The next field is Joining_Type
10958 # and the last is Joining_Group
10959 # This generates two lines to pass on, one for each property on the input
10963 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10965 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10968 $file->carp_bad_line('Extra fields');
10973 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10974 $_ = "$fields[0]; Joining_Type; $fields[2]";
10980 my $lc; # Table for lowercase mapping
10984 sub setup_special_casing {
10985 # SpecialCasing.txt contains the non-simple case change mappings. The
10986 # simple ones are in UnicodeData.txt, which should already have been
10987 # read in to the full property data structures, so as to initialize
10988 # these with the simple ones. Then the SpecialCasing.txt entries
10989 # add or overwrite the ones which have different full mappings.
10991 # This routine sees if the simple mappings are to be output, and if
10992 # so, copies what has already been put into the full mapping tables,
10993 # while they still contain only the simple mappings.
10995 # The reason it is done this way is that the simple mappings are
10996 # probably not going to be output, so it saves work to initialize the
10997 # full tables with the simple mappings, and then overwrite those
10998 # relatively few entries in them that have different full mappings,
10999 # and thus skip the simple mapping tables altogether.
11002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11004 $lc = property_ref('lc');
11005 $tc = property_ref('tc');
11006 $uc = property_ref('uc');
11008 # For each of the case change mappings...
11009 foreach my $full_table ($lc, $tc, $uc) {
11010 my $full_name = $full_table->name;
11011 unless (defined $full_table && ! $full_table->is_empty) {
11012 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
11015 # Create a table in the old-style format and with the original
11016 # file name for backwards compatibility with applications that
11017 # read it directly. The new tables contain both the simple and
11018 # full maps, and the old are missing simple maps when there is a
11019 # conflicting full one. Probably it would have been ok to add
11020 # those to the legacy version, as was already done in 5.14 to the
11021 # case folding one, but this was not done, out of an abundance of
11022 # caution. The tables are set up here before we deal with the
11023 # full maps so that as we handle those, we can override the simple
11024 # maps for them in the legacy table, and merely add them in the
11026 my $legacy = Property->new("Legacy_" . $full_table->full_name,
11027 File => $full_table->full_name =~
11030 Format => $HEX_FORMAT,
11031 Default_Map => $CODE_POINT,
11033 Initialize => $full_table,
11034 To_Output_Map => $EXTERNAL_MAP,
11037 $full_table->add_comment(join_lines( <<END
11038 This file includes both the simple and full case changing maps. The simple
11039 ones are in the main body of the table below, and the full ones adding to or
11040 overriding them are in the hash.
11044 # The simple version's name in each mapping merely has an 's' in
11045 # front of the full one's
11046 my $simple_name = 's' . $full_name;
11047 my $simple = property_ref($simple_name);
11048 $simple->initialize($full_table) if $simple->to_output_map();
11050 unless ($simple->to_output_map()) {
11051 $full_table->set_proxy_for($simple_name);
11058 sub filter_special_casing_line {
11059 # Change the format of $_ from SpecialCasing.txt into something that
11060 # the generic handler understands. Each input line contains three
11061 # case mappings. This will generate three lines to pass to the
11062 # generic handler for each of those.
11064 # The input syntax (after stripping comments and trailing white space
11065 # is like one of the following (with the final two being entries that
11067 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11068 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11069 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11070 # Note the trailing semi-colon, unlike many of the input files. That
11071 # means that there will be an extra null field generated by the split
11074 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11076 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11079 # field #4 is when this mapping is conditional. If any of these get
11080 # implemented, it would be by hard-coding in the casing functions in
11081 # the Perl core, not through tables. But if there is a new condition
11082 # we don't know about, output a warning. We know about all the
11083 # conditions through 6.0
11084 if ($fields[4] ne "") {
11085 my @conditions = split ' ', $fields[4];
11086 if ($conditions[0] ne 'tr' # We know that these languages have
11087 # conditions, and some are multiple
11088 && $conditions[0] ne 'az'
11089 && $conditions[0] ne 'lt'
11091 # And, we know about a single condition Final_Sigma, but
11093 && ($v_version gt v5.2.0
11094 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11096 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11098 elsif ($conditions[0] ne 'Final_Sigma') {
11100 # Don't print out a message for Final_Sigma, because we
11101 # have hard-coded handling for it. (But the standard
11102 # could change what the rule should be, but it wouldn't
11103 # show up here anyway.
11105 print "# SKIPPING Special Casing: $_\n"
11106 if $verbosity >= $VERBOSE;
11111 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11112 $file->carp_bad_line('Extra fields');
11117 my $decimal_code_point = hex $fields[0];
11119 # Loop to handle each of the three mappings in the input line, in
11120 # order, with $i indicating the current field number.
11122 for my $object ($lc, $tc, $uc) {
11123 $i++; # First time through, $i = 0 ... 3rd time = 3
11125 my $value = $object->value_of($decimal_code_point);
11126 $value = ($value eq $CODE_POINT)
11127 ? $decimal_code_point
11130 # If this isn't a multi-character mapping, it should already have
11132 if ($fields[$i] !~ / /) {
11133 if ($value != hex $fields[$i]) {
11134 Carp::my_carp("Bad news. UnicodeData.txt thinks "
11136 . "(0x$fields[0]) is $value"
11137 . " and SpecialCasing.txt thinks it is "
11139 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
11144 # The mapping goes into both the legacy table, in which it
11145 # replaces the simple one...
11146 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11147 . $object->full_name
11148 . "; $fields[$i]");
11150 # ... and, the The regular table, in which it is additional,
11151 # beyond the simple mapping.
11152 $file->insert_adjusted_lines("$fields[0]; "
11156 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11162 # Everything has been handled by the insert_adjusted_lines()
11169 sub filter_old_style_case_folding {
11170 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11171 # and later style. Different letters were used in the earlier.
11174 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11176 my @fields = split /\s*;\s*/;
11177 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11180 elsif ($fields[1] eq 'L') {
11181 $fields[1] = 'C'; # L => C always
11183 elsif ($fields[1] eq 'E') {
11184 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
11192 $file->carp_bad_line("Expecting L or E in second field");
11196 $_ = join("; ", @fields) . ';';
11200 { # Closure for case folding
11202 # Create the map for simple only if are going to output it, for otherwise
11203 # it takes no part in anything we do.
11204 my $to_output_simple;
11205 my $non_final_folds;
11207 sub setup_case_folding($) {
11208 # Read in the case foldings in CaseFolding.txt. This handles both
11209 # simple and full case folding.
11212 = property_ref('Simple_Case_Folding')->to_output_map;
11214 if (! $to_output_simple) {
11215 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11218 $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
11219 Perl_Extension => 1,
11220 Fate => $INTERNAL_ONLY,
11221 Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
11224 # If we ever wanted to show that these tables were combined, a new
11225 # property method could be created, like set_combined_props()
11226 property_ref('Case_Folding')->add_comment(join_lines( <<END
11227 This file includes both the simple and full case folding maps. The simple
11228 ones are in the main body of the table below, and the full ones adding to or
11229 overriding them are in the hash.
11235 sub filter_case_folding_line {
11236 # Called for each line in CaseFolding.txt
11237 # Input lines look like:
11238 # 0041; C; 0061; # LATIN CAPITAL LETTER A
11239 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11240 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11242 # 'C' means that folding is the same for both simple and full
11243 # 'F' that it is only for full folding
11244 # 'S' that it is only for simple folding
11245 # 'T' is locale-dependent, and ignored
11246 # 'I' is a type of 'F' used in some early releases.
11247 # Note the trailing semi-colon, unlike many of the input files. That
11248 # means that there will be an extra null field generated by the split
11249 # below, which we ignore and hence is not an error.
11252 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11254 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11255 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11256 $file->carp_bad_line('Extra fields');
11261 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
11266 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
11267 # I are all full foldings; S is single-char. For S, there is always
11268 # an F entry, so we must allow multiple values for the same code
11269 # point. Fortunately this table doesn't need further manipulation
11270 # which would preclude using multiple-values. The S is now included
11271 # so that _swash_inversion_hash() is able to construct closures
11272 # without having to worry about F mappings.
11273 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11274 $_ = "$range; Case_Folding; "
11275 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11276 if ($type eq 'F') {
11277 my @string = split " ", $map;
11278 for my $i (0 .. @string - 1 -1) {
11279 $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
11285 $file->carp_bad_line('Expecting C F I S or T in second field');
11288 # C and S are simple foldings, but simple case folding is not needed
11289 # unless we explicitly want its map table output.
11290 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11291 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11297 } # End case fold closure
11299 sub filter_jamo_line {
11300 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
11301 # from this file that is used in generating the Name property for Jamo
11302 # code points. But, it also is used to convert early versions' syntax
11303 # into the modern form. Here are two examples:
11304 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
11305 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
11307 # The input is $_, the output is $_ filtered.
11309 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11311 # Let the caller handle unexpected input. In earlier versions, there was
11312 # a third field which is supposed to be a comment, but did not have a '#'
11314 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11316 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
11319 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
11320 $fields[1] = 'R' if $fields[0] eq '1105';
11322 # Add to structure so can generate Names from it.
11323 my $cp = hex $fields[0];
11324 my $short_name = $fields[1];
11325 $Jamo{$cp} = $short_name;
11326 if ($cp <= $LBase + $LCount) {
11327 $Jamo_L{$short_name} = $cp - $LBase;
11329 elsif ($cp <= $VBase + $VCount) {
11330 $Jamo_V{$short_name} = $cp - $VBase;
11332 elsif ($cp <= $TBase + $TCount) {
11333 $Jamo_T{$short_name} = $cp - $TBase;
11336 Carp::my_carp_bug("Unexpected Jamo code point in $_");
11340 # Reassemble using just the first two fields to look like a typical
11341 # property file line
11342 $_ = "$fields[0]; $fields[1]";
11347 sub register_fraction($) {
11348 # This registers the input rational number so that it can be passed on to
11349 # utf8_heavy.pl, both in rational and floating forms.
11351 my $rational = shift;
11353 my $float = eval $rational;
11354 $nv_floating_to_rational{$float} = $rational;
11358 sub filter_numeric_value_line {
11359 # DNumValues contains lines of a different syntax than the typical
11361 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
11363 # This routine transforms $_ containing the anomalous syntax to the
11364 # typical, by filtering out the extra columns, and convert early version
11365 # decimal numbers to strings that look like rational numbers.
11368 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11370 # Starting in 5.1, there is a rational field. Just use that, omitting the
11371 # extra columns. Otherwise convert the decimal number in the second field
11372 # to a rational, and omit extraneous columns.
11373 my @fields = split /\s*;\s*/, $_, -1;
11376 if ($v_version ge v5.1.0) {
11377 if (@fields != 4) {
11378 $file->carp_bad_line('Not 4 semi-colon separated fields');
11382 $rational = $fields[3];
11383 $_ = join '; ', @fields[ 0, 3 ];
11387 # Here, is an older Unicode file, which has decimal numbers instead of
11388 # rationals in it. Use the fraction to calculate the denominator and
11389 # convert to rational.
11391 if (@fields != 2 && @fields != 3) {
11392 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11397 my $codepoints = $fields[0];
11398 my $decimal = $fields[1];
11399 if ($decimal =~ s/\.0+$//) {
11401 # Anything ending with a decimal followed by nothing but 0's is an
11403 $_ = "$codepoints; $decimal";
11404 $rational = $decimal;
11409 if ($decimal =~ /\.50*$/) {
11413 # Here have the hardcoded repeating decimals in the fraction, and
11414 # the denominator they imply. There were only a few denominators
11415 # in the older Unicode versions of this file which this code
11416 # handles, so it is easy to convert them.
11418 # The 4 is because of a round-off error in the Unicode 3.2 files
11419 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11422 elsif ($decimal =~ /\.[27]50*$/) {
11425 elsif ($decimal =~ /\.[2468]0*$/) {
11428 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11431 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11434 if ($denominator) {
11435 my $sign = ($decimal < 0) ? "-" : "";
11436 my $numerator = int((abs($decimal) * $denominator) + .5);
11437 $rational = "$sign$numerator/$denominator";
11438 $_ = "$codepoints; $rational";
11441 $file->carp_bad_line("Can't cope with number '$decimal'.");
11448 register_fraction($rational) if $rational =~ qr{/};
11453 my %unihan_properties;
11456 # Do any special setup for Unihan properties.
11458 # This property gives the wrong computed type, so override.
11459 my $usource = property_ref('kIRG_USource');
11460 $usource->set_type($STRING) if defined $usource;
11462 # This property is to be considered binary (it says so in
11463 # http://www.unicode.org/reports/tr38/)
11464 my $iicore = property_ref('kIICore');
11465 if (defined $iicore) {
11466 $iicore->set_type($FORCED_BINARY);
11467 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11469 # Unicode doesn't include the maps for this property, so don't
11470 # warn that they are missing.
11471 $iicore->set_pre_declared_maps(0);
11472 $iicore->add_comment(join_lines( <<END
11473 This property contains enum values, but Unicode UAX #38 says it should be
11474 interpreted as binary, so Perl creates tables for both 1) its enum values,
11475 plus 2) true/false tables in which it is considered true for all code points
11476 that have a non-null value
11484 sub filter_unihan_line {
11485 # Change unihan db lines to look like the others in the db. Here is
11487 # U+341C kCangjie IEKN
11489 # Tabs are used instead of semi-colons to separate fields; therefore
11490 # they may have semi-colons embedded in them. Change these to periods
11491 # so won't screw up the rest of the code.
11494 # Remove lines that don't look like ones we accept.
11495 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11500 # Extract the property, and save a reference to its object.
11502 if (! exists $unihan_properties{$property}) {
11503 $unihan_properties{$property} = property_ref($property);
11506 # Don't do anything unless the property is one we're handling, which
11507 # we determine by seeing if there is an object defined for it or not
11508 if (! defined $unihan_properties{$property}) {
11513 # Convert the tab separators to our standard semi-colons, and convert
11514 # the U+HHHH notation to the rest of the standard's HHHH
11516 s/\b U \+ (?= $code_point_re )//xg;
11518 #local $to_trace = 1 if main::DEBUG;
11519 trace $_ if main::DEBUG && $to_trace;
11525 sub filter_blocks_lines {
11526 # In the Blocks.txt file, the names of the blocks don't quite match the
11527 # names given in PropertyValueAliases.txt, so this changes them so they
11528 # do match: Blanks and hyphens are changed into underscores. Also makes
11529 # early release versions look like later ones
11531 # $_ is transformed to the correct value.
11534 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11536 if ($v_version lt v3.2.0) {
11537 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11542 # Old versions used a different syntax to mark the range.
11543 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11546 my @fields = split /\s*;\s*/, $_, -1;
11547 if (@fields != 2) {
11548 $file->carp_bad_line("Expecting exactly two fields");
11553 # Change hyphens and blanks in the block name field only
11554 $fields[1] =~ s/[ -]/_/g;
11555 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11557 $_ = join("; ", @fields);
11562 my $current_property;
11564 sub filter_old_style_proplist {
11565 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11566 # was in a completely different syntax. Ken Whistler of Unicode says
11567 # that it was something he used as an aid for his own purposes, but
11568 # was never an official part of the standard. However, comments in
11569 # DAge.txt indicate that non-character code points were available in
11570 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11571 # there except through this file (but on the other hand, they first
11572 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11573 # not. But the claim is that it was published as an aid to others who
11574 # might want some more information than was given in the official UCD
11575 # of the time. Many of the properties in it were incorporated into
11576 # the later PropList.txt, but some were not. This program uses this
11577 # early file to generate property tables that are otherwise not
11578 # accessible in the early UCD's, and most were probably not really
11579 # official at that time, so one could argue that it should be ignored,
11580 # and you can easily modify things to skip this. And there are bugs
11581 # in this file in various versions. (For example, the 2.1.9 version
11582 # removes from Alphabetic the CJK range starting at 4E00, and they
11583 # weren't added back in until 3.1.0.) Many of this file's properties
11584 # were later sanctioned, so this code generates tables for those
11585 # properties that aren't otherwise in the UCD of the time but
11586 # eventually did become official, and throws away the rest. Here is a
11587 # list of all the ones that are thrown away:
11588 # Bidi=* duplicates UnicodeData.txt
11589 # Combining never made into official property;
11591 # Composite never made into official property.
11592 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11593 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11594 # Delimiter never made into official property;
11596 # Format Control never made into official property;
11598 # High Surrogate duplicates Blocks.txt
11599 # Ignorable Control never made into official property;
11601 # ISO Control duplicates UnicodeData.txt: gc=cc
11602 # Left of Pair never made into official property;
11603 # Line Separator duplicates UnicodeData.txt: gc=zl
11604 # Low Surrogate duplicates Blocks.txt
11605 # Non-break was actually listed as a property
11606 # in 3.2, but without any code
11607 # points. Unicode denies that this
11608 # was ever an official property
11609 # Non-spacing duplicate UnicodeData.txt: gc=mn
11610 # Numeric duplicates UnicodeData.txt: gc=cc
11611 # Paired Punctuation never made into official property;
11612 # appears to be gc=ps + gc=pe
11613 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11614 # Private Use duplicates UnicodeData.txt: gc=co
11615 # Private Use High Surrogate duplicates Blocks.txt
11616 # Punctuation duplicates UnicodeData.txt: gc=p
11617 # Space different definition than eventual
11619 # Titlecase duplicates UnicodeData.txt: gc=lt
11620 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
11621 # Zero-width never made into official property;
11623 # Most of the properties have the same names in this file as in later
11624 # versions, but a couple do not.
11626 # This subroutine filters $_, converting it from the old style into
11627 # the new style. Here's a sample of the old-style
11629 # *******************************************
11631 # Property dump for: 0x100000A0 (Join Control)
11633 # 200C..200D (2 chars)
11635 # In the example, the property is "Join Control". It is kept in this
11636 # closure between calls to the subroutine. The numbers beginning with
11637 # 0x were internal to Ken's program that generated this file.
11639 # If this line contains the property name, extract it.
11640 if (/^Property dump for: [^(]*\((.*)\)/) {
11643 # Convert white space to underscores.
11646 # Convert the few properties that don't have the same name as
11647 # their modern counterparts
11648 s/Identifier_Part/ID_Continue/
11649 or s/Not_a_Character/NChar/;
11651 # If the name matches an existing property, use it.
11652 if (defined property_ref($_)) {
11653 trace "new property=", $_ if main::DEBUG && $to_trace;
11654 $current_property = $_;
11656 else { # Otherwise discard it
11657 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11658 undef $current_property;
11660 $_ = ""; # The property is saved for the next lines of the
11661 # file, but this defining line is of no further use,
11662 # so clear it so that the caller won't process it
11665 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11667 # Here, the input line isn't a header defining a property for the
11668 # following section, and either we aren't in such a section, or
11669 # the line doesn't look like one that defines the code points in
11670 # such a section. Ignore this line.
11675 # Here, we have a line defining the code points for the current
11676 # stashed property. Anything starting with the first blank is
11677 # extraneous. Otherwise, it should look like a normal range to
11678 # the caller. Append the property name so that it looks just like
11679 # a modern PropList entry.
11682 $_ .= "; $current_property";
11684 trace $_ if main::DEBUG && $to_trace;
11687 } # End closure for old style proplist
11689 sub filter_old_style_normalization_lines {
11690 # For early releases of Unicode, the lines were like:
11691 # 74..2A76 ; NFKD_NO
11692 # For later releases this became:
11693 # 74..2A76 ; NFKD_QC; N
11694 # Filter $_ to look like those in later releases.
11695 # Similarly for MAYBEs
11697 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11699 # Also, the property FC_NFKC was abbreviated to FNC
11704 sub setup_script_extensions {
11705 # The Script_Extensions property starts out with a clone of the Script
11708 my $scx = property_ref("Script_Extensions");
11709 $scx = Property->new("scx", Full_Name => "Script_Extensions")
11711 $scx->_set_format($STRING_WHITE_SPACE_LIST);
11712 $scx->initialize($script);
11713 $scx->set_default_map($script->default_map);
11714 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
11715 $scx->add_comment(join_lines( <<END
11716 The values for code points that appear in one script are just the same as for
11717 the 'Script' property. Likewise the values for those that appear in many
11718 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11719 values of code points that appear in a few scripts are a space separated list
11724 # Initialize scx's tables and the aliases for them to be the same as sc's
11725 foreach my $table ($script->tables) {
11726 my $scx_table = $scx->add_match_table($table->name,
11727 Full_Name => $table->full_name);
11728 foreach my $alias ($table->aliases) {
11729 $scx_table->add_alias($alias->name);
11734 sub filter_script_extensions_line {
11735 # The Scripts file comes with the full name for the scripts; the
11736 # ScriptExtensions, with the short name. The final mapping file is a
11737 # combination of these, and without adjustment, would have inconsistent
11738 # entries. This filters the latter file to convert to full names.
11739 # Entries look like this:
11740 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11742 my @fields = split /\s*;\s*/;
11744 # This script was erroneously omitted in this Unicode version.
11745 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
11748 foreach my $short_name (split " ", $fields[1]) {
11749 push @full_names, $script->table($short_name)->full_name;
11751 $fields[1] = join " ", @full_names;
11752 $_ = join "; ", @fields;
11757 sub setup_early_name_alias {
11759 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11761 # This has the effect of pretending that the Name_Alias property was
11762 # available in all Unicode releases. Strictly speaking, this property
11763 # should not be availabe in early releases, but doing this allows
11764 # charnames.pm to work on older releases without change. Prior to v5.16
11765 # it had these names hard-coded inside it. Unicode 6.1 came along and
11766 # created these names, and so they were removed from charnames.
11768 my $aliases = property_ref('Name_Alias');
11769 if (! defined $aliases) {
11770 $aliases = Property->new('Name_Alias', Default_Map => "");
11773 $file->insert_lines(get_old_name_aliases());
11778 sub get_old_name_aliases () {
11780 # The Unicode_1_Name field, contains most of these names. One would
11781 # expect, given the field's name, that its values would be fixed across
11782 # versions, giving the true Unicode version 1 name for the character.
11783 # Sadly, this is not the case. Actually Version 1.1.5 had no names for
11784 # any of the controls; Version 2.0 introduced names for the C0 controls,
11785 # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
11786 # changed some names: it
11787 # changed to parenthesized versions like "NEXT LINE" to
11788 # "NEXT LINE (NEL)";
11789 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
11790 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
11791 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
11792 # This list contains all the names that were defined so that
11793 # charnames::vianame(), etc. understand them all EVEN if this version of
11794 # Unicode didn't specify them (this could be construed as a bug).
11795 # mktables elsewhere gives preference to the Unicode_1_Name field over
11796 # these names, so that viacode() will return the correct value for that
11797 # version of Unicode, except when that version doesn't define a name,
11798 # viacode() will return one anyway (this also could be construed as a
11799 # bug). But these potential "bugs" allow for the smooth working of code
11800 # on earlier Unicode releases.
11802 my @return = split /\n/, <<'END';
11804 0000;NUL;abbreviation
11805 0001;START OF HEADING;control
11806 0001;SOH;abbreviation
11807 0002;START OF TEXT;control
11808 0002;STX;abbreviation
11809 0003;END OF TEXT;control
11810 0003;ETX;abbreviation
11811 0004;END OF TRANSMISSION;control
11812 0004;EOT;abbreviation
11813 0005;ENQUIRY;control
11814 0005;ENQ;abbreviation
11815 0006;ACKNOWLEDGE;control
11816 0006;ACK;abbreviation
11818 0007;BEL;abbreviation
11819 0008;BACKSPACE;control
11820 0008;BS;abbreviation
11821 0009;CHARACTER TABULATION;control
11822 0009;HORIZONTAL TABULATION;control
11823 0009;HT;abbreviation
11824 0009;TAB;abbreviation
11825 000A;LINE FEED;control
11826 000A;LINE FEED (LF);control
11827 000A;NEW LINE;control
11828 000A;END OF LINE;control
11829 000A;LF;abbreviation
11830 000A;NL;abbreviation
11831 000A;EOL;abbreviation
11832 000B;LINE TABULATION;control
11833 000B;VERTICAL TABULATION;control
11834 000B;VT;abbreviation
11835 000C;FORM FEED;control
11836 000C;FORM FEED (FF);control
11837 000C;FF;abbreviation
11838 000D;CARRIAGE RETURN;control
11839 000D;CARRIAGE RETURN (CR);control
11840 000D;CR;abbreviation
11841 000E;SHIFT OUT;control
11842 000E;LOCKING-SHIFT ONE;control
11843 000E;SO;abbreviation
11844 000F;SHIFT IN;control
11845 000F;LOCKING-SHIFT ZERO;control
11846 000F;SI;abbreviation
11847 0010;DATA LINK ESCAPE;control
11848 0010;DLE;abbreviation
11849 0011;DEVICE CONTROL ONE;control
11850 0011;DC1;abbreviation
11851 0012;DEVICE CONTROL TWO;control
11852 0012;DC2;abbreviation
11853 0013;DEVICE CONTROL THREE;control
11854 0013;DC3;abbreviation
11855 0014;DEVICE CONTROL FOUR;control
11856 0014;DC4;abbreviation
11857 0015;NEGATIVE ACKNOWLEDGE;control
11858 0015;NAK;abbreviation
11859 0016;SYNCHRONOUS IDLE;control
11860 0016;SYN;abbreviation
11861 0017;END OF TRANSMISSION BLOCK;control
11862 0017;ETB;abbreviation
11863 0018;CANCEL;control
11864 0018;CAN;abbreviation
11865 0019;END OF MEDIUM;control
11866 0019;EOM;abbreviation
11867 001A;SUBSTITUTE;control
11868 001A;SUB;abbreviation
11869 001B;ESCAPE;control
11870 001B;ESC;abbreviation
11871 001C;INFORMATION SEPARATOR FOUR;control
11872 001C;FILE SEPARATOR;control
11873 001C;FS;abbreviation
11874 001D;INFORMATION SEPARATOR THREE;control
11875 001D;GROUP SEPARATOR;control
11876 001D;GS;abbreviation
11877 001E;INFORMATION SEPARATOR TWO;control
11878 001E;RECORD SEPARATOR;control
11879 001E;RS;abbreviation
11880 001F;INFORMATION SEPARATOR ONE;control
11881 001F;UNIT SEPARATOR;control
11882 001F;US;abbreviation
11883 0020;SP;abbreviation
11884 007F;DELETE;control
11885 007F;DEL;abbreviation
11886 0080;PADDING CHARACTER;figment
11887 0080;PAD;abbreviation
11888 0081;HIGH OCTET PRESET;figment
11889 0081;HOP;abbreviation
11890 0082;BREAK PERMITTED HERE;control
11891 0082;BPH;abbreviation
11892 0083;NO BREAK HERE;control
11893 0083;NBH;abbreviation
11895 0084;IND;abbreviation
11896 0085;NEXT LINE;control
11897 0085;NEXT LINE (NEL);control
11898 0085;NEL;abbreviation
11899 0086;START OF SELECTED AREA;control
11900 0086;SSA;abbreviation
11901 0087;END OF SELECTED AREA;control
11902 0087;ESA;abbreviation
11903 0088;CHARACTER TABULATION SET;control
11904 0088;HORIZONTAL TABULATION SET;control
11905 0088;HTS;abbreviation
11906 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
11907 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
11908 0089;HTJ;abbreviation
11909 008A;LINE TABULATION SET;control
11910 008A;VERTICAL TABULATION SET;control
11911 008A;VTS;abbreviation
11912 008B;PARTIAL LINE FORWARD;control
11913 008B;PARTIAL LINE DOWN;control
11914 008B;PLD;abbreviation
11915 008C;PARTIAL LINE BACKWARD;control
11916 008C;PARTIAL LINE UP;control
11917 008C;PLU;abbreviation
11918 008D;REVERSE LINE FEED;control
11919 008D;REVERSE INDEX;control
11920 008D;RI;abbreviation
11921 008E;SINGLE SHIFT TWO;control
11922 008E;SINGLE-SHIFT-2;control
11923 008E;SS2;abbreviation
11924 008F;SINGLE SHIFT THREE;control
11925 008F;SINGLE-SHIFT-3;control
11926 008F;SS3;abbreviation
11927 0090;DEVICE CONTROL STRING;control
11928 0090;DCS;abbreviation
11929 0091;PRIVATE USE ONE;control
11930 0091;PRIVATE USE-1;control
11931 0091;PU1;abbreviation
11932 0092;PRIVATE USE TWO;control
11933 0092;PRIVATE USE-2;control
11934 0092;PU2;abbreviation
11935 0093;SET TRANSMIT STATE;control
11936 0093;STS;abbreviation
11937 0094;CANCEL CHARACTER;control
11938 0094;CCH;abbreviation
11939 0095;MESSAGE WAITING;control
11940 0095;MW;abbreviation
11941 0096;START OF GUARDED AREA;control
11942 0096;START OF PROTECTED AREA;control
11943 0096;SPA;abbreviation
11944 0097;END OF GUARDED AREA;control
11945 0097;END OF PROTECTED AREA;control
11946 0097;EPA;abbreviation
11947 0098;START OF STRING;control
11948 0098;SOS;abbreviation
11949 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
11950 0099;SGC;abbreviation
11951 009A;SINGLE CHARACTER INTRODUCER;control
11952 009A;SCI;abbreviation
11953 009B;CONTROL SEQUENCE INTRODUCER;control
11954 009B;CSI;abbreviation
11955 009C;STRING TERMINATOR;control
11956 009C;ST;abbreviation
11957 009D;OPERATING SYSTEM COMMAND;control
11958 009D;OSC;abbreviation
11959 009E;PRIVACY MESSAGE;control
11960 009E;PM;abbreviation
11961 009F;APPLICATION PROGRAM COMMAND;control
11962 009F;APC;abbreviation
11963 00A0;NBSP;abbreviation
11964 00AD;SHY;abbreviation
11965 200B;ZWSP;abbreviation
11966 200C;ZWNJ;abbreviation
11967 200D;ZWJ;abbreviation
11968 200E;LRM;abbreviation
11969 200F;RLM;abbreviation
11970 202A;LRE;abbreviation
11971 202B;RLE;abbreviation
11972 202C;PDF;abbreviation
11973 202D;LRO;abbreviation
11974 202E;RLO;abbreviation
11975 FEFF;BYTE ORDER MARK;alternate
11976 FEFF;BOM;abbreviation
11977 FEFF;ZWNBSP;abbreviation
11980 if ($v_version ge v3.0.0) {
11981 push @return, split /\n/, <<'END';
11982 180B; FVS1; abbreviation
11983 180C; FVS2; abbreviation
11984 180D; FVS3; abbreviation
11985 180E; MVS; abbreviation
11986 202F; NNBSP; abbreviation
11990 if ($v_version ge v3.2.0) {
11991 push @return, split /\n/, <<'END';
11992 034F; CGJ; abbreviation
11993 205F; MMSP; abbreviation
11994 2060; WJ; abbreviation
11997 my $cp = 0xFE00 - 1;
11998 for my $i (1..16) {
11999 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12002 if ($v_version ge v4.0.0) { # Add in VS17..VS256
12003 my $cp = 0xE0100 - 17;
12004 for my $i (17..256) {
12005 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12009 # ALERT did not come along until 6.0, at which point it became preferred
12010 # over BELL, and was never in the Unicode_1_Name field. For the same
12011 # reasons, that the other names are made known to all releases by this
12012 # function, we make ALERT known too. By inserting it
12013 # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12014 my $alert = '0007; ALERT; control';
12015 if ($v_version lt v6.0.0) {
12016 push @return, $alert;
12019 unshift @return, $alert;
12025 sub filter_later_version_name_alias_line {
12027 # This file has an extra entry per line for the alias type. This is
12028 # handled by creating a compound entry: "$alias: $type"; First, split
12029 # the line into components.
12030 my ($range, $alias, $type, @remainder)
12031 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12033 # This file contains multiple entries for some components, so tell the
12034 # downstream code to allow this in our internal tables; the
12035 # $MULTIPLE_AFTER preserves the input ordering.
12036 $_ = join ";", $range, $CMD_DELIM
12046 sub filter_early_version_name_alias_line {
12048 # Early versions did not have the trailing alias type field; implicitly it
12049 # was 'correction'. But our synthetic lines we add in this program do
12050 # have it, so test for the type field.
12051 $_ .= "; correction" if $_ !~ /;.*;/;
12053 filter_later_version_name_alias_line;
12057 sub finish_Unicode() {
12058 # This routine should be called after all the Unicode files have been read
12060 # 1) Creates properties that are missing from the version of Unicode being
12061 # compiled, and which, for whatever reason, are needed for the Perl
12062 # core to function properly. These are minimally populated as
12064 # 2) Adds the mappings for code points missing from the files which have
12065 # defaults specified for them.
12066 # 3) At this this point all mappings are known, so it computes the type of
12067 # each property whose type hasn't been determined yet.
12068 # 4) Calculates all the regular expression match tables based on the
12070 # 5) Calculates and adds the tables which are defined by Unicode, but
12071 # which aren't derived by them, and certain derived tables that Perl
12074 # GCB and hst are not in early Unicode releases; create dummy ones if
12075 # they don't exist, as the core needs tables generated from them.
12076 my $gcb = property_ref('Grapheme_Cluster_Break');
12077 if (! defined $gcb) {
12078 $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
12079 Status => $PLACEHOLDER,
12081 Default_Map => 'Other');
12083 my $hst = property_ref('Hangul_Syllable_Type');
12084 if (!defined $hst) {
12085 $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
12086 Status => $PLACEHOLDER,
12088 Default_Map => 'Not_Applicable');
12091 # For each property, fill in any missing mappings, and calculate the re
12092 # match tables. If a property has more than one missing mapping, the
12093 # default is a reference to a data structure, and requires data from other
12094 # properties to resolve. The sort is used to cause these to be processed
12095 # last, after all the other properties have been calculated.
12096 # (Fortunately, the missing properties so far don't depend on each other.)
12097 foreach my $property
12098 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12101 # $perl has been defined, but isn't one of the Unicode properties that
12102 # need to be finished up.
12103 next if $property == $perl;
12105 # Nor do we need to do anything with properties that aren't going to
12107 next if $property->fate == $SUPPRESSED;
12109 # Handle the properties that have more than one possible default
12110 if (ref $property->default_map) {
12111 my $default_map = $property->default_map;
12113 # These properties have stored in the default_map:
12115 # 1) A default map which applies to all code points in a
12117 # 2) an expression which will evaluate to the list of code
12118 # points in that class
12120 # 3) the default map which applies to every other missing code
12123 # Go through each list.
12124 while (my ($default, $eval) = $default_map->get_next_defaults) {
12126 # Get the class list, and intersect it with all the so-far
12127 # unspecified code points yielding all the code points
12128 # in the class that haven't been specified.
12129 my $list = eval $eval;
12131 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12135 # Narrow down the list to just those code points we don't have
12137 $list = $list & $property->inverse_list;
12139 # Add mappings to the property for each code point in the list
12140 foreach my $range ($list->ranges) {
12141 $property->add_map($range->start, $range->end, $default,
12142 Replace => $CROAK);
12146 # All remaining code points have the other mapping. Set that up
12147 # so the normal single-default mapping code will work on them
12148 $property->set_default_map($default_map->other_default);
12150 # And fall through to do that
12153 # We should have enough data now to compute the type of the property.
12154 $property->compute_type;
12155 my $property_type = $property->type;
12157 next if ! $property->to_create_match_tables;
12159 # Here want to create match tables for this property
12161 # The Unicode db always (so far, and they claim into the future) have
12162 # the default for missing entries in binary properties be 'N' (unless
12163 # there is a '@missing' line that specifies otherwise)
12164 if ($property_type == $BINARY && ! defined $property->default_map) {
12165 $property->set_default_map('N');
12168 # Add any remaining code points to the mapping, using the default for
12169 # missing code points.
12171 if (defined (my $default_map = $property->default_map)) {
12173 # Make sure there is a match table for the default
12174 if (! defined ($default_table = $property->table($default_map))) {
12175 $default_table = $property->add_match_table($default_map);
12178 # And, if the property is binary, the default table will just
12179 # be the complement of the other table.
12180 if ($property_type == $BINARY) {
12181 my $non_default_table;
12183 # Find the non-default table.
12184 for my $table ($property->tables) {
12185 next if $table == $default_table;
12186 $non_default_table = $table;
12188 $default_table->set_complement($non_default_table);
12192 # This fills in any missing values with the default. It's not
12193 # necessary to do this with binary properties, as the default
12194 # is defined completely in terms of the Y table.
12195 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
12196 $default_map, Replace => $NO);
12200 # Have all we need to populate the match tables.
12201 my $property_name = $property->name;
12202 my $maps_should_be_defined = $property->pre_declared_maps;
12203 foreach my $range ($property->ranges) {
12204 my $map = $range->value;
12205 my $table = $property->table($map);
12206 if (! defined $table) {
12208 # Integral and rational property values are not necessarily
12209 # defined in PropValueAliases, but whether all the other ones
12210 # should be depends on the property.
12211 if ($maps_should_be_defined
12212 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12214 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
12216 $table = $property->add_match_table($map);
12219 next if $table->complement != 0; # Don't need to populate these
12220 $table->add_range($range->start, $range->end);
12223 # A forced binary property has additional true/false tables which
12224 # should have been set up when it was forced into binary. The false
12225 # table matches exactly the same set as the property's default table.
12226 # The true table matches the complement of that. The false table is
12227 # not the same as an additional set of aliases on top of the default
12228 # table, so use 'set_equivalent_to'. If it were implemented as
12229 # additional aliases, various things would have to be adjusted, but
12230 # especially, if the user wants to get a list of names for the table
12231 # using Unicode::UCD::prop_value_aliases(), s/he should get a
12232 # different set depending on whether they want the default table or
12234 if ($property_type == $FORCED_BINARY) {
12235 $property->table('N')->set_equivalent_to($default_table,
12237 $property->table('Y')->set_complement($default_table);
12240 # For Perl 5.6 compatibility, all properties matchable in regexes can
12241 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
12242 # But warn if this creates a conflict with a (new) Unicode property
12243 # name, although it appears that Unicode has made a decision never to
12244 # begin a property name with 'Is_', so this shouldn't happen.
12245 foreach my $alias ($property->aliases) {
12246 my $Is_name = 'Is_' . $alias->name;
12247 if (defined (my $pre_existing = property_ref($Is_name))) {
12248 Carp::my_carp(<<END
12249 There is already an alias named $Is_name (from " . $pre_existing . "), so
12250 creating one for $property won't work. This is bad news. If it is not too
12251 late, get Unicode to back off. Otherwise go back to the old scheme (findable
12252 from the git blame log for this area of the code that suppressed individual
12253 aliases that conflict with the new Unicode names. Proceeding anyway.
12257 } # End of loop through aliases for this property
12258 } # End of loop through all Unicode properties.
12260 # Fill in the mappings that Unicode doesn't completely furnish. First the
12261 # single letter major general categories. If Unicode were to start
12262 # delivering the values, this would be redundant, but better that than to
12263 # try to figure out if should skip and not get it right. Ths could happen
12264 # if a new major category were to be introduced, and the hard-coded test
12265 # wouldn't know about it.
12266 # This routine depends on the standard names for the general categories
12267 # being what it thinks they are, like 'Cn'. The major categories are the
12268 # union of all the general category tables which have the same first
12269 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12270 foreach my $minor_table ($gc->tables) {
12271 my $minor_name = $minor_table->name;
12272 next if length $minor_name == 1;
12273 if (length $minor_name != 2) {
12274 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
12278 my $major_name = uc(substr($minor_name, 0, 1));
12279 my $major_table = $gc->table($major_name);
12280 $major_table += $minor_table;
12283 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
12284 # defines it as LC)
12285 my $LC = $gc->table('LC');
12286 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
12287 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
12290 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12291 # deliver the correct values in it
12292 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12294 # Lt not in release 1.
12295 if (defined $gc->table('Lt')) {
12296 $LC += $gc->table('Lt');
12297 $gc->table('Lt')->set_caseless_equivalent($LC);
12300 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12302 $gc->table('Ll')->set_caseless_equivalent($LC);
12303 $gc->table('Lu')->set_caseless_equivalent($LC);
12305 my $Cs = $gc->table('Cs');
12308 # Folding information was introduced later into Unicode data. To get
12309 # Perl's case ignore (/i) to work at all in releases that don't have
12310 # folding, use the best available alternative, which is lower casing.
12311 my $fold = property_ref('Simple_Case_Folding');
12312 if ($fold->is_empty) {
12313 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
12314 $fold->add_note(join_lines(<<END
12315 WARNING: This table uses lower case as a substitute for missing fold
12321 # Multiple-character mapping was introduced later into Unicode data. If
12322 # missing, use the single-characters maps as best available alternative
12323 foreach my $map (qw { Uppercase_Mapping
12329 my $full = property_ref($map);
12330 if ($full->is_empty) {
12331 my $simple = property_ref('Simple_' . $map);
12332 $full->initialize($simple);
12333 $full->add_comment($simple->comment) if ($simple->comment);
12334 $full->add_note(join_lines(<<END
12335 WARNING: This table uses simple mapping (single-character only) as a
12336 substitute for missing multiple-character information
12342 # Create digit and case fold tables with the original file names for
12343 # backwards compatibility with applications that read them directly.
12344 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12346 Perl_Extension => 1,
12347 File => 'Digit', # Trad. location
12348 Directory => $map_directory,
12351 To_Output_Map => $EXTERNAL_MAP,
12353 Initialize => property_ref('Perl_Decimal_Digit'),
12355 $Digit->add_comment(join_lines(<<END
12356 This file gives the mapping of all code points which represent a single
12357 decimal digit [0-9] to their respective digits. For example, the code point
12358 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
12359 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12364 Property->new('Legacy_Case_Folding',
12366 Directory => $map_directory,
12367 Default_Map => $CODE_POINT,
12371 To_Output_Map => $EXTERNAL_MAP,
12372 Format => $HEX_FORMAT,
12373 Initialize => property_ref('cf'),
12376 # The Script_Extensions property started out as a clone of the Script
12377 # property. But processing its data file caused some elements to be
12378 # replaced with different data. (These elements were for the Common and
12379 # Inherited properties.) This data is a qw() list of all the scripts that
12380 # the code points in the given range are in. An example line is:
12381 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
12383 # The code above has created a new match table named "Arab Syrc Thaa"
12384 # which contains 060C. (The cloned table started out with this code point
12385 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
12386 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
12387 # match table. This is repeated for all these tables and ranges. The map
12388 # data is retained in the map table for reference, but the spurious match
12389 # tables are deleted.
12391 my $scx = property_ref("Script_Extensions");
12392 if (defined $scx) {
12393 foreach my $table ($scx->tables) {
12394 next unless $table->name =~ /\s/; # All the new and only the new
12395 # tables have a space in their
12397 my @scripts = split /\s+/, $table->name;
12398 foreach my $script (@scripts) {
12399 my $script_table = $scx->table($script);
12400 $script_table += $table;
12402 $scx->delete_match_table($table);
12409 sub compile_perl() {
12410 # Create perl-defined tables. Almost all are part of the pseudo-property
12411 # named 'perl' internally to this program. Many of these are recommended
12412 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
12413 # on those found there.
12414 # Almost all of these are equivalent to some Unicode property.
12415 # A number of these properties have equivalents restricted to the ASCII
12416 # range, with their names prefaced by 'Posix', to signify that these match
12417 # what the Posix standard says they should match. A couple are
12418 # effectively this, but the name doesn't have 'Posix' in it because there
12419 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
12420 # to the full Unicode range, by our guesses as to what is appropriate.
12422 # 'Any' is all code points. As an error check, instead of just setting it
12423 # to be that, construct it to be the union of all the major categories
12424 $Any = $perl->add_match_table('Any',
12425 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
12428 foreach my $major_table ($gc->tables) {
12430 # Major categories are the ones with single letter names.
12431 next if length($major_table->name) != 1;
12433 $Any += $major_table;
12436 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
12437 Carp::my_carp_bug("Generated highest code point ("
12438 . sprintf("%X", $Any->max)
12439 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
12441 if ($Any->range_count != 1 || $Any->min != 0) {
12442 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
12445 $Any->add_alias('All');
12447 # Assigned is the opposite of gc=unassigned
12448 my $Assigned = $perl->add_match_table('Assigned',
12449 Description => "All assigned code points",
12450 Initialize => ~ $gc->table('Unassigned'),
12453 # Our internal-only property should be treated as more than just a
12454 # synonym; grandfather it in to the pod.
12455 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
12456 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
12457 ->set_equivalent_to(property_ref('ccc')->table('Above'),
12460 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
12461 if (defined $block) { # This is equivalent to the block if have it.
12462 my $Unicode_ASCII = $block->table('Basic_Latin');
12463 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
12464 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
12468 # Very early releases didn't have blocks, so initialize ASCII ourselves if
12470 if ($ASCII->is_empty) {
12471 $ASCII->initialize([ 0..127 ]);
12474 # Get the best available case definitions. Early Unicode versions didn't
12475 # have Uppercase and Lowercase defined, so use the general category
12476 # instead for them.
12477 my $Lower = $perl->add_match_table('Lower');
12478 my $Unicode_Lower = property_ref('Lowercase');
12479 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
12480 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
12481 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12482 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
12483 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12487 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
12490 $Lower->add_alias('XPosixLower');
12491 my $Posix_Lower = $perl->add_match_table("PosixLower",
12492 Description => "[a-z]",
12493 Initialize => $Lower & $ASCII,
12496 my $Upper = $perl->add_match_table('Upper');
12497 my $Unicode_Upper = property_ref('Uppercase');
12498 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
12499 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
12500 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12501 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
12502 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
12505 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
12508 $Upper->add_alias('XPosixUpper');
12509 my $Posix_Upper = $perl->add_match_table("PosixUpper",
12510 Description => "[A-Z]",
12511 Initialize => $Upper & $ASCII,
12514 # Earliest releases didn't have title case. Initialize it to empty if not
12515 # otherwise present
12516 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
12517 Description => '(= \p{Gc=Lt})');
12518 my $lt = $gc->table('Lt');
12520 # Earlier versions of mktables had this related to $lt since they have
12521 # identical code points, but their caseless equivalents are not the same,
12522 # one being 'Cased' and the other being 'LC', and so now must be kept as
12523 # separate entities.
12528 push @tables_that_may_be_empty, $Title->complete_name;
12531 # If this Unicode version doesn't have Cased, set up our own. From
12532 # Unicode 5.1: Definition D120: A character C is defined to be cased if
12533 # and only if C has the Lowercase or Uppercase property or has a
12534 # General_Category value of Titlecase_Letter.
12535 my $Unicode_Cased = property_ref('Cased');
12536 unless (defined $Unicode_Cased) {
12537 my $cased = $perl->add_match_table('Cased',
12538 Initialize => $Lower + $Upper + $Title,
12539 Description => 'Uppercase or Lowercase or Titlecase',
12541 $Unicode_Cased = $cased;
12543 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
12545 # Similarly, set up our own Case_Ignorable property if this Unicode
12546 # version doesn't have it. From Unicode 5.1: Definition D121: A character
12547 # C is defined to be case-ignorable if C has the value MidLetter or the
12548 # value MidNumLet for the Word_Break property or its General_Category is
12549 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
12550 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
12552 # Perl has long had an internal-only alias for this property; grandfather
12553 # it in to the pod, but discourage its use.
12554 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
12556 Fate => $INTERNAL_ONLY,
12557 Status => $DISCOURAGED);
12558 my $case_ignorable = property_ref('Case_Ignorable');
12559 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
12560 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
12565 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
12567 # The following three properties are not in early releases
12568 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
12569 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
12570 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
12572 # For versions 4.1 - 5.0, there is no MidNumLet property, and
12573 # correspondingly the case-ignorable definition lacks that one. For
12574 # 4.0, it appears that it was meant to be the same definition, but was
12575 # inadvertently omitted from the standard's text, so add it if the
12576 # property actually is there
12577 my $wb = property_ref('Word_Break');
12579 my $midlet = $wb->table('MidLetter');
12580 $perl_case_ignorable += $midlet if defined $midlet;
12581 my $midnumlet = $wb->table('MidNumLet');
12582 $perl_case_ignorable += $midnumlet if defined $midnumlet;
12586 # In earlier versions of the standard, instead of the above two
12587 # properties , just the following characters were used:
12588 $perl_case_ignorable += 0x0027 # APOSTROPHE
12589 + 0x00AD # SOFT HYPHEN (SHY)
12590 + 0x2019; # RIGHT SINGLE QUOTATION MARK
12594 # The remaining perl defined tables are mostly based on Unicode TR 18,
12595 # "Annex C: Compatibility Properties". All of these have two versions,
12596 # one whose name generally begins with Posix that is posix-compliant, and
12597 # one that matches Unicode characters beyond the Posix, ASCII range
12599 my $Alpha = $perl->add_match_table('Alpha');
12601 # Alphabetic was not present in early releases
12602 my $Alphabetic = property_ref('Alphabetic');
12603 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
12604 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
12608 # For early releases, we don't get it exactly right. The below
12609 # includes more than it should, which in 5.2 terms is: L + Nl +
12610 # Other_Alphabetic. Other_Alphabetic contains many characters from
12611 # Mn and Mc. It's better to match more than we should, than less than
12613 $Alpha->initialize($gc->table('Letter')
12615 + $gc->table('Mc'));
12616 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
12617 $Alpha->add_description('Alphabetic');
12619 $Alpha->add_alias('XPosixAlpha');
12620 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
12621 Description => "[A-Za-z]",
12622 Initialize => $Alpha & $ASCII,
12624 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
12625 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
12627 my $Alnum = $perl->add_match_table('Alnum',
12628 Description => 'Alphabetic and (decimal) Numeric',
12629 Initialize => $Alpha + $gc->table('Decimal_Number'),
12631 $Alnum->add_alias('XPosixAlnum');
12632 $perl->add_match_table("PosixAlnum",
12633 Description => "[A-Za-z0-9]",
12634 Initialize => $Alnum & $ASCII,
12637 my $Word = $perl->add_match_table('Word',
12638 Description => '\w, including beyond ASCII;'
12639 . ' = \p{Alnum} + \pM + \p{Pc}',
12640 Initialize => $Alnum + $gc->table('Mark'),
12642 $Word->add_alias('XPosixWord');
12643 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
12644 $Word += $Pc if defined $Pc;
12646 # This is a Perl extension, so the name doesn't begin with Posix.
12647 my $PerlWord = $perl->add_match_table('PerlWord',
12648 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
12649 Initialize => $Word & $ASCII,
12651 $PerlWord->add_alias('PosixWord');
12653 my $Blank = $perl->add_match_table('Blank',
12654 Description => '\h, Horizontal white space',
12656 # 200B is Zero Width Space which is for line
12657 # break control, and was listed as
12658 # Space_Separator in early releases
12659 Initialize => $gc->table('Space_Separator')
12663 $Blank->add_alias('HorizSpace'); # Another name for it.
12664 $Blank->add_alias('XPosixBlank');
12665 $perl->add_match_table("PosixBlank",
12666 Description => "\\t and ' '",
12667 Initialize => $Blank & $ASCII,
12670 my $VertSpace = $perl->add_match_table('VertSpace',
12671 Description => '\v',
12672 Initialize => $gc->table('Line_Separator')
12673 + $gc->table('Paragraph_Separator')
12674 + 0x000A # LINE FEED
12675 + 0x000B # VERTICAL TAB
12676 + 0x000C # FORM FEED
12677 + 0x000D # CARRIAGE RETURN
12680 # No Posix equivalent for vertical space
12682 my $Space = $perl->add_match_table('Space',
12683 Description => '\s including beyond ASCII plus vertical tab',
12684 Initialize => $Blank + $VertSpace,
12686 $Space->add_alias('XPosixSpace');
12687 $perl->add_match_table("PosixSpace",
12688 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
12689 Initialize => $Space & $ASCII,
12692 # Perl's traditional space doesn't include Vertical Tab
12693 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
12694 Description => '\s, including beyond ASCII',
12695 #Initialize => $Space - 0x000B,
12696 Initialize => $Space,
12698 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
12699 my $PerlSpace = $perl->add_match_table('PerlSpace',
12700 Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
12701 Initialize => $XPerlSpace & $ASCII,
12705 my $Cntrl = $perl->add_match_table('Cntrl',
12706 Description => 'Control characters');
12707 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
12708 $Cntrl->add_alias('XPosixCntrl');
12709 $perl->add_match_table("PosixCntrl",
12710 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
12711 Initialize => $Cntrl & $ASCII,
12714 # $controls is a temporary used to construct Graph.
12715 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
12716 + $gc->table('Control'));
12717 # Cs not in release 1
12718 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
12720 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
12721 my $Graph = $perl->add_match_table('Graph',
12722 Description => 'Characters that are graphical',
12723 Initialize => ~ ($Space + $controls),
12725 $Graph->add_alias('XPosixGraph');
12726 $perl->add_match_table("PosixGraph",
12728 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
12729 Initialize => $Graph & $ASCII,
12732 $print = $perl->add_match_table('Print',
12733 Description => 'Characters that are graphical plus space characters (but no controls)',
12734 Initialize => $Blank + $Graph - $gc->table('Control'),
12736 $print->add_alias('XPosixPrint');
12737 $perl->add_match_table("PosixPrint",
12739 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12740 Initialize => $print & $ASCII,
12743 my $Punct = $perl->add_match_table('Punct');
12744 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
12746 # \p{punct} doesn't include the symbols, which posix does
12747 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
12748 Description => '\p{Punct} + ASCII-range \p{Symbol}',
12749 Initialize => $gc->table('Punctuation')
12750 + ($ASCII & $gc->table('Symbol')),
12751 Perl_Extension => 1
12753 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
12754 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12755 Initialize => $ASCII & $XPosixPunct,
12758 my $Digit = $perl->add_match_table('Digit',
12759 Description => '[0-9] + all other decimal digits');
12760 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
12761 $Digit->add_alias('XPosixDigit');
12762 my $PosixDigit = $perl->add_match_table("PosixDigit",
12763 Description => '[0-9]',
12764 Initialize => $Digit & $ASCII,
12767 # Hex_Digit was not present in first release
12768 my $Xdigit = $perl->add_match_table('XDigit');
12769 $Xdigit->add_alias('XPosixXDigit');
12770 my $Hex = property_ref('Hex_Digit');
12771 if (defined $Hex && ! $Hex->is_empty) {
12772 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
12775 # (Have to use hex instead of e.g. '0', because could be running on an
12776 # non-ASCII machine, and we want the Unicode (ASCII) values)
12777 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
12778 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
12779 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
12782 # AHex was not present in early releases
12783 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
12784 my $AHex = property_ref('ASCII_Hex_Digit');
12785 if (defined $AHex && ! $AHex->is_empty) {
12786 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
12789 $PosixXDigit->initialize($Xdigit & $ASCII);
12791 $PosixXDigit->add_description('[0-9A-Fa-f]');
12793 my $dt = property_ref('Decomposition_Type');
12794 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
12795 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
12796 Perl_Extension => 1,
12797 Note => 'Union of all non-canonical decompositions',
12800 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
12801 # than SD appeared, construct it ourselves, based on the first release SD
12802 # was in. A pod entry is grandfathered in for it
12803 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
12804 Perl_Extension => 1,
12805 Fate => $INTERNAL_ONLY,
12806 Status => $DISCOURAGED);
12807 my $soft_dotted = property_ref('Soft_Dotted');
12808 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
12809 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
12813 # This list came from 3.2 Soft_Dotted.
12814 $CanonDCIJ->initialize([ 0x0069,
12823 $CanonDCIJ = $CanonDCIJ & $Assigned;
12826 # These are used in Unicode's definition of \X
12827 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
12828 Fate => $INTERNAL_ONLY);
12829 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
12830 Fate => $INTERNAL_ONLY);
12832 # For backward compatibility, Perl has its own definition for IDStart
12833 # First, we include the underscore, and then the regular XID_Start also
12835 $perl->add_match_table('_Perl_IDStart',
12836 Perl_Extension => 1,
12837 Fate => $INTERNAL_ONLY,
12840 + (property_ref('XID_Start')->table('Y') & $Word)
12843 my $gcb = property_ref('Grapheme_Cluster_Break');
12845 # The 'extended' grapheme cluster came in 5.1. The non-extended
12846 # definition differs too much from the traditional Perl one to use.
12847 if (defined $gcb->table('SpacingMark')) {
12849 # Note that assumes hst is defined; it came in an earlier release than
12850 # GCB. In the line below, two negatives means: yes hangul
12851 $begin += ~ property_ref('Hangul_Syllable_Type')
12852 ->table('Not_Applicable')
12853 + ~ ($gcb->table('Control')
12854 + $gcb->table('CR')
12855 + $gcb->table('LF'));
12856 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12858 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12859 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
12861 else { # Old definition, used on early releases.
12862 $extend += $gc->table('Mark')
12865 $begin += ~ $extend;
12867 # Here we may have a release that has the regular grapheme cluster
12868 # defined, or a release that doesn't have anything defined.
12869 # We set things up so the Perl core degrades gracefully, possibly with
12870 # placeholders that match nothing.
12872 my $hst = property_ref('Hangul_Syllable_Type');
12874 # On some releases, here we may not have the needed tables for the
12875 # perl core, in some releases we may.
12876 foreach my $name (qw{ L LV LVT T V prepend }) {
12877 my $table = $gcb->table($name);
12878 if (! defined $table) {
12879 $table = $gcb->add_match_table($name);
12880 push @tables_that_may_be_empty, $table->complete_name;
12883 # The hst property predates the GCB one, and has identical tables
12884 # for some of them, so use it if we can.
12885 if ($table->is_empty && defined $hst->table($name))
12887 $table += $hst->table($name);
12892 # More GCB. If we found some hangul syllables, populate a combined
12894 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12895 Perl_Extension => 1,
12896 Fate => $INTERNAL_ONLY);
12897 my $LV = $gcb->table('LV');
12898 if ($LV->is_empty) {
12899 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12901 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12902 $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
12905 # Was previously constructed to contain both Name and Unicode_1_Name
12906 my @composition = ('Name', 'Unicode_1_Name');
12908 if (@named_sequences) {
12909 push @composition, 'Named_Sequence';
12910 foreach my $sequence (@named_sequences) {
12911 $perl_charname->add_anomalous_entry($sequence);
12915 my $alias_sentence = "";
12917 my $alias = property_ref('Name_Alias');
12918 push @composition, 'Name_Alias';
12919 $perl_charname->set_proxy_for('Name_Alias');
12921 # Add each entry in Name_Alias to Perl_Charnames. Where these go with
12922 # respect to any existing entry depends on the entry type. Corrections go
12923 # before said entry, as they should be returned in preference over the
12924 # existing entry. (A correction to a correction should be later in the
12925 # Name_Alias table, so it will correctly precede the erroneous correction
12926 # in Perl_Charnames.)
12928 # Abbreviations go after everything else, so they are saved temporarily in
12929 # a hash for later.
12931 # Controls are currently added afterwards. This is because Perl has
12932 # previously used the Unicode1 name, and so should still use that. (Most
12933 # of them will be the same anyway, in which case we don't add a duplicate)
12935 $alias->reset_each_range;
12936 while (my ($range) = $alias->each_range) {
12937 next if $range->value eq "";
12938 my $code_point = $range->start;
12939 if ($code_point != $range->end) {
12940 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
12942 my ($value, $type) = split ': ', $range->value;
12944 if ($type eq 'correction') {
12945 $replace_type = $MULTIPLE_BEFORE;
12947 elsif ($type eq 'abbreviation') {
12950 $abbreviations{$value} = $code_point;
12954 $replace_type = $MULTIPLE_AFTER;
12957 # Actually add; before or after current entry(ies) as determined
12960 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
12962 $alias_sentence = <<END;
12963 The Name_Alias property adds duplicate code point entries that are
12964 alternatives to the original name. If an addition is a corrected
12965 name, it will be physically first in the table. The original (less correct,
12966 but still valid) name will be next; then any alternatives, in no particular
12967 order; and finally any abbreviations, again in no particular order.
12970 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
12971 # precedence before 6.1, so should be first in the file; the other names
12972 # have precedence starting in 6.1,
12973 my $before_or_after = ($v_version lt v6.1.0)
12977 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
12978 my $code_point = $range->start;
12979 my $unicode_1_value = $range->value;
12980 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
12982 if ($code_point != $range->end) {
12983 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
12986 # To handle EBCDIC, we don't hard code in the code points of the
12987 # controls; instead realizing that all of them are below 256.
12988 last if $code_point > 255;
12990 # We only add in the controls.
12991 next if $gc->value_of($code_point) ne 'Cc';
12993 # This won't add an exact duplicate.
12994 $perl_charname->add_duplicate($code_point, $unicode_1_value,
12995 Replace => $before_or_after);
12998 # Now that have everything added, add in abbreviations after
13000 foreach my $value (keys %abbreviations) {
13001 $perl_charname->add_duplicate($abbreviations{$value}, $value,
13002 Replace => $MULTIPLE_AFTER);
13006 if (@composition <= 2) { # Always at least 2
13007 $comment = join " and ", @composition;
13010 $comment = join ", ", @composition[0 .. scalar @composition - 2];
13011 $comment .= ", and $composition[-1]";
13014 $perl_charname->add_comment(join_lines( <<END
13015 This file is for charnames.pm. It is the union of the $comment properties.
13016 Unicode_1_Name entries are used only for nameless code points in the Name
13019 This file doesn't include the algorithmically determinable names. For those,
13020 use 'unicore/Name.pm'
13023 property_ref('Name')->add_comment(join_lines( <<END
13024 This file doesn't include the algorithmically determinable names. For those,
13025 use 'unicore/Name.pm'
13029 # Construct the Present_In property from the Age property.
13030 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
13031 my $default_map = $age->default_map;
13032 my $in = Property->new('In',
13033 Default_Map => $default_map,
13034 Full_Name => "Present_In",
13035 Perl_Extension => 1,
13037 Initialize => $age,
13039 $in->add_comment(join_lines(<<END
13040 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
13041 same as for $age, and not for what $in really means. This is because anything
13042 defined in a given release should have multiple values: that release and all
13043 higher ones. But only one value per code point can be represented in a table
13048 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
13049 # lowest numbered (earliest) come first, with the non-numeric one
13051 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
13053 : ($b->name !~ /^[\d.]*$/)
13055 : $a->name <=> $b->name
13058 # The Present_In property is the cumulative age properties. The first
13059 # one hence is identical to the first age one.
13060 my $previous_in = $in->add_match_table($first_age->name);
13061 $previous_in->set_equivalent_to($first_age, Related => 1);
13063 my $description_start = "Code point's usage introduced in version ";
13064 $first_age->add_description($description_start . $first_age->name);
13066 # To construct the accumulated values, for each of the age tables
13067 # starting with the 2nd earliest, merge the earliest with it, to get
13068 # all those code points existing in the 2nd earliest. Repeat merging
13069 # the new 2nd earliest with the 3rd earliest to get all those existing
13070 # in the 3rd earliest, and so on.
13071 foreach my $current_age (@rest_ages) {
13072 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
13074 my $current_in = $in->add_match_table(
13075 $current_age->name,
13076 Initialize => $current_age + $previous_in,
13077 Description => $description_start
13078 . $current_age->name
13081 $previous_in = $current_in;
13083 # Add clarifying material for the corresponding age file. This is
13084 # in part because of the confusing and contradictory information
13085 # given in the Standard's documentation itself, as of 5.2.
13086 $current_age->add_description(
13087 "Code point's usage was introduced in version "
13088 . $current_age->name);
13089 $current_age->add_note("See also $in");
13093 # And finally the code points whose usages have yet to be decided are
13094 # the same in both properties. Note that permanently unassigned code
13095 # points actually have their usage assigned (as being permanently
13096 # unassigned), so that these tables are not the same as gc=cn.
13097 my $unassigned = $in->add_match_table($default_map);
13098 my $age_default = $age->table($default_map);
13099 $age_default->add_description(<<END
13100 Code point's usage has not been assigned in any Unicode release thus far.
13103 $unassigned->set_equivalent_to($age_default, Related => 1);
13106 # See L<perlfunc/quotemeta>
13107 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
13108 Perl_Extension => 1,
13109 Fate => $INTERNAL_ONLY,
13111 # Initialize to what's common in
13112 # all Unicode releases.
13115 + $gc->table('Control')
13118 # In early releases without the proper Unicode properties, just set to \W.
13119 if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
13120 || ! defined (my $patws = property_ref('Pattern_White_Space'))
13121 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
13123 $quotemeta += ~ $Word;
13126 $quotemeta += $patsyn->table('Y')
13127 + $patws->table('Y')
13129 + ((~ $Word) & $ASCII);
13132 # Finished creating all the perl properties. All non-internal non-string
13133 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
13134 # an underscore.) These do not get a separate entry in the pod file
13135 foreach my $table ($perl->tables) {
13136 foreach my $alias ($table->aliases) {
13137 next if $alias->name =~ /^_/;
13138 $table->add_alias('Is_' . $alias->name,
13141 Status => $alias->status,
13142 OK_as_Filename => 0);
13146 # Here done with all the basic stuff. Ready to populate the information
13147 # about each character if annotating them.
13150 # See comments at its declaration
13151 $annotate_ranges = Range_Map->new;
13153 # This separates out the non-characters from the other unassigneds, so
13154 # can give different annotations for each.
13155 $unassigned_sans_noncharacters = Range_List->new(
13156 Initialize => $gc->table('Unassigned')
13157 & property_ref('Noncharacter_Code_Point')->table('N'));
13159 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
13160 $i = populate_char_info($i); # Note sets $i so may cause skips
13167 sub add_perl_synonyms() {
13168 # A number of Unicode tables have Perl synonyms that are expressed in
13169 # the single-form, \p{name}. These are:
13170 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
13171 # \p{Is_Name} as synonyms
13172 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
13173 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
13174 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
13175 # conflict, \p{Value} and \p{Is_Value} as well
13177 # This routine generates these synonyms, warning of any unexpected
13180 # Construct the list of tables to get synonyms for. Start with all the
13181 # binary and the General_Category ones.
13182 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
13184 push @tables, $gc->tables;
13186 # If the version of Unicode includes the Script property, add its tables
13187 push @tables, $script->tables if defined $script;
13189 # The Block tables are kept separate because they are treated differently.
13190 # And the earliest versions of Unicode didn't include them, so add only if
13193 push @blocks, $block->tables if defined $block;
13195 # Here, have the lists of tables constructed. Process blocks last so that
13196 # if there are name collisions with them, blocks have lowest priority.
13197 # Should there ever be other collisions, manual intervention would be
13198 # required. See the comments at the beginning of the program for a
13199 # possible way to handle those semi-automatically.
13200 foreach my $table (@tables, @blocks) {
13202 # For non-binary properties, the synonym is just the name of the
13203 # table, like Greek, but for binary properties the synonym is the name
13204 # of the property, and means the code points in its 'Y' table.
13205 my $nominal = $table;
13206 my $nominal_property = $nominal->property;
13208 if (! $nominal->isa('Property')) {
13213 # Here is a binary property. Use the 'Y' table. Verify that is
13215 my $yes = $nominal->table('Y');
13216 unless (defined $yes) { # Must be defined, but is permissible to
13218 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
13224 foreach my $alias ($nominal->aliases) {
13226 # Attempt to create a table in the perl directory for the
13227 # candidate table, using whatever aliases in it that don't
13228 # conflict. Also add non-conflicting aliases for all these
13229 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
13231 foreach my $prefix ("", 'Is_', 'In_') {
13233 # Only Block properties can have added 'In_' aliases.
13234 next if $prefix eq 'In_' and $nominal_property != $block;
13236 my $proposed_name = $prefix . $alias->name;
13238 # No Is_Is, In_In, nor combinations thereof
13239 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
13240 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
13242 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
13244 # Get a reference to any existing table in the perl
13245 # directory with the desired name.
13246 my $pre_existing = $perl->table($proposed_name);
13248 if (! defined $pre_existing) {
13250 # No name collision, so ok to add the perl synonym.
13252 my $make_re_pod_entry;
13253 my $ok_as_filename;
13254 my $status = $alias->status;
13255 if ($nominal_property == $block) {
13257 # For block properties, the 'In' form is preferred for
13258 # external use; the pod file contains wild cards for
13259 # this and the 'Is' form so no entries for those; and
13260 # we don't want people using the name without the
13261 # 'In', so discourage that.
13262 if ($prefix eq "") {
13263 $make_re_pod_entry = 1;
13264 $status = $status || $DISCOURAGED;
13265 $ok_as_filename = 0;
13267 elsif ($prefix eq 'In_') {
13268 $make_re_pod_entry = 0;
13269 $status = $status || $NORMAL;
13270 $ok_as_filename = 1;
13273 $make_re_pod_entry = 0;
13274 $status = $status || $DISCOURAGED;
13275 $ok_as_filename = 0;
13278 elsif ($prefix ne "") {
13280 # The 'Is' prefix is handled in the pod by a wild
13281 # card, and we won't use it for an external name
13282 $make_re_pod_entry = 0;
13283 $status = $status || $NORMAL;
13284 $ok_as_filename = 0;
13288 # Here, is an empty prefix, non block. This gets its
13289 # own pod entry and can be used for an external name.
13290 $make_re_pod_entry = 1;
13291 $status = $status || $NORMAL;
13292 $ok_as_filename = 1;
13295 # Here, there isn't a perl pre-existing table with the
13296 # name. Look through the list of equivalents of this
13297 # table to see if one is a perl table.
13298 foreach my $equivalent ($actual->leader->equivalents) {
13299 next if $equivalent->property != $perl;
13301 # Here, have found a table for $perl. Add this alias
13302 # to it, and are done with this prefix.
13303 $equivalent->add_alias($proposed_name,
13304 Re_Pod_Entry => $make_re_pod_entry,
13306 # Currently don't output these in the
13307 # ucd pod, as are strongly discouraged
13312 OK_as_Filename => $ok_as_filename);
13313 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
13317 # Here, $perl doesn't already have a table that is a
13318 # synonym for this property, add one.
13319 my $added_table = $perl->add_match_table($proposed_name,
13320 Re_Pod_Entry => $make_re_pod_entry,
13322 # See UCD comment just above
13326 OK_as_Filename => $ok_as_filename);
13327 # And it will be related to the actual table, since it is
13329 $added_table->set_equivalent_to($actual, Related => 1);
13330 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
13332 } # End of no pre-existing.
13334 # Here, there is a pre-existing table that has the proposed
13335 # name. We could be in trouble, but not if this is just a
13336 # synonym for another table that we have already made a child
13337 # of the pre-existing one.
13338 if ($pre_existing->is_set_equivalent_to($actual)) {
13339 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
13340 $pre_existing->add_alias($proposed_name);
13344 # Here, there is a name collision, but it still could be ok if
13345 # the tables match the identical set of code points, in which
13346 # case, we can combine the names. Compare each table's code
13347 # point list to see if they are identical.
13348 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
13349 if ($pre_existing->matches_identically_to($actual)) {
13351 # Here, they do match identically. Not a real conflict.
13352 # Make the perl version a child of the Unicode one, except
13353 # in the non-obvious case of where the perl name is
13354 # already a synonym of another Unicode property. (This is
13355 # excluded by the test for it being its own parent.) The
13356 # reason for this exclusion is that then the two Unicode
13357 # properties become related; and we don't really know if
13358 # they are or not. We generate documentation based on
13359 # relatedness, and this would be misleading. Code
13360 # later executed in the process will cause the tables to
13361 # be represented by a single file anyway, without making
13362 # it look in the pod like they are necessarily related.
13363 if ($pre_existing->parent == $pre_existing
13364 && ($pre_existing->property == $perl
13365 || $actual->property == $perl))
13367 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
13368 $pre_existing->set_equivalent_to($actual, Related => 1);
13370 elsif (main::DEBUG && $to_trace) {
13371 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
13372 trace $pre_existing->parent;
13377 # Here they didn't match identically, there is a real conflict
13378 # between our new name and a pre-existing property.
13379 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
13380 $pre_existing->add_conflicting($nominal->full_name,
13384 # Don't output a warning for aliases for the block
13385 # properties (unless they start with 'In_') as it is
13386 # expected that there will be conflicts and the block
13388 if ($verbosity >= $NORMAL_VERBOSITY
13389 && ($actual->property != $block || $prefix eq 'In_'))
13391 print simple_fold(join_lines(<<END
13392 There is already an alias named $proposed_name (from " . $pre_existing . "),
13393 so not creating this alias for " . $actual
13398 # Keep track for documentation purposes.
13399 $has_In_conflicts++ if $prefix eq 'In_';
13400 $has_Is_conflicts++ if $prefix eq 'Is_';
13405 # There are some properties which have No and Yes (and N and Y) as
13406 # property values, but aren't binary, and could possibly be confused with
13407 # binary ones. So create caveats for them. There are tables that are
13408 # named 'No', and tables that are named 'N', but confusion is not likely
13409 # unless they are the same table. For example, N meaning Number or
13410 # Neutral is not likely to cause confusion, so don't add caveats to things
13412 foreach my $property (grep { $_->type != $BINARY
13413 && $_->type != $FORCED_BINARY }
13416 my $yes = $property->table('Yes');
13417 if (defined $yes) {
13418 my $y = $property->table('Y');
13419 if (defined $y && $yes == $y) {
13420 foreach my $alias ($property->aliases) {
13421 $yes->add_conflicting($alias->name);
13425 my $no = $property->table('No');
13427 my $n = $property->table('N');
13428 if (defined $n && $no == $n) {
13429 foreach my $alias ($property->aliases) {
13430 $no->add_conflicting($alias->name, 'P');
13439 sub register_file_for_name($$$) {
13440 # Given info about a table and a datafile that it should be associated
13441 # with, register that association
13444 my $directory_ref = shift; # Array of the directory path for the file
13445 my $file = shift; # The file name in the final directory.
13446 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13448 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
13450 if ($table->isa('Property')) {
13451 $table->set_file_path(@$directory_ref, $file);
13452 push @map_properties, $table;
13454 # No swash means don't do the rest of this.
13455 return if $table->fate != $ORDINARY;
13457 # Get the path to the file
13458 my @path = $table->file_path;
13460 # Use just the file name if no subdirectory.
13461 shift @path if $path[0] eq File::Spec->curdir();
13463 my $file = join '/', @path;
13465 # Create a hash entry for utf8_heavy to get the file that stores this
13466 # property's map table
13467 foreach my $alias ($table->aliases) {
13468 my $name = $alias->name;
13469 $loose_property_to_file_of{standardize($name)} = $file;
13472 # And a way for utf8_heavy to find the proper key in the SwashInfo
13473 # hash for this property.
13474 $file_to_swash_name{$file} = "To" . $table->swash_name;
13478 # Do all of the work for all equivalent tables when called with the leader
13479 # table, so skip if isn't the leader.
13480 return if $table->leader != $table;
13482 # If this is a complement of another file, use that other file instead,
13483 # with a ! prepended to it.
13485 if (($complement = $table->complement) != 0) {
13486 my @directories = $complement->file_path;
13488 # This assumes that the 0th element is something like 'lib',
13489 # the 1th element the property name (in its own directory), like
13490 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
13491 # appended to it later.
13492 $directories[1] =~ s/^/!/;
13493 $file = pop @directories;
13494 $directory_ref =\@directories;
13497 # Join all the file path components together, using slashes.
13498 my $full_filename = join('/', @$directory_ref, $file);
13500 # All go in the same subdirectory of unicore
13501 if ($directory_ref->[0] ne $matches_directory) {
13502 Carp::my_carp("Unexpected directory in "
13503 . join('/', @{$directory_ref}, $file));
13506 # For this table and all its equivalents ...
13507 foreach my $table ($table, $table->equivalents) {
13509 # Associate it with its file internally. Don't include the
13510 # $matches_directory first component
13511 $table->set_file_path(@$directory_ref, $file);
13513 # No swash means don't do the rest of this.
13514 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
13516 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
13518 my $property = $table->property;
13519 my $property_name = ($property == $perl)
13520 ? "" # 'perl' is never explicitly stated
13521 : standardize($property->name) . '=';
13523 my $is_default = 0; # Is this table the default one for the property?
13525 # To calculate $is_default, we find if this table is the same as the
13526 # default one for the property. But this is complicated by the
13527 # possibility that there is a master table for this one, and the
13528 # information is stored there instead of here.
13529 my $parent = $table->parent;
13530 my $leader_prop = $parent->property;
13531 my $default_map = $leader_prop->default_map;
13532 if (defined $default_map) {
13533 my $default_table = $leader_prop->table($default_map);
13534 $is_default = 1 if defined $default_table && $parent == $default_table;
13537 # Calculate the loose name for this table. Mostly it's just its name,
13538 # standardized. But in the case of Perl tables that are single-form
13539 # equivalents to Unicode properties, it is the latter's name.
13540 my $loose_table_name =
13541 ($property != $perl || $leader_prop == $perl)
13542 ? standardize($table->name)
13543 : standardize($parent->name);
13545 my $deprecated = ($table->status eq $DEPRECATED)
13546 ? $table->status_info
13548 my $caseless_equivalent = $table->caseless_equivalent;
13550 # And for each of the table's aliases... This inner loop eventually
13551 # goes through all aliases in the UCD that we generate regex match
13553 foreach my $alias ($table->aliases) {
13554 my $standard = utf8_heavy_name($table, $alias);
13556 # Generate an entry in either the loose or strict hashes, which
13557 # will translate the property and alias names combination into the
13558 # file where the table for them is stored.
13559 if ($alias->loose_match) {
13560 if (exists $loose_to_file_of{$standard}) {
13561 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
13564 $loose_to_file_of{$standard} = $sub_filename;
13568 if (exists $stricter_to_file_of{$standard}) {
13569 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
13572 $stricter_to_file_of{$standard} = $sub_filename;
13574 # Tightly coupled with how utf8_heavy.pl works, for a
13575 # floating point number that is a whole number, get rid of
13576 # the trailing decimal point and 0's, so that utf8_heavy
13577 # will work. Also note that this assumes that such a
13578 # number is matched strictly; so if that were to change,
13579 # this would be wrong.
13580 if ((my $integer_name = $alias->name)
13581 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
13583 $stricter_to_file_of{$property_name . $integer_name}
13589 # For Unicode::UCD, create a mapping of the prop=value to the
13590 # canonical =value for that property.
13591 if ($standard =~ /=/) {
13593 # This could happen if a strict name mapped into an existing
13594 # loose name. In that event, the strict names would have to
13595 # be moved to a new hash.
13596 if (exists($loose_to_standard_value{$standard})) {
13597 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
13599 $loose_to_standard_value{$standard} = $loose_table_name;
13602 # Keep a list of the deprecated properties and their filenames
13603 if ($deprecated && $complement == 0) {
13604 $utf8::why_deprecated{$sub_filename} = $deprecated;
13607 # And a substitute table, if any, for case-insensitive matching
13608 if ($caseless_equivalent != 0) {
13609 $caseless_equivalent_to{$standard} = $caseless_equivalent;
13612 # Add to defaults list if the table this alias belongs to is the
13614 $loose_defaults{$standard} = 1 if $is_default;
13622 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
13624 my %full_dir_name_of; # Full length names of directories used.
13626 sub construct_filename($$$) {
13627 # Return a file name for a table, based on the table name, but perhaps
13628 # changed to get rid of non-portable characters in it, and to make
13629 # sure that it is unique on a file system that allows the names before
13630 # any period to be at most 8 characters (DOS). While we're at it
13631 # check and complain if there are any directory conflicts.
13633 my $name = shift; # The name to start with
13634 my $mutable = shift; # Boolean: can it be changed? If no, but
13635 # yet it must be to work properly, a warning
13637 my $directories_ref = shift; # A reference to an array containing the
13638 # path to the file, with each element one path
13639 # component. This is used because the same
13640 # name can be used in different directories.
13641 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13643 my $warn = ! defined wantarray; # If true, then if the name is
13644 # changed, a warning is issued as well.
13646 if (! defined $name) {
13647 Carp::my_carp("Undefined name in directory "
13648 . File::Spec->join(@$directories_ref)
13653 # Make sure that no directory names conflict with each other. Look at
13654 # each directory in the input file's path. If it is already in use,
13655 # assume it is correct, and is merely being re-used, but if we
13656 # truncate it to 8 characters, and find that there are two directories
13657 # that are the same for the first 8 characters, but differ after that,
13658 # then that is a problem.
13659 foreach my $directory (@$directories_ref) {
13660 my $short_dir = substr($directory, 0, 8);
13661 if (defined $full_dir_name_of{$short_dir}) {
13662 next if $full_dir_name_of{$short_dir} eq $directory;
13663 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
13666 $full_dir_name_of{$short_dir} = $directory;
13670 my $path = join '/', @$directories_ref;
13671 $path .= '/' if $path;
13673 # Remove interior underscores.
13674 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
13676 # Change any non-word character into an underscore, and truncate to 8.
13677 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
13678 substr($filename, 8) = "" if length($filename) > 8;
13680 # Make sure the basename doesn't conflict with something we
13681 # might have already written. If we have, say,
13688 while (my $num = $base_names{$path}{lc $filename}++) {
13689 $num++; # so basenames with numbers start with '2', which
13690 # just looks more natural.
13692 # Want to append $num, but if it'll make the basename longer
13693 # than 8 characters, pre-truncate $filename so that the result
13695 my $delta = length($filename) + length($num) - 8;
13697 substr($filename, -$delta) = $num;
13702 if ($warn && ! $warned) {
13704 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
13708 return $filename if $mutable;
13710 # If not changeable, must return the input name, but warn if needed to
13711 # change it beyond shortening it.
13712 if ($name ne $filename
13713 && substr($name, 0, length($filename)) ne $filename) {
13714 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
13720 # The pod file contains a very large table. Many of the lines in that table
13721 # would exceed a typical output window's size, and so need to be wrapped with
13722 # a hanging indent to make them look good. The pod language is really
13723 # insufficient here. There is no general construct to do that in pod, so it
13724 # is done here by beginning each such line with a space to cause the result to
13725 # be output without formatting, and doing all the formatting here. This leads
13726 # to the result that if the eventual display window is too narrow it won't
13727 # look good, and if the window is too wide, no advantage is taken of that
13728 # extra width. A further complication is that the output may be indented by
13729 # the formatter so that there is less space than expected. What I (khw) have
13730 # done is to assume that that indent is a particular number of spaces based on
13731 # what it is in my Linux system; people can always resize their windows if
13732 # necessary, but this is obviously less than desirable, but the best that can
13734 my $automatic_pod_indent = 8;
13736 # Try to format so that uses fewest lines, but few long left column entries
13737 # slide into the right column. An experiment on 5.1 data yielded the
13738 # following percentages that didn't cut into the other side along with the
13739 # associated first-column widths
13741 # 80% not too bad except for a few blocks
13742 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
13744 my $indent_info_column = 27; # 75% of lines didn't have overlap
13746 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
13747 # The 3 is because of:
13748 # 1 for the leading space to tell the pod formatter to
13751 # 1 for the space between the flag and the main data
13753 sub format_pod_line ($$$;$$) {
13754 # Take a pod line and return it, formatted properly
13756 my $first_column_width = shift;
13757 my $entry = shift; # Contents of left column
13758 my $info = shift; # Contents of right column
13760 my $status = shift || ""; # Any flag
13762 my $loose_match = shift; # Boolean.
13763 $loose_match = 1 unless defined $loose_match;
13765 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13768 $flags .= $STRICTER if ! $loose_match;
13770 $flags .= $status if $status;
13772 # There is a blank in the left column to cause the pod formatter to
13773 # output the line as-is.
13774 return sprintf " %-*s%-*s %s\n",
13775 # The first * in the format is replaced by this, the -1 is
13776 # to account for the leading blank. There isn't a
13777 # hard-coded blank after this to separate the flags from
13778 # the rest of the line, so that in the unlikely event that
13779 # multiple flags are shown on the same line, they both
13780 # will get displayed at the expense of that separation,
13781 # but since they are left justified, a blank will be
13782 # inserted in the normal case.
13786 # The other * in the format is replaced by this number to
13787 # cause the first main column to right fill with blanks.
13788 # The -1 is for the guaranteed blank following it.
13789 $first_column_width - $FILLER - 1,
13794 my @zero_match_tables; # List of tables that have no matches in this release
13796 sub make_re_pod_entries($) {
13797 # This generates the entries for the pod file for a given table.
13798 # Also done at this time are any children tables. The output looks like:
13799 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
13801 my $input_table = shift; # Table the entry is for
13802 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13804 # Generate parent and all its children at the same time.
13805 return if $input_table->parent != $input_table;
13807 my $property = $input_table->property;
13808 my $type = $property->type;
13809 my $full_name = $property->full_name;
13811 my $count = $input_table->count;
13812 my $string_count = clarify_number($count);
13813 my $status = $input_table->status;
13814 my $status_info = $input_table->status_info;
13815 my $caseless_equivalent = $input_table->caseless_equivalent;
13817 my $entry_for_first_table; # The entry for the first table output.
13818 # Almost certainly, it is the parent.
13820 # For each related table (including itself), we will generate a pod entry
13821 # for each name each table goes by
13822 foreach my $table ($input_table, $input_table->children) {
13824 # utf8_heavy.pl cannot deal with null string property values, so skip
13825 # any tables that have no non-null names.
13826 next if ! grep { $_->name ne "" } $table->aliases;
13828 # First, gather all the info that applies to this table as a whole.
13830 push @zero_match_tables, $table if $count == 0;
13832 my $table_property = $table->property;
13834 # The short name has all the underscores removed, while the full name
13835 # retains them. Later, we decide whether to output a short synonym
13836 # for the full one, we need to compare apples to apples, so we use the
13837 # short name's length including underscores.
13838 my $table_property_short_name_length;
13839 my $table_property_short_name
13840 = $table_property->short_name(\$table_property_short_name_length);
13841 my $table_property_full_name = $table_property->full_name;
13843 # Get how much savings there is in the short name over the full one
13844 # (delta will always be <= 0)
13845 my $table_property_short_delta = $table_property_short_name_length
13846 - length($table_property_full_name);
13847 my @table_description = $table->description;
13848 my @table_note = $table->note;
13850 # Generate an entry for each alias in this table.
13851 my $entry_for_first_alias; # saves the first one encountered.
13852 foreach my $alias ($table->aliases) {
13854 # Skip if not to go in pod.
13855 next unless $alias->make_re_pod_entry;
13857 # Start gathering all the components for the entry
13858 my $name = $alias->name;
13860 # Skip if name is empty, as can't be accessed by regexes.
13861 next if $name eq "";
13863 my $entry; # Holds the left column, may include extras
13864 my $entry_ref; # To refer to the left column's contents from
13865 # another entry; has no extras
13867 # First the left column of the pod entry. Tables for the $perl
13868 # property always use the single form.
13869 if ($table_property == $perl) {
13870 $entry = "\\p{$name}";
13871 $entry_ref = "\\p{$name}";
13873 else { # Compound form.
13875 # Only generate one entry for all the aliases that mean true
13876 # or false in binary properties. Append a '*' to indicate
13877 # some are missing. (The heading comment notes this.)
13879 if ($type == $BINARY) {
13880 next if $name ne 'N' && $name ne 'Y';
13883 elsif ($type != $FORCED_BINARY) {
13888 # Forced binary properties require special handling. It
13889 # has two sets of tables, one set is true/false; and the
13890 # other set is everything else. Entries are generated for
13891 # each set. Use the Bidi_Mirrored property (which appears
13892 # in all Unicode versions) to get a list of the aliases
13893 # for the true/false tables. Of these, only output the N
13894 # and Y ones, the same as, a regular binary property. And
13895 # output all the rest, same as a non-binary property.
13896 my $bm = property_ref("Bidi_Mirrored");
13897 if ($name eq 'N' || $name eq 'Y') {
13899 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
13900 $bm->table("N")->aliases)
13909 # Colon-space is used to give a little more space to be easier
13912 . $table_property_full_name
13915 # But for the reference to this entry, which will go in the
13916 # right column, where space is at a premium, use equals
13918 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
13921 # Then the right (info) column. This is stored as components of
13922 # an array for the moment, then joined into a string later. For
13923 # non-internal only properties, begin the info with the entry for
13924 # the first table we encountered (if any), as things are ordered
13925 # so that that one is the most descriptive. This leads to the
13926 # info column of an entry being a more descriptive version of the
13929 if ($name =~ /^_/) {
13931 '(For internal use by Perl, not necessarily stable)';
13933 elsif ($entry_for_first_alias) {
13934 push @info, $entry_for_first_alias;
13937 # If this entry is equivalent to another, add that to the info,
13938 # using the first such table we encountered
13939 if ($entry_for_first_table) {
13941 push @info, "(= $entry_for_first_table)";
13944 push @info, $entry_for_first_table;
13948 # If the name is a large integer, add an equivalent with an
13949 # exponent for better readability
13950 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
13951 push @info, sprintf "(= %.1e)", $name
13954 my $parenthesized = "";
13955 if (! $entry_for_first_alias) {
13957 # This is the first alias for the current table. The alias
13958 # array is ordered so that this is the fullest, most
13959 # descriptive alias, so it gets the fullest info. The other
13960 # aliases are mostly merely pointers to this one, using the
13961 # information already added above.
13963 # Display any status message, but only on the parent table
13964 if ($status && ! $entry_for_first_table) {
13965 push @info, $status_info;
13968 # Put out any descriptive info
13969 if (@table_description || @table_note) {
13970 push @info, join "; ", @table_description, @table_note;
13973 # Look to see if there is a shorter name we can point people
13975 my $standard_name = standardize($name);
13977 my $proposed_short = $table->short_name;
13978 if (defined $proposed_short) {
13979 my $standard_short = standardize($proposed_short);
13981 # If the short name is shorter than the standard one, or
13982 # even it it's not, but the combination of it and its
13983 # short property name (as in \p{prop=short} ($perl doesn't
13984 # have this form)) saves at least two characters, then,
13985 # cause it to be listed as a shorter synonym.
13986 if (length $standard_short < length $standard_name
13987 || ($table_property != $perl
13988 && (length($standard_short)
13989 - length($standard_name)
13990 + $table_property_short_delta) # (<= 0)
13993 $short_name = $proposed_short;
13994 if ($table_property != $perl) {
13995 $short_name = $table_property_short_name
13998 $short_name = "\\p{$short_name}";
14002 # And if this is a compound form name, see if there is a
14003 # single form equivalent
14005 if ($table_property != $perl) {
14007 # Special case the binary N tables, so that will print
14008 # \P{single}, but use the Y table values to populate
14009 # 'single', as we haven't likewise populated the N table.
14010 # For forced binary tables, we can't just look at the N
14011 # table, but must see if this table is equivalent to the N
14012 # one, as there are two equivalent beasts in these
14016 if ( ($type == $BINARY
14017 && $input_table == $property->table('No'))
14018 || ($type == $FORCED_BINARY
14019 && $property->table('No')->
14020 is_set_equivalent_to($input_table)))
14022 $test_table = $property->table('Yes');
14026 $test_table = $input_table;
14030 # Look for a single form amongst all the children.
14031 foreach my $table ($test_table->children) {
14032 next if $table->property != $perl;
14033 my $proposed_name = $table->short_name;
14034 next if ! defined $proposed_name;
14036 # Don't mention internal-only properties as a possible
14037 # single form synonym
14038 next if substr($proposed_name, 0, 1) eq '_';
14040 $proposed_name = "\\$p\{$proposed_name}";
14041 if (! defined $single_form
14042 || length($proposed_name) < length $single_form)
14044 $single_form = $proposed_name;
14046 # The goal here is to find a single form; not the
14047 # shortest possible one. We've already found a
14048 # short name. So, stop at the first single form
14049 # found, which is likely to be closer to the
14056 # Ouput both short and single in the same parenthesized
14057 # expression, but with only one of 'Single', 'Short' if there
14059 if ($short_name || $single_form || $table->conflicting) {
14060 $parenthesized .= "Short: $short_name" if $short_name;
14061 if ($short_name && $single_form) {
14062 $parenthesized .= ', ';
14064 elsif ($single_form) {
14065 $parenthesized .= 'Single: ';
14067 $parenthesized .= $single_form if $single_form;
14071 if ($caseless_equivalent != 0) {
14072 $parenthesized .= '; ' if $parenthesized ne "";
14073 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
14077 # Warn if this property isn't the same as one that a
14078 # semi-casual user might expect. The other components of this
14079 # parenthesized structure are calculated only for the first entry
14080 # for this table, but the conflicting is deemed important enough
14081 # to go on every entry.
14082 my $conflicting = join " NOR ", $table->conflicting;
14083 if ($conflicting) {
14084 $parenthesized .= '; ' if $parenthesized ne "";
14085 $parenthesized .= "NOT $conflicting";
14088 push @info, "($parenthesized)" if $parenthesized;
14090 if ($name =~ /_$/ && $alias->loose_match) {
14091 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
14094 if ($table_property != $perl && $table->perl_extension) {
14095 push @info, '(Perl extension)';
14097 push @info, "($string_count)";
14099 # Now, we have both the entry and info so add them to the
14100 # list of all the properties.
14101 push @match_properties,
14102 format_pod_line($indent_info_column,
14106 $alias->loose_match);
14108 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
14109 } # End of looping through the aliases for this table.
14111 if (! $entry_for_first_table) {
14112 $entry_for_first_table = $entry_for_first_alias;
14114 } # End of looping through all the related tables
14118 sub make_ucd_table_pod_entries {
14121 # Generate the entries for the UCD section of the pod for $table. This
14122 # also calculates if names are ambiguous, so has to be called even if the
14123 # pod is not being output
14125 my $short_name = $table->name;
14126 my $standard_short_name = standardize($short_name);
14127 my $full_name = $table->full_name;
14128 my $standard_full_name = standardize($full_name);
14130 my $full_info = ""; # Text of info column for full-name entries
14131 my $other_info = ""; # Text of info column for short-name entries
14132 my $short_info = ""; # Text of info column for other entries
14133 my $meaning = ""; # Synonym of this table
14135 my $property = ($table->isa('Property'))
14137 : $table->parent->property;
14139 my $perl_extension = $table->perl_extension;
14141 # Get the more official name for for perl extensions that aren't
14142 # stand-alone properties
14143 if ($perl_extension && $property != $table) {
14144 if ($property == $perl ||$property->type == $BINARY) {
14145 $meaning = $table->complete_name;
14148 $meaning = $property->full_name . "=$full_name";
14152 # There are three types of info column. One for the short name, one for
14153 # the full name, and one for everything else. They mostly are the same,
14154 # so initialize in the same loop.
14155 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
14156 if ($perl_extension && $property != $table) {
14158 # Add the synonymous name for the non-full name entries; and to
14159 # the full-name entry if it adds extra information
14160 if ($info_ref == \$other_info
14161 || ($info_ref == \$short_info
14162 && $standard_short_name ne $standard_full_name)
14163 || standardize($meaning) ne $standard_full_name
14165 $$info_ref .= "$meaning.";
14168 elsif ($info_ref != \$full_info) {
14170 # Otherwise, the non-full name columns include the full name
14171 $$info_ref .= $full_name;
14174 # And the full-name entry includes the short name, if different
14175 if ($info_ref == \$full_info
14176 && $standard_short_name ne $standard_full_name)
14178 $full_info =~ s/\.\Z//;
14179 $full_info .= " " if $full_info;
14180 $full_info .= "(Short: $short_name)";
14183 if ($table->perl_extension) {
14184 $$info_ref =~ s/\.\Z//;
14185 $$info_ref .= ". " if $$info_ref;
14186 $$info_ref .= "(Perl extension)";
14190 # Add any extra annotations to the full name entry
14191 foreach my $more_info ($table->description,
14193 $table->status_info)
14195 next unless $more_info;
14196 $full_info =~ s/\.\Z//;
14197 $full_info .= ". " if $full_info;
14198 $full_info .= $more_info;
14201 # These keep track if have created full and short name pod entries for the
14204 my $done_short = 0;
14206 # Every possible name is kept track of, even those that aren't going to be
14207 # output. This way we can be sure to find the ambiguities.
14208 foreach my $alias ($table->aliases) {
14209 my $name = $alias->name;
14210 my $standard = standardize($name);
14212 my $output_this = $alias->ucd;
14214 # If the full and short names are the same, we want to output the full
14215 # one's entry, so it has priority.
14216 if ($standard eq $standard_full_name) {
14217 next if $done_full;
14219 $info = $full_info;
14221 elsif ($standard eq $standard_short_name) {
14222 next if $done_short;
14224 next if $standard_short_name eq $standard_full_name;
14225 $info = $short_info;
14228 $info = $other_info;
14231 # Here, we have set up the two columns for this entry. But if an
14232 # entry already exists for this name, we have to decide which one
14233 # we're going to later output.
14234 if (exists $ucd_pod{$standard}) {
14236 # If the two entries refer to the same property, it's not going to
14237 # be ambiguous. (Likely it's because the names when standardized
14238 # are the same.) But that means if they are different properties,
14239 # there is ambiguity.
14240 if ($ucd_pod{$standard}->{'property'} != $property) {
14242 # Here, we have an ambiguity. This code assumes that one is
14243 # scheduled to be output and one not and that one is a perl
14244 # extension (which is not to be output) and the other isn't.
14245 # If those assumptions are wrong, things have to be rethought.
14246 if ($ucd_pod{$standard}{'output_this'} == $output_this
14247 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
14248 || $output_this == $perl_extension)
14250 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
14253 # We modifiy the info column of the one being output to
14254 # indicate the ambiguity. Set $which to point to that one's
14257 if ($ucd_pod{$standard}{'output_this'}) {
14258 $which = \$ucd_pod{$standard}->{'info'};
14262 $meaning = $ucd_pod{$standard}{'meaning'};
14266 $$which =~ s/\.\Z//;
14267 $$which .= "; NOT '$standard' meaning '$meaning'";
14269 $ambiguous_names{$standard} = 1;
14272 # Use the non-perl-extension variant
14273 next unless $ucd_pod{$standard}{'perl_extension'};
14276 # Store enough information about this entry that we can later look for
14277 # ambiguities, and output it properly.
14278 $ucd_pod{$standard} = { 'name' => $name,
14280 'meaning' => $meaning,
14281 'output_this' => $output_this,
14282 'perl_extension' => $perl_extension,
14283 'property' => $property,
14284 'status' => $alias->status,
14286 } # End of looping through all this table's aliases
14291 sub pod_alphanumeric_sort {
14292 # Sort pod entries alphanumerically.
14294 # The first few character columns are filler, plus the '\p{'; and get rid
14295 # of all the trailing stuff, starting with the trailing '}', so as to sort
14296 # on just 'Name=Value'
14297 (my $a = lc $a) =~ s/^ .*? { //x;
14299 (my $b = lc $b) =~ s/^ .*? { //x;
14302 # Determine if the two operands are both internal only or both not.
14303 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
14304 # should be the underscore that begins internal only
14305 my $a_is_internal = (substr($a, 0, 1) eq '_');
14306 my $b_is_internal = (substr($b, 0, 1) eq '_');
14308 # Sort so the internals come last in the table instead of first (which the
14309 # leading underscore would otherwise indicate).
14310 if ($a_is_internal != $b_is_internal) {
14311 return 1 if $a_is_internal;
14315 # Determine if the two operands are numeric property values or not.
14316 # A numeric property will look like xyz: 3. But the number
14317 # can begin with an optional minus sign, and may have a
14318 # fraction or rational component, like xyz: 3/2. If either
14319 # isn't numeric, use alphabetic sort.
14320 my ($a_initial, $a_number) =
14321 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
14322 return $a cmp $b unless defined $a_number;
14323 my ($b_initial, $b_number) =
14324 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
14325 return $a cmp $b unless defined $b_number;
14327 # Here they are both numeric, but use alphabetic sort if the
14328 # initial parts don't match
14329 return $a cmp $b if $a_initial ne $b_initial;
14331 # Convert rationals to floating for the comparison.
14332 $a_number = eval $a_number if $a_number =~ qr{/};
14333 $b_number = eval $b_number if $b_number =~ qr{/};
14335 return $a_number <=> $b_number;
14339 # Create the .pod file. This generates the various subsections and then
14340 # combines them in one big HERE document.
14342 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
14344 return unless defined $pod_directory;
14345 print "Making pod file\n" if $verbosity >= $PROGRESS;
14347 my $exception_message =
14348 '(Any exceptions are individually noted beginning with the word NOT.)';
14350 if (-e 'Blocks.txt') {
14352 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
14353 # if the global $has_In_conflicts indicates we have them.
14354 push @match_properties, format_pod_line($indent_info_column,
14357 . (($has_In_conflicts)
14358 ? " $exception_message"
14360 @block_warning = << "END";
14362 Matches in the Block property have shortcuts that begin with "In_". For
14363 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
14364 backward compatibility, if there is no conflict with another shortcut, these
14365 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
14366 are numerous such conflicting shortcuts. Use of these forms for Block is
14367 discouraged, and are flagged as such, not only because of the potential
14368 confusion as to what is meant, but also because a later release of Unicode may
14369 preempt the shortcut, and your program would no longer be correct. Use the
14370 "In_" form instead to avoid this, or even more clearly, use the compound form,
14371 e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
14375 my $text = $Is_flags_text;
14376 $text = "$exception_message $text" if $has_Is_conflicts;
14378 # And the 'Is_ line';
14379 push @match_properties, format_pod_line($indent_info_column,
14383 # Sort the properties array for output. It is sorted alphabetically
14384 # except numerically for numeric properties, and only output unique lines.
14385 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
14387 my $formatted_properties = simple_fold(\@match_properties,
14389 # indent succeeding lines by two extra
14390 # which looks better
14391 $indent_info_column + 2,
14393 # shorten the line length by how much
14394 # the formatter indents, so the folded
14395 # line will fit in the space
14396 # presumably available
14397 $automatic_pod_indent);
14398 # Add column headings, indented to be a little more centered, but not
14400 $formatted_properties = format_pod_line($indent_info_column,
14404 . $formatted_properties;
14406 # Generate pod documentation lines for the tables that match nothing
14407 my $zero_matches = "";
14408 if (@zero_match_tables) {
14409 @zero_match_tables = uniques(@zero_match_tables);
14410 $zero_matches = join "\n\n",
14411 map { $_ = '=item \p{' . $_->complete_name . "}" }
14412 sort { $a->complete_name cmp $b->complete_name }
14413 @zero_match_tables;
14415 $zero_matches = <<END;
14417 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
14419 Unicode has some property-value pairs that currently don't match anything.
14420 This happens generally either because they are obsolete, or they exist for
14421 symmetry with other forms, but no language has yet been encoded that uses
14422 them. In this version of Unicode, the following match zero code points:
14433 # Generate list of properties that we don't accept, grouped by the reasons
14434 # why. This is so only put out the 'why' once, and then list all the
14435 # properties that have that reason under it.
14437 my %why_list; # The keys are the reasons; the values are lists of
14438 # properties that have the key as their reason
14440 # For each property, add it to the list that are suppressed for its reason
14441 # The sort will cause the alphabetically first properties to be added to
14442 # each list first, so each list will be sorted.
14443 foreach my $property (sort keys %why_suppressed) {
14444 push @{$why_list{$why_suppressed{$property}}}, $property;
14447 # For each reason (sorted by the first property that has that reason)...
14448 my @bad_re_properties;
14449 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
14452 # Add to the output, all the properties that have that reason.
14453 my $has_item = 0; # Flag if actually output anything.
14454 foreach my $name (@{$why_list{$why}}) {
14456 # Split compound names into $property and $table components
14457 my $property = $name;
14459 if ($property =~ / (.*) = (.*) /x) {
14464 # This release of Unicode may not have a property that is
14465 # suppressed, so don't reference a non-existent one.
14466 $property = property_ref($property);
14467 next if ! defined $property;
14469 # And since this list is only for match tables, don't list the
14470 # ones that don't have match tables.
14471 next if ! $property->to_create_match_tables;
14473 # Find any abbreviation, and turn it into a compound name if this
14474 # is a property=value pair.
14475 my $short_name = $property->name;
14476 $short_name .= '=' . $property->table($table)->name if $table;
14478 # Start with an empty line.
14479 push @bad_re_properties, "\n\n" unless $has_item;
14481 # And add the property as an item for the reason.
14482 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
14486 # And add the reason under the list of properties, if such a list
14487 # actually got generated. Note that the header got added
14488 # unconditionally before. But pod ignores extra blank lines, so no
14490 push @bad_re_properties, "\n$why\n" if $has_item;
14492 } # End of looping through each reason.
14494 if (! @bad_re_properties) {
14495 push @bad_re_properties,
14496 "*** This installation accepts ALL non-Unihan properties ***";
14499 # Add =over only if non-empty to avoid an empty =over/=back section,
14500 # which is considered bad form.
14501 unshift @bad_re_properties, "\n=over 4\n";
14502 push @bad_re_properties, "\n=back\n";
14505 # Similiarly, generate a list of files that we don't use, grouped by the
14506 # reasons why. First, create a hash whose keys are the reasons, and whose
14507 # values are anonymous arrays of all the files that share that reason.
14508 my %grouped_by_reason;
14509 foreach my $file (keys %ignored_files) {
14510 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
14512 foreach my $file (keys %skipped_files) {
14513 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
14516 # Then, sort each group.
14517 foreach my $group (keys %grouped_by_reason) {
14518 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
14519 @{$grouped_by_reason{$group}} ;
14522 # Finally, create the output text. For each reason (sorted by the
14523 # alphabetically first file that has that reason)...
14525 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
14526 cmp lc $grouped_by_reason{$b}->[0]
14528 keys %grouped_by_reason)
14530 # Add all the files that have that reason to the output. Start
14531 # with an empty line.
14532 push @unused_files, "\n\n";
14533 push @unused_files, map { "\n=item F<$_> \n" }
14534 @{$grouped_by_reason{$reason}};
14535 # And add the reason under the list of files
14536 push @unused_files, "\n$reason\n";
14539 # Similarly, create the output text for the UCD section of the pod
14541 foreach my $key (keys %ucd_pod) {
14542 next unless $ucd_pod{$key}->{'output_this'};
14543 push @ucd_pod, format_pod_line($indent_info_column,
14544 $ucd_pod{$key}->{'name'},
14545 $ucd_pod{$key}->{'info'},
14546 $ucd_pod{$key}->{'status'},
14550 # Sort alphabetically, and fold for output
14551 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
14552 my $ucd_pod = simple_fold(\@ucd_pod,
14554 $indent_info_column,
14555 $automatic_pod_indent);
14556 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
14561 # Everything is ready to assemble.
14562 my @OUT = << "END";
14567 To change this file, edit $0 instead.
14573 $pod_file - Index of Unicode Version $string_version character properties in Perl
14577 This document provides information about the portion of the Unicode database
14578 that deals with character properties, that is the portion that is defined on
14579 single code points. (L</Other information in the Unicode data base>
14580 below briefly mentions other data that Unicode provides.)
14582 Perl can provide access to all non-provisional Unicode character properties,
14583 though not all are enabled by default. The omitted ones are the Unihan
14584 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
14585 deprecated or Unicode-internal properties. (An installation may choose to
14586 recompile Perl's tables to change this. See L<Unicode character
14587 properties that are NOT accepted by Perl>.)
14589 For most purposes, access to Unicode properties from the Perl core is through
14590 regular expression matches, as described in the next section.
14591 For some special purposes, and to access the properties that are not suitable
14592 for regular expression matching, all the Unicode character properties that
14593 Perl handles are accessible via the standard L<Unicode::UCD> module, as
14594 described in the section L</Properties accessible through Unicode::UCD>.
14596 Perl also provides some additional extensions and short-cut synonyms
14597 for Unicode properties.
14599 This document merely lists all available properties and does not attempt to
14600 explain what each property really means. There is a brief description of each
14601 Perl extension; see L<perlunicode/Other Properties> for more information on
14602 these. There is some detail about Blocks, Scripts, General_Category,
14603 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
14604 official Unicode properties, refer to the Unicode standard. A good starting
14605 place is L<$unicode_reference_url>.
14607 Note that you can define your own properties; see
14608 L<perlunicode/"User-Defined Character Properties">.
14610 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
14612 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
14613 most of the Unicode character properties. The table below shows all these
14614 constructs, both single and compound forms.
14616 B<Compound forms> consist of two components, separated by an equals sign or a
14617 colon. The first component is the property name, and the second component is
14618 the particular value of the property to match against, for example,
14619 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
14620 whose Script property is Greek.
14622 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
14623 their equivalent compound forms. The table shows these equivalences. (In our
14624 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
14625 There are also a few Perl-defined single forms that are not shortcuts for a
14626 compound form. One such is C<\\p{Word}>. These are also listed in the table.
14628 In parsing these constructs, Perl always ignores Upper/lower case differences
14629 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
14630 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
14631 the left brace completely changes the meaning of the construct, from "match"
14632 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
14633 for improved legibility.
14635 Also, white space, hyphens, and underscores are also normally ignored
14636 everywhere between the {braces}, and hence can be freely added or removed
14637 even if the C</x> modifier hasn't been specified on the regular expression.
14638 But $a_bold_stricter at the beginning of an entry in the table below
14639 means that tighter (stricter) rules are used for that entry:
14643 =item Single form (C<\\p{name}>) tighter rules:
14645 White space, hyphens, and underscores ARE significant
14650 =item * white space adjacent to a non-word character
14652 =item * underscores separating digits in numbers
14656 That means, for example, that you can freely add or remove white space
14657 adjacent to (but within) the braces without affecting the meaning.
14659 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
14661 The tighter rules given above for the single form apply to everything to the
14662 right of the colon or equals; the looser rules still apply to everything to
14665 That means, for example, that you can freely add or remove white space
14666 adjacent to (but within) the braces and the colon or equal sign.
14670 Some properties are considered obsolete by Unicode, but still available.
14671 There are several varieties of obsolescence:
14677 A property may be stabilized. Such a determination does not indicate
14678 that the property should or should not be used; instead it is a declaration
14679 that the property will not be maintained nor extended for newly encoded
14680 characters. Such properties are marked with $a_bold_stabilized in the
14685 A property may be deprecated, perhaps because its original intent
14686 has been replaced by another property, or because its specification was
14687 somehow defective. This means that its use is strongly
14688 discouraged, so much so that a warning will be issued if used, unless the
14689 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
14690 statement. $A_bold_deprecated flags each such entry in the table, and
14691 the entry there for the longest, most descriptive version of the property will
14692 give the reason it is deprecated, and perhaps advice. Perl may issue such a
14693 warning, even for properties that aren't officially deprecated by Unicode,
14694 when there used to be characters or code points that were matched by them, but
14695 no longer. This is to warn you that your program may not work like it did on
14696 earlier Unicode releases.
14698 A deprecated property may be made unavailable in a future Perl version, so it
14699 is best to move away from them.
14701 A deprecated property may also be stabilized, but this fact is not shown.
14705 Properties marked with $a_bold_obsolete in the table are considered (plain)
14706 obsolete. Generally this designation is given to properties that Unicode once
14707 used for internal purposes (but not any longer).
14711 Some Perl extensions are present for backwards compatibility and are
14712 discouraged from being used, but are not obsolete. $A_bold_discouraged
14713 flags each such entry in the table. Future Unicode versions may force
14714 some of these extensions to be removed without warning, replaced by another
14715 property with the same name that means something different. Use the
14716 equivalent shown instead.
14720 The table below has two columns. The left column contains the C<\\p{}>
14721 constructs to look up, possibly preceded by the flags mentioned above; and
14722 the right column contains information about them, like a description, or
14723 synonyms. It shows both the single and compound forms for each property that
14724 has them. If the left column is a short name for a property, the right column
14725 will give its longer, more descriptive name; and if the left column is the
14726 longest name, the right column will show any equivalent shortest name, in both
14727 single and compound forms if applicable.
14729 The right column will also caution you if a property means something different
14730 than what might normally be expected.
14732 All single forms are Perl extensions; a few compound forms are as well, and
14735 Numbers in (parentheses) indicate the total number of code points matched by
14736 the property. For emphasis, those properties that match no code points at all
14737 are listed as well in a separate section following the table.
14739 Most properties match the same code points regardless of whether C<"/i">
14740 case-insensitive matching is specified or not. But a few properties are
14741 affected. These are shown with the notation
14743 (/i= other_property)
14745 in the second column. Under case-insensitive matching they match the
14746 same code pode points as the property "other_property".
14748 There is no description given for most non-Perl defined properties (See
14749 L<$unicode_reference_url> for that).
14751 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
14752 combinations. For example, entries like:
14754 \\p{Gc: *} \\p{General_Category: *}
14756 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
14757 for the latter is also valid for the former. Similarly,
14761 means that if and only if, for example, C<\\p{Foo}> exists, then
14762 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
14763 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
14764 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
14767 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
14768 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
14769 'N*' to indicate this, and doesn't have separate entries for the other
14770 possibilities. Note that not all properties which have values 'Yes' and 'No'
14771 are binary, and they have all their values spelled out without using this wild
14772 card, and a C<NOT> clause in their description that highlights their not being
14773 binary. These also require the compound form to match them, whereas true
14774 binary properties have both single and compound forms available.
14776 Note that all non-essential underscores are removed in the display of the
14783 =item Z<>B<*> is a wild-card
14785 =item B<(\\d+)> in the info column gives the number of code points matched by
14788 =item B<$DEPRECATED> means this is deprecated.
14790 =item B<$OBSOLETE> means this is obsolete.
14792 =item B<$STABILIZED> means this is stabilized.
14794 =item B<$STRICTER> means tighter (stricter) name matching applies.
14796 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
14801 $formatted_properties
14805 =head1 Properties accessible through Unicode::UCD
14807 All the Unicode character properties mentioned above (except for those marked
14808 as for internal use by Perl) are also accessible by
14809 L<Unicode::UCD/prop_invlist()>.
14811 Due to their nature, not all Unicode character properties are suitable for
14812 regular expression matches, nor C<prop_invlist()>. The remaining
14813 non-provisional, non-internal ones are accessible via
14814 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
14815 hasn't included; see L<below for which those are|/Unicode character properties
14816 that are NOT accepted by Perl>).
14818 For compatibility with other parts of Perl, all the single forms given in the
14819 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
14820 are recognized. BUT, there are some ambiguities between some Perl extensions
14821 and the Unicode properties, all of which are silently resolved in favor of the
14822 official Unicode property. To avoid surprises, you should only use
14823 C<prop_invmap()> for forms listed in the table below, which omits the
14824 non-recommended ones. The affected forms are the Perl single form equivalents
14825 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
14826 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
14827 whose short name is C<sc>. The table indicates the current ambiguities in the
14828 INFO column, beginning with the word C<"NOT">.
14830 The standard Unicode properties listed below are documented in
14831 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
14832 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
14833 L<perlunicode/Other Properties>;
14835 The first column in the table is a name for the property; the second column is
14836 an alternative name, if any, plus possibly some annotations. The alternative
14837 name is the property's full name, unless that would simply repeat the first
14838 column, in which case the second column indicates the property's short name
14839 (if different). The annotations are given only in the entry for the full
14840 name. If a property is obsolete, etc, the entry will be flagged with the same
14841 characters used in the table in the L<section above|/Properties accessible
14842 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
14846 =head1 Properties accessible through other means
14848 Certain properties are accessible also via core function calls. These are:
14850 Lowercase_Mapping lc() and lcfirst()
14851 Titlecase_Mapping ucfirst()
14852 Uppercase_Mapping uc()
14854 Also, Case_Folding is accessible through the C</i> modifier in regular
14855 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
14858 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
14859 interpolation in double-quoted strings and regular expressions; and functions
14860 C<charnames::viacode()>, C<charnames::vianame()>, and
14861 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
14864 Finally, most properties related to decomposition are accessible via
14865 L<Unicode::Normalize>.
14867 =head1 Unicode character properties that are NOT accepted by Perl
14869 Perl will generate an error for a few character properties in Unicode when
14870 used in a regular expression. The non-Unihan ones are listed below, with the
14871 reasons they are not accepted, perhaps with work-arounds. The short names for
14872 the properties are listed enclosed in (parentheses).
14873 As described after the list, an installation can change the defaults and choose
14874 to accept any of these. The list is machine generated based on the
14875 choices made for the installation that generated this document.
14879 An installation can choose to allow any of these to be matched by downloading
14880 the Unicode database from L<http://www.unicode.org/Public/> to
14881 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
14882 controlling lists contained in the program
14883 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
14884 (C<\%Config> is available from the Config module).
14886 =head1 Other information in the Unicode data base
14888 The Unicode data base is delivered in two different formats. The XML version
14889 is valid for more modern Unicode releases. The other version is a collection
14890 of files. The two are intended to give equivalent information. Perl uses the
14891 older form; this allows you to recompile Perl to use early Unicode releases.
14893 The only non-character property that Perl currently supports is Named
14894 Sequences, in which a sequence of code points
14895 is given a name and generally treated as a single entity. (Perl supports
14896 these via the C<\\N{...}> double-quotish construct,
14897 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
14899 Below is a list of the files in the Unicode data base that Perl doesn't
14900 currently use, along with very brief descriptions of their purposes.
14901 Some of the names of the files have been shortened from those that Unicode
14902 uses, in order to allow them to be distinguishable from similarly named files
14903 on file systems for which only the first 8 characters of a name are
14914 L<$unicode_reference_url>
14922 # And write it. The 0 means no utf8.
14923 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
14927 sub make_Heavy () {
14928 # Create and write Heavy.pl, which passes info about the tables to
14931 # Stringify structures for output
14932 my $loose_property_name_of
14933 = simple_dumper(\%loose_property_name_of, ' ' x 4);
14934 chomp $loose_property_name_of;
14936 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
14937 chomp $stricter_to_file_of;
14939 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
14940 chomp $loose_to_file_of;
14942 my $nv_floating_to_rational
14943 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
14944 chomp $nv_floating_to_rational;
14946 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
14947 chomp $why_deprecated;
14949 # We set the key to the file when we associated files with tables, but we
14950 # couldn't do the same for the value then, as we might not have the file
14951 # for the alternate table figured out at that time.
14952 foreach my $cased (keys %caseless_equivalent_to) {
14953 my @path = $caseless_equivalent_to{$cased}->file_path;
14954 my $path = join '/', @path[1, -1];
14955 $caseless_equivalent_to{$cased} = $path;
14957 my $caseless_equivalent_to
14958 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
14959 chomp $caseless_equivalent_to;
14961 my $loose_property_to_file_of
14962 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
14963 chomp $loose_property_to_file_of;
14965 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
14966 chomp $file_to_swash_name;
14970 $INTERNAL_ONLY_HEADER
14972 # This file is for the use of utf8_heavy.pl and Unicode::UCD
14974 # Maps Unicode (not Perl single-form extensions) property names in loose
14975 # standard form to their corresponding standard names
14976 \%utf8::loose_property_name_of = (
14977 $loose_property_name_of
14980 # Maps property, table to file for those using stricter matching
14981 \%utf8::stricter_to_file_of = (
14982 $stricter_to_file_of
14985 # Maps property, table to file for those using loose matching
14986 \%utf8::loose_to_file_of = (
14990 # Maps floating point to fractional form
14991 \%utf8::nv_floating_to_rational = (
14992 $nv_floating_to_rational
14995 # If a floating point number doesn't have enough digits in it to get this
14996 # close to a fraction, it isn't considered to be that fraction even if all the
14997 # digits it does have match.
14998 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
15000 # Deprecated tables to generate a warning for. The key is the file containing
15001 # the table, so as to avoid duplication, as many property names can map to the
15002 # file, but we only need one entry for all of them.
15003 \%utf8::why_deprecated = (
15007 # A few properties have different behavior under /i matching. This maps
15008 # those to substitute files to use under /i.
15009 \%utf8::caseless_equivalent = (
15010 $caseless_equivalent_to
15013 # Property names to mapping files
15014 \%utf8::loose_property_to_file_of = (
15015 $loose_property_to_file_of
15018 # Files to the swash names within them.
15019 \%utf8::file_to_swash_name = (
15020 $file_to_swash_name
15026 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
15030 sub make_Name_pm () {
15031 # Create and write Name.pm, which contains subroutines and data to use in
15032 # conjunction with Name.pl
15034 # Maybe there's nothing to do.
15035 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
15039 $INTERNAL_ONLY_HEADER
15042 # Convert these structures to output format.
15043 my $code_points_ending_in_code_point =
15044 main::simple_dumper(\@code_points_ending_in_code_point,
15046 my $names = main::simple_dumper(\%names_ending_in_code_point,
15048 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
15051 # Do the same with the Hangul names,
15057 if ($has_hangul_syllables) {
15059 # Construct a regular expression of all the possible
15060 # combinations of the Hangul syllables.
15061 my @L_re; # Leading consonants
15062 for my $i ($LBase .. $LBase + $LCount - 1) {
15063 push @L_re, $Jamo{$i}
15065 my @V_re; # Middle vowels
15066 for my $i ($VBase .. $VBase + $VCount - 1) {
15067 push @V_re, $Jamo{$i}
15069 my @T_re; # Trailing consonants
15070 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
15071 push @T_re, $Jamo{$i}
15074 # The whole re is made up of the L V T combination.
15076 . join ('|', sort @L_re)
15078 . join ('|', sort @V_re)
15080 . join ('|', sort @T_re)
15083 # These hashes needed by the algorithm were generated
15084 # during reading of the Jamo.txt file
15085 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
15086 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
15087 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
15088 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
15095 # This module contains machine-generated tables and code for the
15096 # algorithmically-determinable Unicode character names. The following
15097 # routines can be used to translate between name and code point and vice versa
15101 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
15102 # two must be 10; if there are 5, the first must not be a 0. Written this
15103 # way to decrease backtracking. The first regex allows the code point to
15104 # be at the end of a word, but to work properly, the word shouldn't end
15105 # with a valid hex character. The second one won't match a code point at
15106 # the end of a word, and doesn't have the run-on issue
15107 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
15108 my \$code_point_re = qr/$code_point_re/;
15110 # In the following hash, the keys are the bases of names which includes
15111 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
15112 # of each key is another hash which is used to get the low and high ends
15113 # for each range of code points that apply to the name.
15114 my %names_ending_in_code_point = (
15118 # The following hash is a copy of the previous one, except is for loose
15119 # matching, so each name has blanks and dashes squeezed out
15120 my %loose_names_ending_in_code_point = (
15124 # And the following array gives the inverse mapping from code points to
15125 # names. Lowest code points are first
15126 my \@code_points_ending_in_code_point = (
15127 $code_points_ending_in_code_point
15130 # Earlier releases didn't have Jamos. No sense outputting
15131 # them unless will be used.
15132 if ($has_hangul_syllables) {
15135 # Convert from code point to Jamo short name for use in composing Hangul
15141 # Leading consonant (can be null)
15151 # Optional trailing consonant
15156 # Computed re that splits up a Hangul name into LVT or LV syllables
15157 my \$syllable_re = qr/$jamo_re/;
15159 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
15160 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
15162 # These constants names and values were taken from the Unicode standard,
15163 # version 5.1, section 3.12. They are used in conjunction with Hangul
15165 my \$SBase = $SBase_string;
15166 my \$LBase = $LBase_string;
15167 my \$VBase = $VBase_string;
15168 my \$TBase = $TBase_string;
15169 my \$SCount = $SCount;
15170 my \$LCount = $LCount;
15171 my \$VCount = $VCount;
15172 my \$TCount = $TCount;
15173 my \$NCount = \$VCount * \$TCount;
15175 } # End of has Jamos
15177 push @name, << 'END';
15179 sub name_to_code_point_special {
15180 my ($name, $loose) = @_;
15182 # Returns undef if not one of the specially handled names; otherwise
15183 # returns the code point equivalent to the input name
15184 # $loose is non-zero if to use loose matching, 'name' in that case
15185 # must be input as upper case with all blanks and dashes squeezed out.
15187 if ($has_hangul_syllables) {
15188 push @name, << 'END';
15190 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
15191 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
15193 return if $name !~ qr/^$syllable_re$/;
15194 my $L = $Jamo_L{$1};
15195 my $V = $Jamo_V{$2};
15196 my $T = (defined $3) ? $Jamo_T{$3} : 0;
15197 return ($L * $VCount + $V) * $TCount + $T + $SBase;
15201 push @name, << 'END';
15203 # Name must end in 'code_point' for this to handle.
15204 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
15205 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
15208 my $code_point = CORE::hex $2;
15212 $names_ref = \%loose_names_ending_in_code_point;
15215 return if $base !~ s/-$//;
15216 $names_ref = \%names_ending_in_code_point;
15219 # Name must be one of the ones which has the code point in it.
15220 return if ! $names_ref->{$base};
15222 # Look through the list of ranges that apply to this name to see if
15223 # the code point is in one of them.
15224 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
15225 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
15226 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
15228 # Here, the code point is in the range.
15229 return $code_point;
15232 # Here, looked like the name had a code point number in it, but
15233 # did not match one of the valid ones.
15237 sub code_point_to_name_special {
15238 my $code_point = shift;
15240 # Returns the name of a code point if algorithmically determinable;
15243 if ($has_hangul_syllables) {
15244 push @name, << 'END';
15246 # If in the Hangul range, calculate the name based on Unicode's
15248 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
15250 my $SIndex = $code_point - $SBase;
15251 my $L = $LBase + $SIndex / $NCount;
15252 my $V = $VBase + ($SIndex % $NCount) / $TCount;
15253 my $T = $TBase + $SIndex % $TCount;
15254 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
15255 $name .= $Jamo{$T} if $T != $TBase;
15260 push @name, << 'END';
15262 # Look through list of these code points for one in range.
15263 foreach my $hash (@code_points_ending_in_code_point) {
15264 return if $code_point < $hash->{'low'};
15265 if ($code_point <= $hash->{'high'}) {
15266 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
15269 return; # None found
15276 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
15281 # Create and write UCD.pl, which passes info about the tables to
15284 # Create a mapping from each alias of Perl single-form extensions to all
15285 # its equivalent aliases, for quick look-up.
15286 my %perlprop_to_aliases;
15287 foreach my $table ($perl->tables) {
15289 # First create the list of the aliases of each extension
15290 my @aliases_list; # List of legal aliases for this extension
15292 my $table_name = $table->name;
15293 my $standard_table_name = standardize($table_name);
15294 my $table_full_name = $table->full_name;
15295 my $standard_table_full_name = standardize($table_full_name);
15297 # Make sure that the list has both the short and full names
15298 push @aliases_list, $table_name, $table_full_name;
15300 my $found_ucd = 0; # ? Did we actually get an alias that should be
15301 # output for this table
15303 # Go through all the aliases (including the two just added), and add
15304 # any new unique ones to the list
15305 foreach my $alias ($table->aliases) {
15307 # Skip non-legal names
15308 next unless $alias->ok_as_filename;
15309 next unless $alias->ucd;
15311 $found_ucd = 1; # have at least one legal name
15313 my $name = $alias->name;
15314 my $standard = standardize($name);
15316 # Don't repeat a name that is equivalent to one already on the
15318 next if $standard eq $standard_table_name;
15319 next if $standard eq $standard_table_full_name;
15321 push @aliases_list, $name;
15324 # If there were no legal names, don't output anything.
15325 next unless $found_ucd;
15327 # To conserve memory in the program reading these in, omit full names
15328 # that are identical to the short name, when those are the only two
15329 # aliases for the property.
15330 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
15334 # Here, @aliases_list is the list of all the aliases that this
15335 # extension legally has. Now can create a map to it from each legal
15336 # standardized alias
15337 foreach my $alias ($table->aliases) {
15338 next unless $alias->ucd;
15339 next unless $alias->ok_as_filename;
15340 push @{$perlprop_to_aliases{standardize($alias->name)}},
15345 # Make a list of all combinations of properties/values that are suppressed.
15347 if (! $debug_skip) { # This tends to fail in this debug mode
15348 foreach my $property_name (keys %why_suppressed) {
15351 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
15353 # The hash may contain properties not in this release of Unicode
15354 next unless defined (my $property = property_ref($property_name));
15356 # Find all combinations
15357 foreach my $prop_alias ($property->aliases) {
15358 my $prop_alias_name = standardize($prop_alias->name);
15360 # If no =value, there's just one combination possibe for this
15361 if (! $value_name) {
15363 # The property may be suppressed, but there may be a proxy
15364 # for it, so it shouldn't be listed as suppressed
15365 next if $prop_alias->ucd;
15366 push @suppressed, $prop_alias_name;
15369 foreach my $value_alias
15370 ($property->table($value_name)->aliases)
15372 next if $value_alias->ucd;
15374 push @suppressed, "$prop_alias_name="
15375 . standardize($value_alias->name);
15382 # Convert the structure below (designed for Name.pm) to a form that UCD
15383 # wants, so it doesn't have to modify it at all; i.e. so that it includes
15384 # an element for the Hangul syllables in the appropriate place, and
15385 # otherwise changes the name to include the "-<code point>" suffix.
15386 my @algorithm_names;
15387 my $done_hangul = 0;
15389 # Copy it linearly.
15390 for my $i (0 .. @code_points_ending_in_code_point - 1) {
15392 # Insert the hanguls in the correct place.
15394 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
15397 push @algorithm_names, { low => $SBase,
15398 high => $SBase + $SCount - 1,
15399 name => '<hangul syllable>',
15403 # Copy the current entry, modified.
15404 push @algorithm_names, {
15405 low => $code_points_ending_in_code_point[$i]->{'low'},
15406 high => $code_points_ending_in_code_point[$i]->{'high'},
15408 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
15412 # Serialize these structures for output.
15413 my $loose_to_standard_value
15414 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
15415 chomp $loose_to_standard_value;
15417 my $string_property_loose_to_name
15418 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
15419 chomp $string_property_loose_to_name;
15421 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
15422 chomp $perlprop_to_aliases;
15424 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
15425 chomp $prop_aliases;
15427 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
15428 chomp $prop_value_aliases;
15430 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
15433 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
15434 chomp $algorithm_names;
15436 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
15437 chomp $ambiguous_names;
15439 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
15440 chomp $loose_defaults;
15444 $INTERNAL_ONLY_HEADER
15446 # This file is for the use of Unicode::UCD
15448 # Highest legal Unicode code point
15449 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
15452 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
15453 \$Unicode::UCD::HANGUL_COUNT = $SCount;
15455 # Keys are all the possible "prop=value" combinations, in loose form; values
15456 # are the standard loose name for the 'value' part of the key
15457 \%Unicode::UCD::loose_to_standard_value = (
15458 $loose_to_standard_value
15461 # String property loose names to standard loose name
15462 \%Unicode::UCD::string_property_loose_to_name = (
15463 $string_property_loose_to_name
15466 # Keys are Perl extensions in loose form; values are each one's list of
15468 \%Unicode::UCD::loose_perlprop_to_name = (
15469 $perlprop_to_aliases
15472 # Keys are standard property name; values are each one's aliases
15473 \%Unicode::UCD::prop_aliases = (
15477 # Keys of top level are standard property name; values are keys to another
15478 # hash, Each one is one of the property's values, in standard form. The
15479 # values are that prop-val's aliases. If only one specified, the short and
15480 # long alias are identical.
15481 \%Unicode::UCD::prop_value_aliases = (
15482 $prop_value_aliases
15485 # Ordered (by code point ordinal) list of the ranges of code points whose
15486 # names are algorithmically determined. Each range entry is an anonymous hash
15487 # of the start and end points and a template for the names within it.
15488 \@Unicode::UCD::algorithmic_named_code_points = (
15492 # The properties that as-is have two meanings, and which must be disambiguated
15493 \%Unicode::UCD::ambiguous_names = (
15497 # Keys are the prop-val combinations which are the default values for the
15498 # given property, expressed in standard loose form
15499 \%Unicode::UCD::loose_defaults = (
15503 # All combinations of names that are suppressed.
15504 # This is actually for UCD.t, so it knows which properties shouldn't have
15505 # entries. If it got any bigger, would probably want to put it in its own
15506 # file to use memory only when it was needed, in testing.
15507 \@Unicode::UCD::suppressed_properties = (
15514 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
15518 sub write_all_tables() {
15519 # Write out all the tables generated by this program to files, as well as
15520 # the supporting data structures, pod file, and .t file.
15522 my @writables; # List of tables that actually get written
15523 my %match_tables_to_write; # Used to collapse identical match tables
15524 # into one file. Each key is a hash function
15525 # result to partition tables into buckets.
15526 # Each value is an array of the tables that
15527 # fit in the bucket.
15529 # For each property ...
15530 # (sort so that if there is an immutable file name, it has precedence, so
15531 # some other property can't come in and take over its file name. If b's
15532 # file name is defined, will return 1, meaning to take it first; don't
15533 # care if both defined, as they had better be different anyway. And the
15534 # property named 'Perl' needs to be first (it doesn't have any immutable
15535 # file name) because empty properties are defined in terms of it's table
15538 foreach my $property (sort { return -1 if $a == $perl;
15539 return 1 if $b == $perl;
15540 return defined $b->file
15541 } property_ref('*'))
15543 my $type = $property->type;
15545 # And for each table for that property, starting with the mapping
15548 foreach my $table($property,
15550 # and all the match tables for it (if any), sorted so
15551 # the ones with the shortest associated file name come
15552 # first. The length sorting prevents problems of a
15553 # longer file taking a name that might have to be used
15554 # by a shorter one. The alphabetic sorting prevents
15555 # differences between releases
15556 sort { my $ext_a = $a->external_name;
15557 return 1 if ! defined $ext_a;
15558 my $ext_b = $b->external_name;
15559 return -1 if ! defined $ext_b;
15561 # But return the non-complement table before
15562 # the complement one, as the latter is defined
15563 # in terms of the former, and needs to have
15564 # the information for the former available.
15565 return 1 if $a->complement != 0;
15566 return -1 if $b->complement != 0;
15568 # Similarly, return a subservient table after
15570 return 1 if $a->leader != $a;
15571 return -1 if $b->leader != $b;
15573 my $cmp = length $ext_a <=> length $ext_b;
15575 # Return result if lengths not equal
15576 return $cmp if $cmp;
15578 # Alphabetic if lengths equal
15579 return $ext_a cmp $ext_b
15580 } $property->tables
15584 # Here we have a table associated with a property. It could be
15585 # the map table (done first for each property), or one of the
15586 # other tables. Determine which type.
15587 my $is_property = $table->isa('Property');
15589 my $name = $table->name;
15590 my $complete_name = $table->complete_name;
15592 # See if should suppress the table if is empty, but warn if it
15593 # contains something.
15594 my $suppress_if_empty_warn_if_not
15595 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
15597 # Calculate if this table should have any code points associated
15599 my $expected_empty =
15601 # $perl should be empty, as well as properties that we just
15602 # don't do anything with
15604 && ($table == $perl
15605 || grep { $complete_name eq $_ }
15606 @unimplemented_properties
15610 # Match tables in properties we skipped populating should be
15612 || (! $is_property && ! $property->to_create_match_tables)
15614 # Tables and properties that are expected to have no code
15615 # points should be empty
15616 || $suppress_if_empty_warn_if_not
15619 # Set a boolean if this table is the complement of an empty binary
15621 my $is_complement_of_empty_binary =
15622 $type == $BINARY &&
15623 (($table == $property->table('Y')
15624 && $property->table('N')->is_empty)
15625 || ($table == $property->table('N')
15626 && $property->table('Y')->is_empty));
15628 if ($table->is_empty) {
15630 if ($suppress_if_empty_warn_if_not) {
15631 $table->set_fate($SUPPRESSED,
15632 $suppress_if_empty_warn_if_not);
15635 # Suppress (by skipping them) expected empty tables.
15636 next TABLE if $expected_empty;
15638 # And setup to later output a warning for those that aren't
15639 # known to be allowed to be empty. Don't do the warning if
15640 # this table is a child of another one to avoid duplicating
15641 # the warning that should come from the parent one.
15642 if (($table == $property || $table->parent == $table)
15643 && $table->fate != $SUPPRESSED
15644 && $table->fate != $MAP_PROXIED
15645 && ! grep { $complete_name =~ /^$_$/ }
15646 @tables_that_may_be_empty)
15648 push @unhandled_properties, "$table";
15651 # An empty table is just the complement of everything.
15652 $table->set_complement($Any) if $table != $property;
15654 elsif ($expected_empty) {
15656 if ($suppress_if_empty_warn_if_not) {
15657 $because = " because $suppress_if_empty_warn_if_not";
15660 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
15663 # Some tables should match everything
15664 my $expected_full =
15665 ($table->fate == $SUPPRESSED)
15668 ? # All these types of map tables will be full because
15669 # they will have been populated with defaults
15670 ($type == $ENUM || $type == $FORCED_BINARY)
15672 : # A match table should match everything if its method
15674 ($table->matches_all
15676 # The complement of an empty binary table will match
15678 || $is_complement_of_empty_binary
15682 my $count = $table->count;
15683 if ($expected_full) {
15684 if ($count != $MAX_UNICODE_CODEPOINTS) {
15685 Carp::my_carp("$table matches only "
15686 . clarify_number($count)
15687 . " Unicode code points but should match "
15688 . clarify_number($MAX_UNICODE_CODEPOINTS)
15690 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
15691 . "). Proceeding anyway.");
15694 # Here is expected to be full. If it is because it is the
15695 # complement of an (empty) binary table that is to be
15696 # suppressed, then suppress this one as well.
15697 if ($is_complement_of_empty_binary) {
15698 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
15699 my $opposing = $property->table($opposing_name);
15700 my $opposing_status = $opposing->status;
15701 if ($opposing_status) {
15702 $table->set_status($opposing_status,
15703 $opposing->status_info);
15707 elsif ($count == $MAX_UNICODE_CODEPOINTS
15708 && ($table == $property || $table->leader == $table)
15709 && $table->property->status != $PLACEHOLDER)
15711 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
15714 if ($table->fate == $SUPPRESSED) {
15715 if (! $is_property) {
15716 my @children = $table->children;
15717 foreach my $child (@children) {
15718 if ($child->fate != $SUPPRESSED) {
15719 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
15727 if (! $is_property) {
15729 make_ucd_table_pod_entries($table) if $table->property == $perl;
15731 # Several things need to be done just once for each related
15732 # group of match tables. Do them on the parent.
15733 if ($table->parent == $table) {
15735 # Add an entry in the pod file for the table; it also does
15737 make_re_pod_entries($table) if defined $pod_directory;
15739 # See if the the table matches identical code points with
15740 # something that has already been output. In that case,
15741 # no need to have two files with the same code points in
15742 # them. We use the table's hash() method to store these
15743 # in buckets, so that it is quite likely that if two
15744 # tables are in the same bucket they will be identical, so
15745 # don't have to compare tables frequently. The tables
15746 # have to have the same status to share a file, so add
15747 # this to the bucket hash. (The reason for this latter is
15748 # that Heavy.pl associates a status with a file.)
15749 # We don't check tables that are inverses of others, as it
15750 # would lead to some coding complications, and checking
15751 # all the regular ones should find everything.
15752 if ($table->complement == 0) {
15753 my $hash = $table->hash . ';' . $table->status;
15755 # Look at each table that is in the same bucket as
15756 # this one would be.
15757 foreach my $comparison
15758 (@{$match_tables_to_write{$hash}})
15760 if ($table->matches_identically_to($comparison)) {
15761 $table->set_equivalent_to($comparison,
15767 # Here, not equivalent, add this table to the bucket.
15768 push @{$match_tables_to_write{$hash}}, $table;
15774 # Here is the property itself.
15775 # Don't write out or make references to the $perl property
15776 next if $table == $perl;
15778 make_ucd_table_pod_entries($table);
15780 # There is a mapping stored of the various synonyms to the
15781 # standardized name of the property for utf8_heavy.pl.
15782 # Also, the pod file contains entries of the form:
15783 # \p{alias: *} \p{full: *}
15784 # rather than show every possible combination of things.
15786 my @property_aliases = $property->aliases;
15788 my $full_property_name = $property->full_name;
15789 my $property_name = $property->name;
15790 my $standard_property_name = standardize($property_name);
15791 my $standard_property_full_name
15792 = standardize($full_property_name);
15794 # We also create for Unicode::UCD a list of aliases for
15795 # the property. The list starts with the property name;
15796 # then its full name.
15799 if ( $property->fate <= $MAP_PROXIED) {
15800 @property_list = ($property_name, $full_property_name);
15801 @standard_list = ($standard_property_name,
15802 $standard_property_full_name);
15805 # For each synonym ...
15806 for my $i (0 .. @property_aliases - 1) {
15807 my $alias = $property_aliases[$i];
15808 my $alias_name = $alias->name;
15809 my $alias_standard = standardize($alias_name);
15812 # Add other aliases to the list of property aliases
15813 if ($property->fate <= $MAP_PROXIED
15814 && ! grep { $alias_standard eq $_ } @standard_list)
15816 push @property_list, $alias_name;
15817 push @standard_list, $alias_standard;
15820 # For utf8_heavy, set the mapping of the alias to the
15822 if ($type == $STRING) {
15823 if ($property->fate <= $MAP_PROXIED) {
15824 $string_property_loose_to_name{$alias_standard}
15825 = $standard_property_name;
15829 if (exists ($loose_property_name_of{$alias_standard}))
15831 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
15834 $loose_property_name_of{$alias_standard}
15835 = $standard_property_name;
15838 # Now for the re pod entry for this alias. Skip if not
15839 # outputting a pod; skip the first one, which is the
15840 # full name so won't have an entry like: '\p{full: *}
15841 # \p{full: *}', and skip if don't want an entry for
15844 || ! defined $pod_directory
15845 || ! $alias->make_re_pod_entry;
15847 my $rhs = "\\p{$full_property_name: *}";
15848 if ($property != $perl && $table->perl_extension) {
15849 $rhs .= ' (Perl extension)';
15851 push @match_properties,
15852 format_pod_line($indent_info_column,
15853 '\p{' . $alias->name . ': *}',
15859 # The list of all possible names is attached to each alias, so
15861 if (@property_list) {
15862 push @{$prop_aliases{$standard_list[0]}}, @property_list;
15865 if ($property->fate <= $MAP_PROXIED) {
15867 # Similarly, we create for Unicode::UCD a list of
15868 # property-value aliases.
15870 my $property_full_name = $property->full_name;
15872 # Look at each table in the property...
15873 foreach my $table ($property->tables) {
15875 my $table_full_name = $table->full_name;
15876 my $standard_table_full_name
15877 = standardize($table_full_name);
15878 my $table_name = $table->name;
15879 my $standard_table_name = standardize($table_name);
15881 # The list starts with the table name and its full
15883 push @values_list, $table_name, $table_full_name;
15885 # We add to the table each unique alias that isn't
15886 # discouraged from use.
15887 foreach my $alias ($table->aliases) {
15888 next if $alias->status
15889 && $alias->status eq $DISCOURAGED;
15890 my $name = $alias->name;
15891 my $standard = standardize($name);
15892 next if $standard eq $standard_table_name;
15893 next if $standard eq $standard_table_full_name;
15894 push @values_list, $name;
15897 # Here @values_list is a list of all the aliases for
15898 # the table. That is, all the property-values given
15899 # by this table. By agreement with Unicode::UCD,
15900 # if the name and full name are identical, and there
15901 # are no other names, drop the duplcate entry to save
15903 if (@values_list == 2
15904 && $values_list[0] eq $values_list[1])
15909 # To save memory, unlike the similar list for property
15910 # aliases above, only the standard forms hve the list.
15911 # This forces an extra step of converting from input
15912 # name to standard name, but the savings are
15913 # considerable. (There is only marginal savings if we
15914 # did this with the property aliases.)
15915 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
15919 # Don't write out a mapping file if not desired.
15920 next if ! $property->to_output_map;
15923 # Here, we know we want to write out the table, but don't do it
15924 # yet because there may be other tables that come along and will
15925 # want to share the file, and the file's comments will change to
15926 # mention them. So save for later.
15927 push @writables, $table;
15929 } # End of looping through the property and all its tables.
15930 } # End of looping through all properties.
15932 # Now have all the tables that will have files written for them. Do it.
15933 foreach my $table (@writables) {
15936 my $property = $table->property;
15937 my $is_property = ($table == $property);
15938 if (! $is_property) {
15940 # Match tables for the property go in lib/$subdirectory, which is
15941 # the property's name. Don't use the standard file name for this,
15942 # as may get an unfamiliar alias
15943 @directory = ($matches_directory, $property->external_name);
15947 @directory = $table->directory;
15948 $filename = $table->file;
15951 # Use specified filename if available, or default to property's
15952 # shortest name. We need an 8.3 safe filename (which means "an 8
15953 # safe" filename, since after the dot is only 'pl', which is < 3)
15954 # The 2nd parameter is if the filename shouldn't be changed, and
15955 # it shouldn't iff there is a hard-coded name for this table.
15956 $filename = construct_filename(
15957 $filename || $table->external_name,
15958 ! $filename, # mutable if no filename
15961 register_file_for_name($table, \@directory, $filename);
15963 # Only need to write one file when shared by more than one
15965 next if ! $is_property
15966 && ($table->leader != $table || $table->complement != 0);
15968 # Construct a nice comment to add to the file
15969 $table->set_final_comment;
15975 # Write out the pod file
15978 # And Heavy.pl, Name.pm, UCD.pl
15983 make_property_test_script() if $make_test_script;
15984 make_normalization_test_script() if $make_norm_test_script;
15988 my @white_space_separators = ( # This used only for making the test script.
15995 sub generate_separator($) {
15996 # This used only for making the test script. It generates the colon or
15997 # equal separator between the property and property value, with random
15998 # white space surrounding the separator
16002 return "" if $lhs eq ""; # No separator if there's only one (the r) side
16004 # Choose space before and after randomly
16005 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
16006 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
16008 # And return the whole complex, half the time using a colon, half the
16010 return $spaces_before
16011 . (rand() < 0.5) ? '=' : ':'
16015 sub generate_tests($$$$$) {
16016 # This used only for making the test script. It generates test cases that
16017 # are expected to compile successfully in perl. Note that the lhs and
16018 # rhs are assumed to already be as randomized as the caller wants.
16020 my $lhs = shift; # The property: what's to the left of the colon
16021 # or equals separator
16022 my $rhs = shift; # The property value; what's to the right
16023 my $valid_code = shift; # A code point that's known to be in the
16024 # table given by lhs=rhs; undef if table is
16026 my $invalid_code = shift; # A code point known to not be in the table;
16027 # undef if the table is all code points
16028 my $warning = shift;
16030 # Get the colon or equal
16031 my $separator = generate_separator($lhs);
16033 # The whole 'property=value'
16034 my $name = "$lhs$separator$rhs";
16037 # Create a complete set of tests, with complements.
16038 if (defined $valid_code) {
16039 push @output, <<"EOC"
16040 Expect(1, $valid_code, '\\p{$name}', $warning);
16041 Expect(0, $valid_code, '\\p{^$name}', $warning);
16042 Expect(0, $valid_code, '\\P{$name}', $warning);
16043 Expect(1, $valid_code, '\\P{^$name}', $warning);
16046 if (defined $invalid_code) {
16047 push @output, <<"EOC"
16048 Expect(0, $invalid_code, '\\p{$name}', $warning);
16049 Expect(1, $invalid_code, '\\p{^$name}', $warning);
16050 Expect(1, $invalid_code, '\\P{$name}', $warning);
16051 Expect(0, $invalid_code, '\\P{^$name}', $warning);
16057 sub generate_error($$$) {
16058 # This used only for making the test script. It generates test cases that
16059 # are expected to not only not match, but to be syntax or similar errors
16061 my $lhs = shift; # The property: what's to the left of the
16062 # colon or equals separator
16063 my $rhs = shift; # The property value; what's to the right
16064 my $already_in_error = shift; # Boolean; if true it's known that the
16065 # unmodified lhs and rhs will cause an error.
16066 # This routine should not force another one
16067 # Get the colon or equal
16068 my $separator = generate_separator($lhs);
16070 # Since this is an error only, don't bother to randomly decide whether to
16071 # put the error on the left or right side; and assume that the rhs is
16072 # loosely matched, again for convenience rather than rigor.
16073 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
16075 my $property = $lhs . $separator . $rhs;
16078 Error('\\p{$property}');
16079 Error('\\P{$property}');
16083 # These are used only for making the test script
16084 # XXX Maybe should also have a bad strict seps, which includes underscore.
16086 my @good_loose_seps = (
16093 my @bad_loose_seps = (
16098 sub randomize_stricter_name {
16099 # This used only for making the test script. Take the input name and
16100 # return a randomized, but valid version of it under the stricter matching
16104 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16106 # If the name looks like a number (integer, floating, or rational), do
16108 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
16111 my $separator = $3;
16113 # If there isn't a sign, part of the time add a plus
16114 # Note: Not testing having any denominator having a minus sign
16116 $sign = '+' if rand() <= .3;
16119 # And add 0 or more leading zeros.
16120 $name = $sign . ('0' x int rand(10)) . $number;
16122 if (defined $separator) {
16123 my $extra_zeros = '0' x int rand(10);
16125 if ($separator eq '.') {
16127 # Similarly, add 0 or more trailing zeros after a decimal
16129 $name .= $extra_zeros;
16133 # Or, leading zeros before the denominator
16134 $name =~ s,/,/$extra_zeros,;
16139 # For legibility of the test, only change the case of whole sections at a
16140 # time. To do this, first split into sections. The split returns the
16143 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
16144 trace $section if main::DEBUG && $to_trace;
16146 if (length $section > 1 && $section !~ /\D/) {
16148 # If the section is a sequence of digits, about half the time
16149 # randomly add underscores between some of them.
16152 # Figure out how many underscores to add. max is 1 less than
16153 # the number of digits. (But add 1 at the end to make sure
16154 # result isn't 0, and compensate earlier by subtracting 2
16156 my $num_underscores = int rand(length($section) - 2) + 1;
16158 # And add them evenly throughout, for convenience, not rigor
16160 my $spacing = (length($section) - 1)/ $num_underscores;
16161 my $temp = $section;
16163 for my $i (1 .. $num_underscores) {
16164 $section .= substr($temp, 0, $spacing, "") . '_';
16168 push @sections, $section;
16172 # Here not a sequence of digits. Change the case of the section
16174 my $switch = int rand(4);
16175 if ($switch == 0) {
16176 push @sections, uc $section;
16178 elsif ($switch == 1) {
16179 push @sections, lc $section;
16181 elsif ($switch == 2) {
16182 push @sections, ucfirst $section;
16185 push @sections, $section;
16189 trace "returning", join "", @sections if main::DEBUG && $to_trace;
16190 return join "", @sections;
16193 sub randomize_loose_name($;$) {
16194 # This used only for making the test script
16197 my $want_error = shift; # if true, make an error
16198 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16200 $name = randomize_stricter_name($name);
16203 push @parts, $good_loose_seps[rand(@good_loose_seps)];
16205 # Preserve trailing ones for the sake of not stripping the underscore from
16207 for my $part (split /[-\s_]+ (?= . )/, $name) {
16209 if ($want_error and rand() < 0.3) {
16210 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
16214 push @parts, $good_loose_seps[rand(@good_loose_seps)];
16217 push @parts, $part;
16219 my $new = join("", @parts);
16220 trace "$name => $new" if main::DEBUG && $to_trace;
16223 if (rand() >= 0.5) {
16224 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
16227 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
16233 # Used to make sure don't generate duplicate test cases.
16234 my %test_generated;
16236 sub make_property_test_script() {
16237 # This used only for making the test script
16238 # this written directly -- it's huge.
16240 print "Making test script\n" if $verbosity >= $PROGRESS;
16242 # This uses randomness to test different possibilities without testing all
16243 # possibilities. To ensure repeatability, set the seed to 0. But if
16244 # tests are added, it will perturb all later ones in the .t file
16247 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
16249 # Keep going down an order of magnitude
16250 # until find that adding this quantity to
16251 # 1 remains 1; but put an upper limit on
16252 # this so in case this algorithm doesn't
16253 # work properly on some platform, that we
16254 # won't loop forever.
16256 my $min_floating_slop = 1;
16257 while (1+ $min_floating_slop != 1
16260 my $next = $min_floating_slop / 10;
16261 last if $next == 0; # If underflows,
16263 $min_floating_slop = $next;
16266 # It doesn't matter whether the elements of this array contain single lines
16267 # or multiple lines. main::write doesn't count the lines.
16270 foreach my $property (property_ref('*')) {
16271 foreach my $table ($property->tables) {
16273 # Find code points that match, and don't match this table.
16274 my $valid = $table->get_valid_code_point;
16275 my $invalid = $table->get_invalid_code_point;
16276 my $warning = ($table->status eq $DEPRECATED)
16280 # Test each possible combination of the property's aliases with
16281 # the table's. If this gets to be too many, could do what is done
16282 # in the set_final_comment() for Tables
16283 my @table_aliases = $table->aliases;
16284 my @property_aliases = $table->property->aliases;
16286 # Every property can be optionally be prefixed by 'Is_', so test
16287 # that those work, by creating such a new alias for each
16288 # pre-existing one.
16289 push @property_aliases, map { Alias->new("Is_" . $_->name,
16291 $_->make_re_pod_entry,
16292 $_->ok_as_filename,
16296 } @property_aliases;
16297 my $max = max(scalar @table_aliases, scalar @property_aliases);
16298 for my $j (0 .. $max - 1) {
16300 # The current alias for property is the next one on the list,
16301 # or if beyond the end, start over. Similarly for table
16303 = $property_aliases[$j % @property_aliases]->name;
16305 $property_name = "" if $table->property == $perl;
16306 my $table_alias = $table_aliases[$j % @table_aliases];
16307 my $table_name = $table_alias->name;
16308 my $loose_match = $table_alias->loose_match;
16310 # If the table doesn't have a file, any test for it is
16311 # already guaranteed to be in error
16312 my $already_error = ! $table->file_path;
16314 # Generate error cases for this alias.
16315 push @output, generate_error($property_name,
16319 # If the table is guaranteed to always generate an error,
16320 # quit now without generating success cases.
16321 next if $already_error;
16323 # Now for the success cases.
16325 if ($loose_match) {
16327 # For loose matching, create an extra test case for the
16329 my $standard = standardize($table_name);
16331 # $test_name should be a unique combination for each test
16332 # case; used just to avoid duplicate tests
16333 my $test_name = "$property_name=$standard";
16335 # Don't output duplicate test cases.
16336 if (! exists $test_generated{$test_name}) {
16337 $test_generated{$test_name} = 1;
16338 push @output, generate_tests($property_name,
16345 $random = randomize_loose_name($table_name)
16347 else { # Stricter match
16348 $random = randomize_stricter_name($table_name);
16351 # Now for the main test case for this alias.
16352 my $test_name = "$property_name=$random";
16353 if (! exists $test_generated{$test_name}) {
16354 $test_generated{$test_name} = 1;
16355 push @output, generate_tests($property_name,
16362 # If the name is a rational number, add tests for the
16363 # floating point equivalent.
16364 if ($table_name =~ qr{/}) {
16366 # Calculate the float, and find just the fraction.
16367 my $float = eval $table_name;
16368 my ($whole, $fraction)
16369 = $float =~ / (.*) \. (.*) /x;
16371 # Starting with one digit after the decimal point,
16372 # create a test for each possible precision (number of
16373 # digits past the decimal point) until well beyond the
16374 # native number found on this machine. (If we started
16375 # with 0 digits, it would be an integer, which could
16376 # well match an unrelated table)
16378 for my $i (1 .. $min_floating_slop + 3) {
16379 my $table_name = sprintf("%.*f", $i, $float);
16380 if ($i < $MIN_FRACTION_LENGTH) {
16382 # If the test case has fewer digits than the
16383 # minimum acceptable precision, it shouldn't
16384 # succeed, so we expect an error for it.
16385 # E.g., 2/3 = .7 at one decimal point, and we
16386 # shouldn't say it matches .7. We should make
16387 # it be .667 at least before agreeing that the
16388 # intent was to match 2/3. But at the
16389 # less-than- acceptable level of precision, it
16390 # might actually match an unrelated number.
16391 # So don't generate a test case if this
16392 # conflating is possible. In our example, we
16393 # don't want 2/3 matching 7/10, if there is
16394 # a 7/10 code point.
16396 (keys %nv_floating_to_rational)
16399 if abs($table_name - $existing)
16400 < $MAX_FLOATING_SLOP;
16402 push @output, generate_error($property_name,
16404 1 # 1 => already an error
16409 # Here the number of digits exceeds the
16410 # minimum we think is needed. So generate a
16411 # success test case for it.
16412 push @output, generate_tests($property_name,
16430 (map {"Test_X('$_');\n"} @backslash_X_tests),
16435 sub make_normalization_test_script() {
16436 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
16438 my $n_path = 'TestNorm.pl';
16440 unshift @normalization_tests, <<'END';
16444 sub ord_string { # Convert packed ords to printable string
16446 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
16447 unpack "U*", shift) . "'";
16448 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
16452 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
16453 my $display_source = ord_string($source);
16454 my $display_nfc = ord_string($nfc);
16455 my $display_nfd = ord_string($nfd);
16456 my $display_nfkc = ord_string($nfkc);
16457 my $display_nfkd = ord_string($nfkd);
16459 use Unicode::Normalize;
16461 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
16462 # nfkc == toNFC(nfkc) == toNFC(nfkd)
16465 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
16466 # nfkd == toNFD(nfkc) == toNFD(nfkd)
16469 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
16470 # toNFKC(nfkc) == toNFKC(nfkd)
16473 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
16474 # toNFKD(nfkc) == toNFKD(nfkd)
16476 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
16477 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
16478 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
16479 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
16480 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
16482 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
16483 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
16484 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
16485 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
16486 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
16488 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
16489 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
16490 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
16491 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
16492 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
16494 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
16495 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
16496 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
16497 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
16498 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
16505 @normalization_tests,
16511 # This is a list of the input files and how to handle them. The files are
16512 # processed in their order in this list. Some reordering is possible if
16513 # desired, but the v0 files should be first, and the extracted before the
16514 # others except DAge.txt (as data in an extracted file can be over-ridden by
16515 # the non-extracted. Some other files depend on data derived from an earlier
16516 # file, like UnicodeData requires data from Jamo, and the case changing and
16517 # folding requires data from Unicode. Mostly, it is safest to order by first
16518 # version releases in (except the Jamo). DAge.txt is read before the
16519 # extracted ones because of the rarely used feature $compare_versions. In the
16520 # unlikely event that there were ever an extracted file that contained the Age
16521 # property information, it would have to go in front of DAge.
16523 # The version strings allow the program to know whether to expect a file or
16524 # not, but if a file exists in the directory, it will be processed, even if it
16525 # is in a version earlier than expected, so you can copy files from a later
16526 # release into an earlier release's directory.
16527 my @input_file_objects = (
16528 Input_file->new('PropertyAliases.txt', v0,
16529 Handler => \&process_PropertyAliases,
16531 Input_file->new(undef, v0, # No file associated with this
16532 Progress_Message => 'Finishing property setup',
16533 Handler => \&finish_property_setup,
16535 Input_file->new('PropValueAliases.txt', v0,
16536 Handler => \&process_PropValueAliases,
16537 Has_Missings_Defaults => $NOT_IGNORED,
16539 Input_file->new('DAge.txt', v3.2.0,
16540 Has_Missings_Defaults => $NOT_IGNORED,
16543 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
16544 Property => 'General_Category',
16546 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
16547 Property => 'Canonical_Combining_Class',
16548 Has_Missings_Defaults => $NOT_IGNORED,
16550 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
16551 Property => 'Numeric_Type',
16552 Has_Missings_Defaults => $NOT_IGNORED,
16554 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
16555 Property => 'East_Asian_Width',
16556 Has_Missings_Defaults => $NOT_IGNORED,
16558 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
16559 Property => 'Line_Break',
16560 Has_Missings_Defaults => $NOT_IGNORED,
16562 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
16563 Property => 'Bidi_Class',
16564 Has_Missings_Defaults => $NOT_IGNORED,
16566 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
16567 Property => 'Decomposition_Type',
16568 Has_Missings_Defaults => $NOT_IGNORED,
16570 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
16571 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
16572 Property => 'Numeric_Value',
16573 Each_Line_Handler => \&filter_numeric_value_line,
16574 Has_Missings_Defaults => $NOT_IGNORED,
16576 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
16577 Property => 'Joining_Group',
16578 Has_Missings_Defaults => $NOT_IGNORED,
16581 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
16582 Property => 'Joining_Type',
16583 Has_Missings_Defaults => $NOT_IGNORED,
16585 Input_file->new('Jamo.txt', v2.0.0,
16586 Property => 'Jamo_Short_Name',
16587 Each_Line_Handler => \&filter_jamo_line,
16589 Input_file->new('UnicodeData.txt', v1.1.5,
16590 Pre_Handler => \&setup_UnicodeData,
16592 # We clean up this file for some early versions.
16593 Each_Line_Handler => [ (($v_version lt v2.0.0 )
16595 : ($v_version eq v2.1.5)
16596 ? \&filter_v2_1_5_ucd
16598 # And for 5.14 Perls with 6.0,
16599 # have to also make changes
16600 : ($v_version ge v6.0.0)
16604 # And the main filter
16605 \&filter_UnicodeData_line,
16607 EOF_Handler => \&EOF_UnicodeData,
16609 Input_file->new('ArabicShaping.txt', v2.0.0,
16610 Each_Line_Handler =>
16611 [ ($v_version lt 4.1.0)
16612 ? \&filter_old_style_arabic_shaping
16614 \&filter_arabic_shaping_line,
16616 Has_Missings_Defaults => $NOT_IGNORED,
16618 Input_file->new('Blocks.txt', v2.0.0,
16619 Property => 'Block',
16620 Has_Missings_Defaults => $NOT_IGNORED,
16621 Each_Line_Handler => \&filter_blocks_lines
16623 Input_file->new('PropList.txt', v2.0.0,
16624 Each_Line_Handler => (($v_version lt v3.1.0)
16625 ? \&filter_old_style_proplist
16628 Input_file->new('Unihan.txt', v2.0.0,
16629 Pre_Handler => \&setup_unihan,
16631 Each_Line_Handler => \&filter_unihan_line,
16633 Input_file->new('SpecialCasing.txt', v2.1.8,
16634 Each_Line_Handler => \&filter_special_casing_line,
16635 Pre_Handler => \&setup_special_casing,
16636 Has_Missings_Defaults => $IGNORED,
16639 'LineBreak.txt', v3.0.0,
16640 Has_Missings_Defaults => $NOT_IGNORED,
16641 Property => 'Line_Break',
16642 # Early versions had problematic syntax
16643 Each_Line_Handler => (($v_version lt v3.1.0)
16644 ? \&filter_early_ea_lb
16647 Input_file->new('EastAsianWidth.txt', v3.0.0,
16648 Property => 'East_Asian_Width',
16649 Has_Missings_Defaults => $NOT_IGNORED,
16650 # Early versions had problematic syntax
16651 Each_Line_Handler => (($v_version lt v3.1.0)
16652 ? \&filter_early_ea_lb
16655 Input_file->new('CompositionExclusions.txt', v3.0.0,
16656 Property => 'Composition_Exclusion',
16658 Input_file->new('BidiMirroring.txt', v3.0.1,
16659 Property => 'Bidi_Mirroring_Glyph',
16661 Input_file->new("NormTest.txt", v3.0.0,
16662 Handler => \&process_NormalizationsTest,
16663 Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
16665 Input_file->new('CaseFolding.txt', v3.0.1,
16666 Pre_Handler => \&setup_case_folding,
16667 Each_Line_Handler =>
16668 [ ($v_version lt v3.1.0)
16669 ? \&filter_old_style_case_folding
16671 \&filter_case_folding_line
16673 Has_Missings_Defaults => $IGNORED,
16675 Input_file->new('DCoreProperties.txt', v3.1.0,
16676 # 5.2 changed this file
16677 Has_Missings_Defaults => (($v_version ge v5.2.0)
16681 Input_file->new('Scripts.txt', v3.1.0,
16682 Property => 'Script',
16683 Has_Missings_Defaults => $NOT_IGNORED,
16685 Input_file->new('DNormalizationProps.txt', v3.1.0,
16686 Has_Missings_Defaults => $NOT_IGNORED,
16687 Each_Line_Handler => (($v_version lt v4.0.1)
16688 ? \&filter_old_style_normalization_lines
16691 Input_file->new('HangulSyllableType.txt', v4.0.0,
16692 Has_Missings_Defaults => $NOT_IGNORED,
16693 Property => 'Hangul_Syllable_Type'),
16694 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
16695 Property => 'Word_Break',
16696 Has_Missings_Defaults => $NOT_IGNORED,
16698 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
16699 Property => 'Grapheme_Cluster_Break',
16700 Has_Missings_Defaults => $NOT_IGNORED,
16702 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
16703 Handler => \&process_GCB_test,
16705 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
16706 Skip => 'Validation Tests',
16708 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
16709 Skip => 'Validation Tests',
16711 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
16712 Skip => 'Validation Tests',
16714 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
16715 Property => 'Sentence_Break',
16716 Has_Missings_Defaults => $NOT_IGNORED,
16718 Input_file->new('NamedSequences.txt', v4.1.0,
16719 Handler => \&process_NamedSequences
16721 Input_file->new('NameAliases.txt', v0,
16722 Property => 'Name_Alias',
16723 Pre_Handler => ($v_version le v6.0.0)
16724 ? \&setup_early_name_alias
16726 Each_Line_Handler => ($v_version le v6.0.0)
16727 ? \&filter_early_version_name_alias_line
16728 : \&filter_later_version_name_alias_line,
16730 Input_file->new("BidiTest.txt", v5.2.0,
16731 Skip => 'Validation Tests',
16733 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
16735 Each_Line_Handler => \&filter_unihan_line,
16737 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
16739 Each_Line_Handler => \&filter_unihan_line,
16741 Input_file->new('UnihanIRGSources.txt', v5.2.0,
16743 Pre_Handler => \&setup_unihan,
16744 Each_Line_Handler => \&filter_unihan_line,
16746 Input_file->new('UnihanNumericValues.txt', v5.2.0,
16748 Each_Line_Handler => \&filter_unihan_line,
16750 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
16752 Each_Line_Handler => \&filter_unihan_line,
16754 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
16756 Each_Line_Handler => \&filter_unihan_line,
16758 Input_file->new('UnihanReadings.txt', v5.2.0,
16760 Each_Line_Handler => \&filter_unihan_line,
16762 Input_file->new('UnihanVariants.txt', v5.2.0,
16764 Each_Line_Handler => \&filter_unihan_line,
16766 Input_file->new('ScriptExtensions.txt', v6.0.0,
16767 Property => 'Script_Extensions',
16768 Pre_Handler => \&setup_script_extensions,
16769 Each_Line_Handler => \&filter_script_extensions_line,
16770 Has_Missings_Defaults => (($v_version le v6.0.0)
16774 # The two Indic files are actually available starting in v6.0.0, but their
16775 # property values are missing from PropValueAliases.txt in that release,
16776 # so that further work would have to be done to get them to work properly
16777 # for that release.
16778 Input_file->new('IndicMatraCategory.txt', v6.1.0,
16779 Property => 'Indic_Matra_Category',
16780 Has_Missings_Defaults => $NOT_IGNORED,
16781 Skip => "Provisional; for the analysis and processing of Indic scripts",
16783 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
16784 Property => 'Indic_Syllabic_Category',
16785 Has_Missings_Defaults => $NOT_IGNORED,
16786 Skip => "Provisional; for the analysis and processing of Indic scripts",
16790 # End of all the preliminaries.
16793 if ($compare_versions) {
16794 Carp::my_carp(<<END
16795 Warning. \$compare_versions is set. Output is not suitable for production
16800 # Put into %potential_files a list of all the files in the directory structure
16801 # that could be inputs to this program, excluding those that we should ignore.
16802 # Use absolute file names because it makes it easier across machine types.
16803 my @ignored_files_full_names = map { File::Spec->rel2abs(
16804 internal_file_to_platform($_))
16805 } keys %ignored_files;
16808 return unless /\.txt$/i; # Some platforms change the name's case
16809 my $full = lc(File::Spec->rel2abs($_));
16810 $potential_files{$full} = 1
16811 if ! grep { $full eq lc($_) } @ignored_files_full_names;
16814 }, File::Spec->curdir());
16816 my @mktables_list_output_files;
16817 my $old_start_time = 0;
16819 if (! -e $file_list) {
16820 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
16821 $write_unchanged_files = 1;
16822 } elsif ($write_unchanged_files) {
16823 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
16826 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
16828 if (! open $file_handle, "<", $file_list) {
16829 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
16835 # Read and parse mktables.lst, placing the results from the first part
16836 # into @input, and the second part into @mktables_list_output_files
16837 for my $list ( \@input, \@mktables_list_output_files ) {
16838 while (<$file_handle>) {
16839 s/^ \s+ | \s+ $//xg;
16840 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
16841 $old_start_time = $1;
16843 next if /^ \s* (?: \# .* )? $/x;
16845 my ( $file ) = split /\t/;
16846 push @$list, $file;
16848 @$list = uniques(@$list);
16852 # Look through all the input files
16853 foreach my $input (@input) {
16854 next if $input eq 'version'; # Already have checked this.
16856 # Ignore if doesn't exist. The checking about whether we care or
16857 # not is done via the Input_file object.
16858 next if ! file_exists($input);
16860 # The paths are stored with relative names, and with '/' as the
16861 # delimiter; convert to absolute on this machine
16862 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
16863 $potential_files{lc $full} = 1
16864 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
16868 close $file_handle;
16873 # Here wants to process all .txt files in the directory structure.
16874 # Convert them to full path names. They are stored in the platform's
16877 foreach my $object (@input_file_objects) {
16878 my $file = $object->file;
16879 next unless defined $file;
16880 push @known_files, File::Spec->rel2abs($file);
16883 my @unknown_input_files;
16884 foreach my $file (keys %potential_files) { # The keys are stored in lc
16885 next if grep { $file eq lc($_) } @known_files;
16887 # Here, the file is unknown to us. Get relative path name
16888 $file = File::Spec->abs2rel($file);
16889 push @unknown_input_files, $file;
16891 # What will happen is we create a data structure for it, and add it to
16892 # the list of input files to process. First get the subdirectories
16894 my (undef, $directories, undef) = File::Spec->splitpath($file);
16895 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16896 my @directories = File::Spec->splitdir($directories);
16898 # If the file isn't extracted (meaning none of the directories is the
16899 # extracted one), just add it to the end of the list of inputs.
16900 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
16901 push @input_file_objects, Input_file->new($file, v0);
16905 # Here, the file is extracted. It needs to go ahead of most other
16906 # processing. Search for the first input file that isn't a
16907 # special required property (that is, find one whose first_release
16908 # is non-0), and isn't extracted. Also, the Age property file is
16909 # processed before the extracted ones, just in case
16910 # $compare_versions is set.
16911 for (my $i = 0; $i < @input_file_objects; $i++) {
16912 if ($input_file_objects[$i]->first_released ne v0
16913 && lc($input_file_objects[$i]->file) ne 'dage.txt'
16914 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
16916 splice @input_file_objects, $i, 0,
16917 Input_file->new($file, v0);
16924 if (@unknown_input_files) {
16925 print STDERR simple_fold(join_lines(<<END
16927 The following files are unknown as to how to handle. Assuming they are
16928 typical property files. You'll know by later error messages if it worked or
16931 ) . " " . join(", ", @unknown_input_files) . "\n\n");
16933 } # End of looking through directory structure for more .txt files.
16935 # Create the list of input files from the objects we have defined, plus
16937 my @input_files = 'version';
16938 foreach my $object (@input_file_objects) {
16939 my $file = $object->file;
16940 next if ! defined $file; # Not all objects have files
16941 next if $object->optional && ! -e $file;
16942 push @input_files, $file;
16945 if ( $verbosity >= $VERBOSE ) {
16946 print "Expecting ".scalar( @input_files )." input files. ",
16947 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
16950 # We set $most_recent to be the most recently changed input file, including
16951 # this program itself (done much earlier in this file)
16952 foreach my $in (@input_files) {
16953 next unless -e $in; # Keep going even if missing a file
16954 my $mod_time = (stat $in)[9];
16955 $most_recent = $mod_time if $mod_time > $most_recent;
16957 # See that the input files have distinct names, to warn someone if they
16958 # are adding a new one
16960 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
16961 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16962 my @directories = File::Spec->splitdir($directories);
16963 my $base = $file =~ s/\.txt$//;
16964 construct_filename($file, 'mutable', \@directories);
16968 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
16969 || ! scalar @mktables_list_output_files # or if no outputs known
16970 || $old_start_time < $most_recent; # or out-of-date
16972 # Now we check to see if any output files are older than youngest, if
16973 # they are, we need to continue on, otherwise we can presumably bail.
16975 foreach my $out (@mktables_list_output_files) {
16976 if ( ! file_exists($out)) {
16977 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
16981 #local $to_trace = 1 if main::DEBUG;
16982 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
16983 if ( (stat $out)[9] <= $most_recent ) {
16984 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
16985 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
16992 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
16995 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
16997 # Ready to do the major processing. First create the perl pseudo-property.
16998 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
17000 # Process each input file
17001 foreach my $file (@input_file_objects) {
17005 # Finish the table generation.
17007 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
17010 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
17013 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
17014 add_perl_synonyms();
17016 print "Writing tables\n" if $verbosity >= $PROGRESS;
17017 write_all_tables();
17019 # Write mktables.lst
17020 if ( $file_list and $make_list ) {
17022 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
17023 foreach my $file (@input_files, @files_actually_output) {
17024 my (undef, $directories, $file) = File::Spec->splitpath($file);
17025 my @directories = File::Spec->splitdir($directories);
17026 $file = join '/', @directories, $file;
17030 if (! open $ofh,">",$file_list) {
17031 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
17035 my $localtime = localtime $start_time;
17036 print $ofh <<"END";
17038 # $file_list -- File list for $0.
17040 # Autogenerated starting on $start_time ($localtime)
17042 # - First section is input files
17043 # ($0 itself is not listed but is automatically considered an input)
17044 # - Section separator is /^=+\$/
17045 # - Second section is a list of output files.
17046 # - Lines matching /^\\s*#/ are treated as comments
17047 # which along with blank lines are ignored.
17053 print $ofh "$_\n" for sort(@input_files);
17054 print $ofh "\n=================================\n# Output files:\n\n";
17055 print $ofh "$_\n" for sort @files_actually_output;
17056 print $ofh "\n# ",scalar(@input_files)," input files\n",
17057 "# ",scalar(@files_actually_output)+1," output files\n\n",
17060 or Carp::my_carp("Failed to close $ofh: $!");
17062 print "Filelist has ",scalar(@input_files)," input files and ",
17063 scalar(@files_actually_output)+1," output files\n"
17064 if $verbosity >= $VERBOSE;
17068 # Output these warnings unless -q explicitly specified.
17069 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
17070 if (@unhandled_properties) {
17071 print "\nProperties and tables that unexpectedly have no code points\n";
17072 foreach my $property (sort @unhandled_properties) {
17073 print $property, "\n";
17077 if (%potential_files) {
17078 print "\nInput files that are not considered:\n";
17079 foreach my $file (sort keys %potential_files) {
17080 print File::Spec->abs2rel($file), "\n";
17083 print "\nAll done\n" if $verbosity >= $VERBOSE;
17087 # TRAILING CODE IS USED BY make_property_test_script()
17093 # If run outside the normal test suite on an ASCII platform, you can
17094 # just create a latin1_to_native() function that just returns its
17095 # inputs, because that's the only function used from test.pl
17098 # Test qr/\X/ and the \p{} regular expression constructs. This file is
17099 # constructed by mktables from the tables it generates, so if mktables is
17100 # buggy, this won't necessarily catch those bugs. Tests are generated for all
17101 # feasible properties; a few aren't currently feasible; see
17102 # is_code_point_usable() in mktables for details.
17104 # Standard test packages are not used because this manipulates SIG_WARN. It
17105 # exits 0 if every non-skipped test succeeded; -1 if any failed.
17111 my $expected = shift;
17114 my $warning_type = shift; # Type of warning message, like 'deprecated'
17116 my $line = (caller)[2];
17117 $ord = ord(latin1_to_native(chr($ord)));
17119 # Convert the code point to hex form
17120 my $string = sprintf "\"\\x{%04X}\"", $ord;
17124 # The first time through, use all warnings. If the input should generate
17125 # a warning, add another time through with them turned off
17126 push @tests, "no warnings '$warning_type';" if $warning_type;
17128 foreach my $no_warnings (@tests) {
17130 # Store any warning messages instead of outputting them
17131 local $SIG{__WARN__} = $SIG{__WARN__};
17132 my $warning_message;
17133 $SIG{__WARN__} = sub { $warning_message = $_[0] };
17137 # A string eval is needed because of the 'no warnings'.
17138 # Assumes no parens in the regular expression
17139 my $result = eval "$no_warnings
17140 my \$RegObj = qr($regex);
17141 $string =~ \$RegObj ? 1 : 0";
17142 if (not defined $result) {
17143 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
17146 elsif ($result ^ $expected) {
17147 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
17150 elsif ($warning_message) {
17151 if (! $warning_type || ($warning_type && $no_warnings)) {
17152 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
17156 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
17159 elsif ($warning_type && ! $no_warnings) {
17160 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
17164 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
17173 if (eval { 'x' =~ qr/$regex/; 1 }) {
17175 my $line = (caller)[2];
17176 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
17179 my $line = (caller)[2];
17180 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
17185 # GCBTest.txt character that separates grapheme clusters
17186 my $breakable_utf8 = my $breakable = chr(0xF7);
17187 utf8::upgrade($breakable_utf8);
17189 # GCBTest.txt character that indicates that the adjoining code points are part
17190 # of the same grapheme cluster
17191 my $nobreak_utf8 = my $nobreak = chr(0xD7);
17192 utf8::upgrade($nobreak_utf8);
17195 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
17196 # Each such line is a sequence of code points given by their hex numbers,
17197 # separated by the two characters defined just before this subroutine that
17198 # indicate that either there can or cannot be a break between the adjacent
17199 # code points. If there isn't a break, that means the sequence forms an
17200 # extended grapheme cluster, which means that \X should match the whole
17201 # thing. If there is a break, \X should stop there. This is all
17202 # converted by this routine into a match:
17203 # $string =~ /(\X)/,
17204 # Each \X should match the next cluster; and that is what is checked.
17206 my $template = shift;
17208 my $line = (caller)[2];
17210 # The line contains characters above the ASCII range, but in Latin1. It
17211 # may or may not be in utf8, and if it is, it may or may not know it. So,
17212 # convert these characters to 8 bits. If knows is in utf8, simply
17214 if (utf8::is_utf8($template)) {
17215 utf8::downgrade($template);
17218 # Otherwise, if it is in utf8, but doesn't know it, the next lines
17219 # convert the two problematic characters to their 8-bit equivalents.
17220 # If it isn't in utf8, they don't harm anything.
17222 $template =~ s/$nobreak_utf8/$nobreak/g;
17223 $template =~ s/$breakable_utf8/$breakable/g;
17226 # Get rid of the leading and trailing breakables
17227 $template =~ s/^ \s* $breakable \s* //x;
17228 $template =~ s/ \s* $breakable \s* $ //x;
17230 # And no-breaks become just a space.
17231 $template =~ s/ \s* $nobreak \s* / /xg;
17233 # Split the input into segments that are breakable between them.
17234 my @segments = split /\s*$breakable\s*/, $template;
17237 my $display_string = "";
17239 my @should_display;
17241 # Convert the code point sequence in each segment into a Perl string of
17243 foreach my $segment (@segments) {
17244 my @code_points = split /\s+/, $segment;
17245 my $this_string = "";
17246 my $this_display = "";
17247 foreach my $code_point (@code_points) {
17248 $this_string .= latin1_to_native(chr(hex $code_point));
17249 $this_display .= "\\x{$code_point}";
17252 # The next cluster should match the string in this segment.
17253 push @should_match, $this_string;
17254 push @should_display, $this_display;
17255 $string .= $this_string;
17256 $display_string .= $this_display;
17259 # If a string can be represented in both non-ut8 and utf8, test both cases
17261 for my $to_upgrade (0 .. 1) {
17265 # If already in utf8, would just be a repeat
17266 next UPGRADE if utf8::is_utf8($string);
17268 utf8::upgrade($string);
17271 # Finally, do the \X match.
17272 my @matches = $string =~ /(\X)/g;
17274 # Look through each matched cluster to verify that it matches what we
17276 my $min = (@matches < @should_match) ? @matches : @should_match;
17277 for my $i (0 .. $min - 1) {
17279 if ($matches[$i] eq $should_match[$i]) {
17280 print "ok $Tests - ";
17282 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
17284 print "And \\X #", $i + 1,
17286 print " correctly matched $should_display[$i]; line $line\n";
17288 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
17289 unpack("U*", $matches[$i]));
17290 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
17292 " should have matched $should_display[$i]",
17293 " but instead matched $matches[$i]",
17294 ". Abandoning rest of line $line\n";
17299 # And the number of matches should equal the number of expected matches.
17301 if (@matches == @should_match) {
17302 print "ok $Tests - Nothing was left over; line $line\n";
17304 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
17312 print "1..$Tests\n";
17313 exit($Fails ? -1 : 0);
17316 Error('\p{Script=InGreek}'); # Bug #69018
17317 Test_X("1100 $nobreak 1161"); # Bug #70940
17318 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
17319 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
17320 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726