3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr = do { no overloading; pack 'J', $self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted the change to main::objaddr, and subsequent commits that
17 # changed 0+$self to pack 'J', $self.)
20 BEGIN { # Get the time the script started running; do it at compiliation to
21 # get it as close as possible
35 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
37 ##########################################################################
39 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
40 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
41 # a pod file and a .t file
43 # The structure of this file is:
44 # First these introductory comments; then
45 # code needed for everywhere, such as debugging stuff; then
46 # code to handle input parameters; then
47 # data structures likely to be of external interest (some of which depend on
48 # the input parameters, so follows them; then
49 # more data structures and subroutine and package (class) definitions; then
50 # the small actual loop to process the input files and finish up; then
51 # a __DATA__ section, for the .t tests
53 # This program works on all releases of Unicode through at least 5.2. The
54 # outputs have been scrutinized most intently for release 5.1. The others
55 # have been checked for somewhat more than just sanity. It can handle all
56 # existing Unicode character properties in those releases.
58 # This program is mostly about Unicode character (or code point) properties.
59 # A property describes some attribute or quality of a code point, like if it
60 # is lowercase or not, its name, what version of Unicode it was first defined
61 # in, or what its uppercase equivalent is. Unicode deals with these disparate
62 # possibilities by making all properties into mappings from each code point
63 # into some corresponding value. In the case of it being lowercase or not,
64 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
65 # property maps each Unicode code point to a single value, called a "property
66 # value". (Hence each Unicode property is a true mathematical function with
67 # exactly one value per code point.)
69 # When using a property in a regular expression, what is desired isn't the
70 # mapping of the code point to its property's value, but the reverse (or the
71 # mathematical "inverse relation"): starting with the property value, "Does a
72 # code point map to it?" These are written in a "compound" form:
73 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
74 # files containing the lists of code points that map to each such regular
75 # expression property value, one file per list
77 # There is also a single form shortcut that Perl adds for many of the commonly
78 # used properties. This happens for all binary properties, plus script,
79 # general_category, and block properties.
81 # Thus the outputs of this program are files. There are map files, mostly in
82 # the 'To' directory; and there are list files for use in regular expression
83 # matching, all in subdirectories of the 'lib' directory, with each
84 # subdirectory being named for the property that the lists in it are for.
85 # Bookkeeping, test, and documentation files are also generated.
87 my $matches_directory = 'lib'; # Where match (\p{}) files go.
88 my $map_directory = 'To'; # Where map files go.
92 # The major data structures of this program are Property, of course, but also
93 # Table. There are two kinds of tables, very similar to each other.
94 # "Match_Table" is the data structure giving the list of code points that have
95 # a particular property value, mentioned above. There is also a "Map_Table"
96 # data structure which gives the property's mapping from code point to value.
97 # There are two structures because the match tables need to be combined in
98 # various ways, such as constructing unions, intersections, complements, etc.,
99 # and the map ones don't. And there would be problems, perhaps subtle, if
100 # a map table were inadvertently operated on in some of those ways.
101 # The use of separate classes with operations defined on one but not the other
102 # prevents accidentally confusing the two.
104 # At the heart of each table's data structure is a "Range_List", which is just
105 # an ordered list of "Ranges", plus ancillary information, and methods to
106 # operate on them. A Range is a compact way to store property information.
107 # Each range has a starting code point, an ending code point, and a value that
108 # is meant to apply to all the code points between the two end points,
109 # inclusive. For a map table, this value is the property value for those
110 # code points. Two such ranges could be written like this:
111 # 0x41 .. 0x5A, 'Upper',
112 # 0x61 .. 0x7A, 'Lower'
114 # Each range also has a type used as a convenience to classify the values.
115 # Most ranges in this program will be Type 0, or normal, but there are some
116 # ranges that have a non-zero type. These are used only in map tables, and
117 # are for mappings that don't fit into the normal scheme of things. Mappings
118 # that require a hash entry to communicate with utf8.c are one example;
119 # another example is mappings for charnames.pm to use which indicate a name
120 # that is algorithmically determinable from its code point (and vice-versa).
121 # These are used to significantly compact these tables, instead of listing
122 # each one of the tens of thousands individually.
124 # In a match table, the value of a range is irrelevant (and hence the type as
125 # well, which will always be 0), and arbitrarily set to the null string.
126 # Using the example above, there would be two match tables for those two
127 # entries, one named Upper would contain the 0x41..0x5A range, and the other
128 # named Lower would contain 0x61..0x7A.
130 # Actually, there are two types of range lists, "Range_Map" is the one
131 # associated with map tables, and "Range_List" with match tables.
132 # Again, this is so that methods can be defined on one and not the other so as
133 # to prevent operating on them in incorrect ways.
135 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
136 # in the perl core. All tables could in theory be written, but some are
137 # suppressed because there is no current practical use for them. It is easy
138 # to change which get written by changing various lists that are near the top
139 # of the actual code in this file. The table data structures contain enough
140 # ancillary information to allow them to be treated as separate entities for
141 # writing, such as the path to each one's file. There is a heading in each
142 # map table that gives the format of its entries, and what the map is for all
143 # the code points missing from it. (This allows tables to be more compact.)
145 # The Property data structure contains one or more tables. All properties
146 # contain a map table (except the $perl property which is a
147 # pseudo-property containing only match tables), and any properties that
148 # are usable in regular expression matches also contain various matching
149 # tables, one for each value the property can have. A binary property can
150 # have two values, True and False (or Y and N, which are preferred by Unicode
151 # terminology). Thus each of these properties will have a map table that
152 # takes every code point and maps it to Y or N (but having ranges cuts the
153 # number of entries in that table way down), and two match tables, one
154 # which has a list of all the code points that map to Y, and one for all the
155 # code points that map to N. (For each of these, a third table is also
156 # generated for the pseudo Perl property. It contains the identical code
157 # points as the Y table, but can be written, not in the compound form, but in
158 # a "single" form like \p{IsUppercase}.) Many properties are binary, but some
159 # properties have several possible values, some have many, and properties like
160 # Name have a different value for every named code point. Those will not,
161 # unless the controlling lists are changed, have their match tables written
162 # out. But all the ones which can be used in regular expression \p{} and \P{}
163 # constructs will. Generally a property will have either its map table or its
164 # match tables written but not both. Again, what gets written is controlled
165 # by lists which can easily be changed.
167 # For information about the Unicode properties, see Unicode's UAX44 document:
169 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
171 # As stated earlier, this program will work on any release of Unicode so far.
172 # Most obvious problems in earlier data have NOT been corrected except when
173 # necessary to make Perl or this program work reasonably. For example, no
174 # folding information was given in early releases, so this program uses the
175 # substitute of lower case, just so that a regular expression with the /i
176 # option will do something that actually gives the right results in many
177 # cases. There are also a couple other corrections for version 1.1.5,
178 # commented at the point they are made. As an example of corrections that
179 # weren't made (but could be) is this statement from DerivedAge.txt: "The
180 # supplementary private use code points and the non-character code points were
181 # assigned in version 2.0, but not specifically listed in the UCD until
182 # versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
183 # More information on Unicode version glitches is further down in these
184 # introductory comments.
186 # This program works on all properties as of 5.2, though the files for some
187 # are suppressed from apparent lack of demand for them. You can change which
188 # are output by changing lists in this program.
190 # The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
191 # loose matchings rules (from Unicode TR18):
193 # The recommended names for UCD properties and property values are in
194 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
195 # [PropValue]. There are both abbreviated names and longer, more
196 # descriptive names. It is strongly recommended that both names be
197 # recognized, and that loose matching of property names be used,
198 # whereby the case distinctions, whitespace, hyphens, and underbar
200 # The program still allows Fuzzy to override its determination of if loose
201 # matching should be used, but it isn't currently used, as it is no longer
202 # needed; the calculations it makes are good enough.
204 # SUMMARY OF HOW IT WORKS:
208 # A list is constructed containing each input file that is to be processed
210 # Each file on the list is processed in a loop, using the associated handler
212 # The PropertyAliases.txt and PropValueAliases.txt files are processed
213 # first. These files name the properties and property values.
214 # Objects are created of all the property and property value names
215 # that the rest of the input should expect, including all synonyms.
216 # The other input files give mappings from properties to property
217 # values. That is, they list code points and say what the mapping
218 # is under the given property. Some files give the mappings for
219 # just one property; and some for many. This program goes through
220 # each file and populates the properties from them. Some properties
221 # are listed in more than one file, and Unicode has set up a
222 # precedence as to which has priority if there is a conflict. Thus
223 # the order of processing matters, and this program handles the
224 # conflict possibility by processing the overriding input files
225 # last, so that if necessary they replace earlier values.
226 # After this is all done, the program creates the property mappings not
227 # furnished by Unicode, but derivable from what it does give.
228 # The tables of code points that match each property value in each
229 # property that is accessible by regular expressions are created.
230 # The Perl-defined properties are created and populated. Many of these
231 # require data determined from the earlier steps
232 # Any Perl-defined synonyms are created, and name clashes between Perl
233 # and Unicode are reconciled and warned about.
234 # All the properties are written to files
235 # Any other files are written, and final warnings issued.
237 # For clarity, a number of operators have been overloaded to work on tables:
238 # ~ means invert (take all characters not in the set). The more
239 # conventional '!' is not used because of the possibility of confusing
240 # it with the actual boolean operation.
242 # - means subtraction
243 # & means intersection
244 # The precedence of these is the order listed. Parentheses should be
245 # copiously used. These are not a general scheme. The operations aren't
246 # defined for a number of things, deliberately, to avoid getting into trouble.
247 # Operations are done on references and affect the underlying structures, so
248 # that the copy constructors for them have been overloaded to not return a new
249 # clone, but the input object itself.
251 # The bool operator is deliberately not overloaded to avoid confusion with
252 # "should it mean if the object merely exists, or also is non-empty?".
254 # WHY CERTAIN DESIGN DECISIONS WERE MADE
256 # This program needs to be able to run under miniperl. Therefore, it uses a
257 # minimum of other modules, and hence implements some things itself that could
258 # be gotten from CPAN
260 # This program uses inputs published by the Unicode Consortium. These can
261 # change incompatibly between releases without the Perl maintainers realizing
262 # it. Therefore this program is now designed to try to flag these. It looks
263 # at the directories where the inputs are, and flags any unrecognized files.
264 # It keeps track of all the properties in the files it handles, and flags any
265 # that it doesn't know how to handle. It also flags any input lines that
266 # don't match the expected syntax, among other checks.
268 # It is also designed so if a new input file matches one of the known
269 # templates, one hopefully just needs to add it to a list to have it
272 # As mentioned earlier, some properties are given in more than one file. In
273 # particular, the files in the extracted directory are supposedly just
274 # reformattings of the others. But they contain information not easily
275 # derivable from the other files, including results for Unihan, which this
276 # program doesn't ordinarily look at, and for unassigned code points. They
277 # also have historically had errors or been incomplete. In an attempt to
278 # create the best possible data, this program thus processes them first to
279 # glean information missing from the other files; then processes those other
280 # files to override any errors in the extracted ones. Much of the design was
281 # driven by this need to store things and then possibly override them.
283 # It tries to keep fatal errors to a minimum, to generate something usable for
284 # testing purposes. It always looks for files that could be inputs, and will
285 # warn about any that it doesn't know how to handle (the -q option suppresses
288 # Why have files written out for binary 'N' matches?
289 # For binary properties, if you know the mapping for either Y or N; the
290 # other is trivial to construct, so could be done at Perl run-time by just
291 # complementing the result, instead of having a file for it. That is, if
292 # someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
293 # not need a file. The problem is communicating to Perl that a given
294 # property is binary. Perl can't figure it out from looking at the N (or
295 # No), as some non-binary properties have these as property values. So
296 # rather than inventing a way to communicate this info back to the core,
297 # which would have required changes there as well, it was simpler just to
298 # add the extra tables.
300 # Why is there more than one type of range?
301 # This simplified things. There are some very specialized code points that
302 # have to be handled specially for output, such as Hangul syllable names.
303 # By creating a range type (done late in the development process), it
304 # allowed this to be stored with the range, and overridden by other input.
305 # Originally these were stored in another data structure, and it became a
306 # mess trying to decide if a second file that was for the same property was
307 # overriding the earlier one or not.
309 # Why are there two kinds of tables, match and map?
310 # (And there is a base class shared by the two as well.) As stated above,
311 # they actually are for different things. Development proceeded much more
312 # smoothly when I (khw) realized the distinction. Map tables are used to
313 # give the property value for every code point (actually every code point
314 # that doesn't map to a default value). Match tables are used for regular
315 # expression matches, and are essentially the inverse mapping. Separating
316 # the two allows more specialized methods, and error checks so that one
317 # can't just take the intersection of two map tables, for example, as that
320 # There are no match tables generated for matches of the null string. These
321 # would look like qr/\p{JSN=}/ currently without modifying the regex code.
322 # Perhaps something like them could be added if necessary. The JSN does have
323 # a real code point U+110B that maps to the null string, but it is a
324 # contributory property, and therefore not output by default. And it's easily
325 # handled so far by making the null string the default where it is a
330 # This program is written so it will run under miniperl. Occasionally changes
331 # will cause an error where the backtrace doesn't work well under miniperl.
332 # To diagnose the problem, you can instead run it under regular perl, if you
335 # There is a good trace facility. To enable it, first sub DEBUG must be set
336 # to return true. Then a line like
338 # local $to_trace = 1 if main::DEBUG;
340 # can be added to enable tracing in its lexical scope or until you insert
343 # local $to_trace = 0 if main::DEBUG;
345 # then use a line like "trace $a, @b, %c, ...;
347 # Some of the more complex subroutines already have trace statements in them.
348 # Permanent trace statements should be like:
350 # trace ... if main::DEBUG && $to_trace;
352 # If there is just one or a few files that you're debugging, you can easily
353 # cause most everything else to be skipped. Change the line
355 # my $debug_skip = 0;
357 # to 1, and every file whose object is in @input_file_objects and doesn't have
358 # a, 'non_skip => 1,' in its constructor will be skipped.
360 # To compare the output tables, it may be useful to specify the -output_names
361 # flag. This causes the tables to expand so there is one entry for each
362 # non-algorithmically named code point giving, currently its name, and its
363 # graphic representation if printable (and you have a font that knows about
364 # it). This makes it easier to see what the particular code points are in
365 # each output table. The tables are usable, but because they don't have
366 # ranges (for the most part), a Perl using them will run slower. Non-named
367 # code points are annotated with a description of their status, and contiguous
368 # ones with the same description will be output as a range rather than
369 # individually. Algorithmically named characters are also output as ranges,
370 # except when there are just a few contiguous ones.
374 # The program would break if Unicode were to change its names so that
375 # interior white space, underscores, or dashes differences were significant
376 # within property and property value names.
378 # It might be easier to use the xml versions of the UCD if this program ever
379 # would need heavy revision, and the ability to handle old versions was not
382 # There is the potential for name collisions, in that Perl has chosen names
383 # that Unicode could decide it also likes. There have been such collisions in
384 # the past, with mostly Perl deciding to adopt the Unicode definition of the
385 # name. However in the 5.2 Unicode beta testing, there were a number of such
386 # collisions, which were withdrawn before the final release, because of Perl's
387 # and other's protests. These all involved new properties which began with
388 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
389 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
390 # Unicode document, so they are unlikely to be used by Unicode for another
391 # purpose. However, they might try something beginning with 'In', or use any
392 # of the other Perl-defined properties. This program will warn you of name
393 # collisions, and refuse to generate tables with them, but manual intervention
394 # will be required in this event. One scheme that could be implemented, if
395 # necessary, would be to have this program generate another file, or add a
396 # field to mktables.lst that gives the date of first definition of a property.
397 # Each new release of Unicode would use that file as a basis for the next
398 # iteration. And the Perl synonym addition code could sort based on the age
399 # of the property, so older properties get priority, and newer ones that clash
400 # would be refused; hence existing code would not be impacted, and some other
401 # synonym would have to be used for the new property. This is ugly, and
402 # manual intervention would certainly be easier to do in the short run; lets
403 # hope it never comes to this.
407 # This program can generate tables from the Unihan database. But it doesn't
408 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
409 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
410 # database was split into 8 different files, all beginning with the letters
411 # 'Unihan'. This program will read those file(s) if present, but it needs to
412 # know which of the many properties in the file(s) should have tables created
413 # for them. It will create tables for any properties listed in
414 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
415 # @cjk_properties array and the @cjk_property_values array. Thus, if a
416 # property you want is not in those files of the release you are building
417 # against, you must add it to those two arrays. Starting in 4.0, the
418 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
419 # is present in the directory, a table will be generated for that property.
420 # In 5.2, several more properties were added. For your convenience, the two
421 # arrays are initialized with all the 5.2 listed properties that are also in
422 # earlier releases. But these are commented out. You can just uncomment the
423 # ones you want, or use them as a template for adding entries for other
426 # You may need to adjust the entries to suit your purposes. setup_unihan(),
427 # and filter_unihan_line() are the functions where this is done. This program
428 # already does some adjusting to make the lines look more like the rest of the
429 # Unicode DB; You can see what that is in filter_unihan_line()
431 # There is a bug in the 3.2 data file in which some values for the
432 # kPrimaryNumeric property have commas and an unexpected comment. A filter
433 # could be added for these; or for a particular installation, the Unihan.txt
434 # file could be edited to fix them.
436 # HOW TO ADD A FILE TO BE PROCESSED
438 # A new file from Unicode needs to have an object constructed for it in
439 # @input_file_objects, probably at the end or at the end of the extracted
440 # ones. The program should warn you if its name will clash with others on
441 # restrictive file systems, like DOS. If so, figure out a better name, and
442 # add lines to the README.perl file giving that. If the file is a character
443 # property, it should be in the format that Unicode has by default
444 # standardized for such files for the more recently introduced ones.
445 # If so, the Input_file constructor for @input_file_objects can just be the
446 # file name and release it first appeared in. If not, then it should be
447 # possible to construct an each_line_handler() to massage the line into the
450 # For non-character properties, more code will be needed. You can look at
451 # the existing entries for clues.
453 # UNICODE VERSIONS NOTES
455 # The Unicode UCD has had a number of errors in it over the versions. And
456 # these remain, by policy, in the standard for that version. Therefore it is
457 # risky to correct them, because code may be expecting the error. So this
458 # program doesn't generally make changes, unless the error breaks the Perl
459 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
460 # for U+1105, which causes real problems for the algorithms for Jamo
461 # calculations, so it is changed here.
463 # But it isn't so clear cut as to what to do about concepts that are
464 # introduced in a later release; should they extend back to earlier releases
465 # where the concept just didn't exist? It was easier to do this than to not,
466 # so that's what was done. For example, the default value for code points not
467 # in the files for various properties was probably undefined until changed by
468 # some version. No_Block for blocks is such an example. This program will
469 # assign No_Block even in Unicode versions that didn't have it. This has the
470 # benefit that code being written doesn't have to special case earlier
471 # versions; and the detriment that it doesn't match the Standard precisely for
472 # the affected versions.
474 # Here are some observations about some of the issues in early versions:
476 # The number of code points in \p{alpha} halve in 2.1.9. It turns out that
477 # the reason is that the CJK block starting at 4E00 was removed from PropList,
478 # and was not put back in until 3.1.0
480 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
481 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
482 # reason is that 3.2 introduced U+205F=medium math space, which was not
483 # classed as white space, but Perl figured out that it should have been. 4.0
484 # reclassified it correctly.
486 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
487 # this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
488 # was left with no code points, as all the ones that mapped to 202 stayed
489 # mapped to 202. Thus if your program used the numeric name for the class,
490 # it would not have been affected, but if it used the mnemonic, it would have
493 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
494 # points which eventually came to have this script property value, instead
495 # mapped to "Unknown". But in the next release all these code points were
496 # moved to \p{sc=common} instead.
498 # The default for missing code points for BidiClass is complicated. Starting
499 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
500 # tries to do the best it can for earlier releases. It is done in
501 # process_PropertyAliases()
503 ##############################################################################
505 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
507 my $MAX_LINE_WIDTH = 78;
509 # Debugging aid to skip most files so as to not be distracted by them when
510 # concentrating on the ones being debugged. Add
512 # to the constructor for those files you want processed when you set this.
513 # Files with a first version number of 0 are special: they are always
514 # processed regardless of the state of this flag.
517 # Set to 1 to enable tracing.
520 { # Closure for trace: debugging aid
521 my $print_caller = 1; # ? Include calling subroutine name
522 my $main_with_colon = 'main::';
523 my $main_colon_length = length($main_with_colon);
526 return unless $to_trace; # Do nothing if global flag not set
530 local $DB::trace = 0;
531 $DB::trace = 0; # Quiet 'used only once' message
535 # Loop looking up the stack to get the first non-trace caller
540 $line_number = $caller_line;
541 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
542 $caller = $main_with_colon unless defined $caller;
544 $caller_name = $caller;
547 $caller_name =~ s/.*:://;
548 if (substr($caller_name, 0, $main_colon_length)
551 $caller_name = substr($caller_name, $main_colon_length);
554 } until ($caller_name ne 'trace');
556 # If the stack was empty, we were called from the top level
557 $caller_name = 'main' if ($caller_name eq ""
558 || $caller_name eq 'trace');
561 foreach my $string (@input) {
562 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
563 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
564 $output .= simple_dumper($string);
567 $string = "$string" if ref $string;
568 $string = $UNDEF unless defined $string;
570 $string = '""' if $string eq "";
571 $output .= " " if $output ne ""
573 && substr($output, -1, 1) ne " "
574 && substr($string, 0, 1) ne " ";
579 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
580 print STDERR "$caller_name: " if $print_caller;
581 print STDERR $output, "\n";
586 # This is for a rarely used development feature that allows you to compare two
587 # versions of the Unicode standard without having to deal with changes caused
588 # by the code points introduced in the later verson. Change the 0 to a SINGLE
589 # dotted Unicode release number (e.g. 2.1). Only code points introduced in
590 # that release and earlier will be used; later ones are thrown away. You use
591 # the version number of the earliest one you want to compare; then run this
592 # program on directory structures containing each release, and compare the
593 # outputs. These outputs will therefore include only the code points common
594 # to both releases, and you can see the changes caused just by the underlying
595 # release semantic changes. For versions earlier than 3.2, you must copy a
596 # version of DAge.txt into the directory.
597 my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
598 my $compare_versions = DEBUG
599 && $string_compare_versions
600 && pack "C*", split /\./, $string_compare_versions;
603 # Returns non-duplicated input values. From "Perl Best Practices:
604 # Encapsulated Cleverness". p. 455 in first edition.
607 # Arguably this breaks encapsulation, if the goal is to permit multiple
608 # distinct objects to stringify to the same value, and be interchangeable.
609 # However, for this program, no two objects stringify identically, and all
610 # lists passed to this function are either objects or strings. So this
611 # doesn't affect correctness, but it does give a couple of percent speedup.
613 return grep { ! $seen{$_}++ } @_;
616 $0 = File::Spec->canonpath($0);
618 my $make_test_script = 0; # ? Should we output a test script
619 my $write_unchanged_files = 0; # ? Should we update the output files even if
620 # we don't think they have changed
621 my $use_directory = ""; # ? Should we chdir somewhere.
622 my $pod_directory; # input directory to store the pod file.
623 my $pod_file = 'perluniprops';
624 my $t_path; # Path to the .t test file
625 my $file_list = 'mktables.lst'; # File to store input and output file names.
626 # This is used to speed up the build, by not
627 # executing the main body of the program if
628 # nothing on the list has changed since the
630 my $make_list = 1; # ? Should we write $file_list. Set to always
631 # make a list so that when the pumpking is
632 # preparing a release, s/he won't have to do
634 my $glob_list = 0; # ? Should we try to include unknown .txt files
636 my $output_range_counts = 1; # ? Should we include the number of code points
637 # in ranges in the output
638 my $output_names = 0; # ? Should character names be in the output
640 # Verbosity levels; 0 is quiet
641 my $NORMAL_VERBOSITY = 1;
645 my $verbosity = $NORMAL_VERBOSITY;
649 my $arg = shift @ARGV;
651 $verbosity = $VERBOSE;
653 elsif ($arg eq '-p') {
654 $verbosity = $PROGRESS;
655 $| = 1; # Flush buffers as we go.
657 elsif ($arg eq '-q') {
660 elsif ($arg eq '-w') {
661 $write_unchanged_files = 1; # update the files even if havent changed
663 elsif ($arg eq '-check') {
664 my $this = shift @ARGV;
665 my $ok = shift @ARGV;
667 print "Skipping as check params are not the same.\n";
671 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
672 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
674 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
676 $make_test_script = 1;
678 elsif ($arg eq '-makelist') {
681 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
682 -d $use_directory or croak "Unknown directory '$use_directory'";
684 elsif ($arg eq '-L') {
686 # Existence not tested until have chdir'd
689 elsif ($arg eq '-globlist') {
692 elsif ($arg eq '-c') {
693 $output_range_counts = ! $output_range_counts
695 elsif ($arg eq '-output_names') {
700 $with_c .= 'out' if $output_range_counts; # Complements the state
702 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
703 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
705 -c : Output comments $with_c number of code points in ranges
706 -q : Quiet Mode: Only output serious warnings.
707 -p : Set verbosity level to normal plus show progress.
708 -v : Set Verbosity level high: Show progress and non-serious
710 -w : Write files regardless
711 -C dir : Change to this directory before proceeding. All relative paths
712 except those specified by the -P and -T options will be done
713 with respect to this directory.
714 -P dir : Output $pod_file file to directory 'dir'.
715 -T path : Create a test script as 'path'; overrides -maketest
716 -L filelist : Use alternate 'filelist' instead of standard one
717 -globlist : Take as input all non-Test *.txt files in current and sub
719 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
721 -makelist : Rewrite the file list $file_list based on current setup
722 -output_names: Output an annotation for each character in the table files;
723 useful for debugging mktables, looking at diffs; but is slow,
724 memory intensive; resulting tables are usable but slow and
726 -check A B : Executes $0 only if A and B are the same
731 # Stores the most-recently changed file. If none have changed, can skip the
733 my $most_recent = (stat $0)[9]; # Do this before the chdir!
735 # Change directories now, because need to read 'version' early.
736 if ($use_directory) {
737 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
738 $pod_directory = File::Spec->rel2abs($pod_directory);
740 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
741 $t_path = File::Spec->rel2abs($t_path);
743 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
744 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
745 $pod_directory = File::Spec->abs2rel($pod_directory);
747 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
748 $t_path = File::Spec->abs2rel($t_path);
752 # Get Unicode version into regular and v-string. This is done now because
753 # various tables below get populated based on it. These tables are populated
754 # here to be near the top of the file, and so easily seeable by those needing
756 open my $VERSION, "<", "version"
757 or croak "$0: can't open required file 'version': $!\n";
758 my $string_version = <$VERSION>;
760 chomp $string_version;
761 my $v_version = pack "C*", split /\./, $string_version; # v string
763 # The following are the complete names of properties with property values that
764 # are known to not match any code points in some versions of Unicode, but that
765 # may change in the future so they should be matchable, hence an empty file is
766 # generated for them.
767 my @tables_that_may_be_empty = (
768 'Joining_Type=Left_Joining',
770 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
771 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
772 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
773 if $v_version ge v4.1.0;
775 # The lists below are hashes, so the key is the item in the list, and the
776 # value is the reason why it is in the list. This makes generation of
777 # documentation easier.
779 my %why_suppressed; # No file generated for these.
781 # Files aren't generated for empty extraneous properties. This is arguable.
782 # Extraneous properties generally come about because a property is no longer
783 # used in a newer version of Unicode. If we generated a file without code
784 # points, programs that used to work on that property will still execute
785 # without errors. It just won't ever match (or will always match, with \P{}).
786 # This means that the logic is now likely wrong. I (khw) think its better to
787 # find this out by getting an error message. Just move them to the table
788 # above to change this behavior
789 my %why_suppress_if_empty_warn_if_not = (
791 # It is the only property that has ever officially been removed from the
792 # Standard. The database never contained any code points for it.
793 'Special_Case_Condition' => 'Obsolete',
795 # Apparently never official, but there were code points in some versions of
796 # old-style PropList.txt
797 'Non_Break' => 'Obsolete',
800 # These would normally go in the warn table just above, but they were changed
801 # a long time before this program was written, so warnings about them are
803 if ($v_version gt v3.2.0) {
804 push @tables_that_may_be_empty,
805 'Canonical_Combining_Class=Attached_Below_Left'
808 # These are listed in the Property aliases file in 5.2, but Unihan is ignored
809 # unless explicitly added.
810 if ($v_version ge v5.2.0) {
811 my $unihan = 'Unihan; remove from list if using Unihan';
812 foreach my $table (qw (
816 kCompatibilityVariant
830 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
834 # Properties that this program ignores.
835 my @unimplemented_properties = (
836 'Unicode_Radical_Stroke' # Remove if changing to handle this one.
839 # There are several types of obsolete properties defined by Unicode. These
840 # must be hand-edited for every new Unicode release.
841 my %why_deprecated; # Generates a deprecated warning message if used.
842 my %why_stabilized; # Documentation only
843 my %why_obsolete; # Documentation only
846 my $simple = 'Perl uses the more complete version of this property';
847 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
849 my $other_properties = 'other properties';
850 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
851 my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
854 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
855 'Jamo_Short_Name' => $contributory,
856 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
857 'Other_Alphabetic' => $contributory,
858 'Other_Default_Ignorable_Code_Point' => $contributory,
859 'Other_Grapheme_Extend' => $contributory,
860 'Other_ID_Continue' => $contributory,
861 'Other_ID_Start' => $contributory,
862 'Other_Lowercase' => $contributory,
863 'Other_Math' => $contributory,
864 'Other_Uppercase' => $contributory,
868 # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
869 # contains the same information, but without the algorithmically
870 # determinable Hangul syllables'. This file is not published, so it's
871 # existence is not noted in the comment.
872 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
874 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
875 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
877 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
878 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
879 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
880 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
882 'Name' => "Accessible via 'use charnames;'",
883 'Name_Alias' => "Accessible via 'use charnames;'",
885 # These are sort of jumping the gun; deprecation is proposed for
886 # Unicode version 6.0, but they have never been exposed by Perl, and
887 # likely are soon to be deprecated, so best not to expose them.
888 FC_NFKC_Closure => 'Use NFKC_Casefold instead',
889 Expands_On_NFC => $why_no_expand,
890 Expands_On_NFD => $why_no_expand,
891 Expands_On_NFKC => $why_no_expand,
892 Expands_On_NFKD => $why_no_expand,
895 # The following are suppressed because they were made contributory or
896 # deprecated by Unicode before Perl ever thought about supporting them.
897 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
898 $why_suppressed{$property} = $why_deprecated{$property};
901 # Customize the message for all the 'Other_' properties
902 foreach my $property (keys %why_deprecated) {
903 next if (my $main_property = $property) !~ s/^Other_//;
904 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
908 if ($v_version ge 4.0.0) {
909 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
911 if ($v_version ge 5.2.0) {
912 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
915 # Probably obsolete forever
916 if ($v_version ge v4.1.0) {
917 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"';
920 # This program can create files for enumerated-like properties, such as
921 # 'Numeric_Type'. This file would be the same format as for a string
922 # property, with a mapping from code point to its value, so you could look up,
923 # for example, the script a code point is in. But no one so far wants this
924 # mapping, or they have found another way to get it since this is a new
925 # feature. So no file is generated except if it is in this list.
926 my @output_mapped_properties = split "\n", <<END;
929 # If you are using the Unihan database, you need to add the properties that
930 # you want to extract from it to this table. For your convenience, the
931 # properties in the 5.2 PropertyAliases.txt file are listed, commented out
932 my @cjk_properties = split "\n", <<'END';
933 #cjkAccountingNumeric; kAccountingNumeric
934 #cjkOtherNumeric; kOtherNumeric
935 #cjkPrimaryNumeric; kPrimaryNumeric
936 #cjkCompatibilityVariant; kCompatibilityVariant
938 #cjkIRG_GSource; kIRG_GSource
939 #cjkIRG_HSource; kIRG_HSource
940 #cjkIRG_JSource; kIRG_JSource
941 #cjkIRG_KPSource; kIRG_KPSource
942 #cjkIRG_KSource; kIRG_KSource
943 #cjkIRG_TSource; kIRG_TSource
944 #cjkIRG_USource; kIRG_USource
945 #cjkIRG_VSource; kIRG_VSource
946 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
949 # Similarly for the property values. For your convenience, the lines in the
950 # 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
952 my @cjk_property_values = split "\n", <<'END';
953 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
954 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
955 ## @missing: 0000..10FFFF; cjkIICore; <none>
956 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
957 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
958 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
959 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
960 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
961 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
962 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
963 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
964 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
965 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
966 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
969 # The input files don't list every code point. Those not listed are to be
970 # defaulted to some value. Below are hard-coded what those values are for
971 # non-binary properties as of 5.1. Starting in 5.0, there are
972 # machine-parsable comment lines in the files the give the defaults; so this
973 # list shouldn't have to be extended. The claim is that all missing entries
974 # for binary properties will default to 'N'. Unicode tried to change that in
975 # 5.2, but the beta period produced enough protest that they backed off.
977 # The defaults for the fields that appear in UnicodeData.txt in this hash must
978 # be in the form that it expects. The others may be synonyms.
979 my $CODE_POINT = '<code point>';
980 my %default_mapping = (
982 # Bidi_Class => Complicated; set in code
983 Bidi_Mirroring_Glyph => "",
985 Canonical_Combining_Class => 0,
986 Case_Folding => $CODE_POINT,
987 Decomposition_Mapping => $CODE_POINT,
988 Decomposition_Type => 'None',
989 East_Asian_Width => "Neutral",
990 FC_NFKC_Closure => $CODE_POINT,
991 General_Category => 'Cn',
992 Grapheme_Cluster_Break => 'Other',
993 Hangul_Syllable_Type => 'NA',
995 Jamo_Short_Name => "",
996 Joining_Group => "No_Joining_Group",
997 # Joining_Type => Complicated; set in code
998 kIICore => 'N', # Is converted to binary
999 #Line_Break => Complicated; set in code
1000 Lowercase_Mapping => $CODE_POINT,
1007 Numeric_Type => 'None',
1008 Numeric_Value => 'NaN',
1009 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1010 Sentence_Break => 'Other',
1011 Simple_Case_Folding => $CODE_POINT,
1012 Simple_Lowercase_Mapping => $CODE_POINT,
1013 Simple_Titlecase_Mapping => $CODE_POINT,
1014 Simple_Uppercase_Mapping => $CODE_POINT,
1015 Titlecase_Mapping => $CODE_POINT,
1016 Unicode_1_Name => "",
1017 Unicode_Radical_Stroke => "",
1018 Uppercase_Mapping => $CODE_POINT,
1019 Word_Break => 'Other',
1022 # Below are files that Unicode furnishes, but this program ignores, and why
1023 my %ignored_files = (
1024 'CJKRadicals.txt' => 'Unihan data',
1025 'Index.txt' => 'An index, not actual data',
1026 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1027 'NamesList.txt' => 'Just adds commentary',
1028 'NormalizationCorrections.txt' => 'Data is already in other files.',
1029 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1030 'ReadMe.txt' => 'Just comments',
1031 'README.TXT' => 'Just comments',
1032 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
1035 ### End of externally interesting definitions, except for @input_file_objects
1038 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1039 # This file is machine-generated by $0 from the Unicode
1040 # database, Version $string_version. Any changes made here will be lost!
1043 my $INTERNAL_ONLY=<<"EOF";
1045 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1046 # This file is for internal use by the Perl program only. The format and even
1047 # the name or existence of this file are subject to change without notice.
1048 # Don't use it directly.
1051 my $DEVELOPMENT_ONLY=<<"EOF";
1052 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1053 # This file contains information artificially constrained to code points
1054 # present in Unicode release $string_compare_versions.
1055 # IT CANNOT BE RELIED ON. It is for use during development only and should
1056 # not be used for production.
1060 my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1061 my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1062 my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1064 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1065 # two must be 10; if there are 5, the first must not be a 0. Written this way
1066 # to decrease backtracking
1068 qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1070 # This matches the beginning of the line in the Unicode db files that give the
1071 # defaults for code points not listed (i.e., missing) in the file. The code
1072 # depends on this ending with a semi-colon, so it can assume it is a valid
1073 # field when the line is split() by semi-colons
1074 my $missing_defaults_prefix =
1075 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1077 # Property types. Unicode has more types, but these are sufficient for our
1079 my $UNKNOWN = -1; # initialized to illegal value
1080 my $NON_STRING = 1; # Either binary or enum
1082 my $ENUM = 3; # Include catalog
1083 my $STRING = 4; # Anything else: string or misc
1085 # Some input files have lines that give default values for code points not
1086 # contained in the file. Sometimes these should be ignored.
1087 my $NO_DEFAULTS = 0; # Must evaluate to false
1088 my $NOT_IGNORED = 1;
1091 # Range types. Each range has a type. Most ranges are type 0, for normal,
1092 # and will appear in the main body of the tables in the output files, but
1093 # there are other types of ranges as well, listed below, that are specially
1094 # handled. There are pseudo-types as well that will never be stored as a
1095 # type, but will affect the calculation of the type.
1097 # 0 is for normal, non-specials
1098 my $MULTI_CP = 1; # Sequence of more than code point
1099 my $HANGUL_SYLLABLE = 2;
1100 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1101 my $NULL = 4; # The map is to the null string; utf8.c can't
1102 # handle these, nor is there an accepted syntax
1103 # for them in \p{} constructs
1104 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1105 # otherwise be $MULTI_CP type are instead type 0
1107 # process_generic_property_file() can accept certain overrides in its input.
1108 # Each of these must begin AND end with $CMD_DELIM.
1109 my $CMD_DELIM = "\a";
1110 my $REPLACE_CMD = 'replace'; # Override the Replace
1111 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1116 # Values for the Replace argument to add_range.
1117 # $NO # Don't replace; add only the code points not
1119 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1120 # the comments at the subroutine definition.
1121 my $UNCONDITIONALLY = 2; # Replace without conditions.
1122 my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1125 # Flags to give property statuses. The phrases are to remind maintainers that
1126 # if the flag is changed, the indefinite article referring to it in the
1127 # documentation may need to be as well.
1129 my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1131 my $PLACEHOLDER = 'P'; # Implies no pod entry generated
1132 my $DEPRECATED = 'D';
1133 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1134 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1135 my $DISCOURAGED = 'X';
1136 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1137 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1139 my $a_bold_stricter = "a 'B<$STRICTER>'";
1140 my $A_bold_stricter = "A 'B<$STRICTER>'";
1141 my $STABILIZED = 'S';
1142 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1143 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1145 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1146 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1148 my %status_past_participles = (
1149 $DISCOURAGED => 'discouraged',
1150 $SUPPRESSED => 'should never be generated',
1151 $STABILIZED => 'stabilized',
1152 $OBSOLETE => 'obsolete',
1153 $DEPRECATED => 'deprecated',
1156 # The format of the values of the tables:
1157 my $EMPTY_FORMAT = "";
1158 my $BINARY_FORMAT = 'b';
1159 my $DECIMAL_FORMAT = 'd';
1160 my $FLOAT_FORMAT = 'f';
1161 my $INTEGER_FORMAT = 'i';
1162 my $HEX_FORMAT = 'x';
1163 my $RATIONAL_FORMAT = 'r';
1164 my $STRING_FORMAT = 's';
1165 my $DECOMP_STRING_FORMAT = 'c';
1167 my %map_table_formats = (
1168 $BINARY_FORMAT => 'binary',
1169 $DECIMAL_FORMAT => 'single decimal digit',
1170 $FLOAT_FORMAT => 'floating point number',
1171 $INTEGER_FORMAT => 'integer',
1172 $HEX_FORMAT => 'positive hex whole number; a code point',
1173 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1174 $STRING_FORMAT => 'string',
1175 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decompostion mapping',
1178 # Unicode didn't put such derived files in a separate directory at first.
1179 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1180 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1181 my $AUXILIARY = 'auxiliary';
1183 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1184 my %loose_to_file_of; # loosely maps table names to their respective
1186 my %stricter_to_file_of; # same; but for stricter mapping.
1187 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1188 # their rational equivalent
1189 my %loose_property_name_of; # Loosely maps property names to standard form
1191 # These constants names and values were taken from the Unicode standard,
1192 # version 5.1, section 3.12. They are used in conjunction with Hangul
1193 # syllables. The '_string' versions are so generated tables can retain the
1194 # hex format, which is the more familiar value
1195 my $SBase_string = "0xAC00";
1196 my $SBase = CORE::hex $SBase_string;
1197 my $LBase_string = "0x1100";
1198 my $LBase = CORE::hex $LBase_string;
1199 my $VBase_string = "0x1161";
1200 my $VBase = CORE::hex $VBase_string;
1201 my $TBase_string = "0x11A7";
1202 my $TBase = CORE::hex $TBase_string;
1207 my $NCount = $VCount * $TCount;
1209 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1210 # with the above published constants.
1212 my %Jamo_L; # Leading consonants
1213 my %Jamo_V; # Vowels
1214 my %Jamo_T; # Trailing consonants
1216 my @backslash_X_tests; # List of tests read in for testing \X
1217 my @unhandled_properties; # Will contain a list of properties found in
1218 # the input that we didn't process.
1219 my @match_properties; # Properties that have match tables, to be
1221 my @map_properties; # Properties that get map files written
1222 my @named_sequences; # NamedSequences.txt contents.
1223 my %potential_files; # Generated list of all .txt files in the directory
1224 # structure so we can warn if something is being
1226 my @files_actually_output; # List of files we generated.
1227 my @more_Names; # Some code point names are compound; this is used
1228 # to store the extra components of them.
1229 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1230 # the minimum before we consider it equivalent to a
1231 # candidate rational
1232 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1234 # These store references to certain commonly used property objects
1241 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1242 my $has_In_conflicts = 0;
1243 my $has_Is_conflicts = 0;
1245 sub internal_file_to_platform ($) {
1246 # Convert our file paths which have '/' separators to those of the
1250 return undef unless defined $file;
1252 return File::Spec->join(split '/', $file);
1255 sub file_exists ($) { # platform independent '-e'. This program internally
1256 # uses slash as a path separator.
1258 return 0 if ! defined $file;
1259 return -e internal_file_to_platform($file);
1263 # Returns the address of the blessed input object.
1264 # It doesn't check for blessedness because that would do a string eval
1265 # every call, and the program is structured so that this is never called
1266 # for a non-blessed object.
1268 no overloading; # If overloaded, numifying below won't work.
1270 # Numifying a ref gives its address.
1271 return pack 'J', $_[0];
1274 # These are used only if $output_names is true.
1275 # The entire range of Unicode characters is examined to populate these
1276 # after all the input has been processed. But most can be skipped, as they
1277 # have the same descriptive phrases, such as being unassigned
1278 my @viacode; # Contains the 1 million character names
1279 my @printable; # boolean: And are those characters printable?
1280 my @annotate_char_type; # Contains a type of those characters, specifically
1281 # for the purposes of annotation.
1282 my $annotate_ranges; # A map of ranges of code points that have the same
1283 # name for the purposes of annoation. They map to the
1284 # upper edge of the range, so that the end point can
1285 # be immediately found. This is used to skip ahead to
1286 # the end of a range, and avoid processing each
1287 # individual code point in it.
1288 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1289 # characters, but excluding those which are
1290 # also noncharacter code points
1292 # The annotation types are an extension of the regular range types, though
1293 # some of the latter are folded into one. Make the new types negative to
1294 # avoid conflicting with the regular types
1295 my $SURROGATE_TYPE = -1;
1296 my $UNASSIGNED_TYPE = -2;
1297 my $PRIVATE_USE_TYPE = -3;
1298 my $NONCHARACTER_TYPE = -4;
1299 my $CONTROL_TYPE = -5;
1300 my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1302 sub populate_char_info ($) {
1303 # Used only with the $output_names option. Populates the arrays with the
1304 # input code point's info that are needed for outputting more detailed
1305 # comments. If calling context wants a return, it is the end point of
1306 # any contiguous range of characters that share essentially the same info
1309 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1311 $viacode[$i] = $perl_charname->value_of($i) || "";
1313 # A character is generally printable if Unicode says it is,
1314 # but below we make sure that most Unicode general category 'C' types
1316 $printable[$i] = $print->contains($i);
1318 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1320 # Only these two regular types are treated specially for annotations
1322 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1323 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1325 # Give a generic name to all code points that don't have a real name.
1326 # We output ranges, if applicable, for these. Also calculate the end
1327 # point of the range.
1329 if (! $viacode[$i]) {
1330 if ($gc-> table('Surrogate')->contains($i)) {
1331 $viacode[$i] = 'Surrogate';
1332 $annotate_char_type[$i] = $SURROGATE_TYPE;
1334 $end = $gc->table('Surrogate')->containing_range($i)->end;
1336 elsif ($gc-> table('Private_use')->contains($i)) {
1337 $viacode[$i] = 'Private Use';
1338 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1340 $end = $gc->table('Private_Use')->containing_range($i)->end;
1342 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1345 $viacode[$i] = 'Noncharacter';
1346 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1348 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1349 containing_range($i)->end;
1351 elsif ($gc-> table('Control')->contains($i)) {
1352 $viacode[$i] = 'Control';
1353 $annotate_char_type[$i] = $CONTROL_TYPE;
1355 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1357 elsif ($gc-> table('Unassigned')->contains($i)) {
1358 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1359 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1362 # Because we name the unassigned by the blocks they are in, it
1363 # can't go past the end of that block, and it also can't go past
1364 # the unassigned range it is in. The special table makes sure
1365 # that the non-characters, which are unassigned, are separated
1367 $end = min($block->containing_range($i)->end,
1368 $unassigned_sans_noncharacters-> containing_range($i)->
1371 my_carp_bug("Can't figure out how to annotate"
1372 . sprintf("U+%04X", $i)
1373 . "Proceeding anyway.");
1374 $viacode[$i] = 'UNKNOWN';
1375 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1380 # Here, has a name, but if it's one in which the code point number is
1381 # appended to the name, do that.
1382 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1383 $viacode[$i] .= sprintf("-%04X", $i);
1384 $end = $perl_charname->containing_range($i)->end;
1387 # And here, has a name, but if it's a hangul syllable one, replace it with
1388 # the correct name from the Unicode algorithm
1389 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1391 my $SIndex = $i - $SBase;
1392 my $L = $LBase + $SIndex / $NCount;
1393 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1394 my $T = $TBase + $SIndex % $TCount;
1395 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1396 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1397 $end = $perl_charname->containing_range($i)->end;
1400 return if ! defined wantarray;
1401 return $i if ! defined $end; # If not a range, return the input
1403 # Save this whole range so can find the end point quickly
1404 $annotate_ranges->add_map($i, $end, $end);
1409 # Commented code below should work on Perl 5.8.
1410 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1411 ## the native perl version of it (which is what would operate under miniperl)
1412 ## is extremely slow, as it does a string eval every call.
1413 #my $has_fast_scalar_util = $
\18 !~ /miniperl/
1414 # && defined eval "require Scalar::Util";
1417 # # Returns the address of the blessed input object. Uses the XS version if
1418 # # available. It doesn't check for blessedness because that would do a
1419 # # string eval every call, and the program is structured so that this is
1420 # # never called for a non-blessed object.
1422 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1424 # # Check at least that is a ref.
1425 # my $pkg = ref($_[0]) or return undef;
1427 # # Change to a fake package to defeat any overloaded stringify
1428 # bless $_[0], 'main::Fake';
1430 # # Numifying a ref gives its address.
1431 # my $addr = pack 'J', $_[0];
1433 # # Return to original class
1434 # bless $_[0], $pkg;
1441 return $a if $a >= $b;
1448 return $a if $a <= $b;
1452 sub clarify_number ($) {
1453 # This returns the input number with underscores inserted every 3 digits
1454 # in large (5 digits or more) numbers. Input must be entirely digits, not
1458 my $pos = length($number) - 3;
1459 return $number if $pos <= 1;
1461 substr($number, $pos, 0) = '_';
1470 # These routines give a uniform treatment of messages in this program. They
1471 # are placed in the Carp package to cause the stack trace to not include them,
1472 # although an alternative would be to use another package and set @CARP_NOT
1475 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1477 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1478 # and overload trying to load Scalar:Util under miniperl. See
1479 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1480 undef $overload::VERSION;
1483 my $message = shift || "";
1484 my $nofold = shift || 0;
1487 $message = main::join_lines($message);
1488 $message =~ s/^$0: *//; # Remove initial program name
1489 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1490 $message = "\n$0: $message;";
1492 # Fold the message with program name, semi-colon end punctuation
1493 # (which looks good with the message that carp appends to it), and a
1494 # hanging indent for continuation lines.
1495 $message = main::simple_fold($message, "", 4) unless $nofold;
1496 $message =~ s/\n$//; # Remove the trailing nl so what carp
1497 # appends is to the same line
1500 return $message if defined wantarray; # If a caller just wants the msg
1507 # This is called when it is clear that the problem is caused by a bug in
1510 my $message = shift;
1511 $message =~ s/^$0: *//;
1512 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1517 sub carp_too_few_args {
1519 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1523 my $args_ref = shift;
1526 my_carp_bug("Need at least $count arguments to "
1528 . ". Instead got: '"
1529 . join ', ', @$args_ref
1530 . "'. No action taken.");
1534 sub carp_extra_args {
1535 my $args_ref = shift;
1536 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1538 unless (ref $args_ref) {
1539 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1542 my ($package, $file, $line) = caller;
1543 my $subroutine = (caller 1)[3];
1546 if (ref $args_ref eq 'HASH') {
1547 foreach my $key (keys %$args_ref) {
1548 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1550 $list = join ', ', each %{$args_ref};
1552 elsif (ref $args_ref eq 'ARRAY') {
1553 foreach my $arg (@$args_ref) {
1554 $arg = $UNDEF unless defined $arg;
1556 $list = join ', ', @$args_ref;
1559 my_carp_bug("Can't cope with ref "
1561 . " . argument to 'carp_extra_args'. Not checking arguments.");
1565 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1573 # This program uses the inside-out method for objects, as recommended in
1574 # "Perl Best Practices". This closure aids in generating those. There
1575 # are two routines. setup_package() is called once per package to set
1576 # things up, and then set_access() is called for each hash representing a
1577 # field in the object. These routines arrange for the object to be
1578 # properly destroyed when no longer used, and for standard accessor
1579 # functions to be generated. If you need more complex accessors, just
1580 # write your own and leave those accesses out of the call to set_access().
1581 # More details below.
1583 my %constructor_fields; # fields that are to be used in constructors; see
1586 # The values of this hash will be the package names as keys to other
1587 # hashes containing the name of each field in the package as keys, and
1588 # references to their respective hashes as values.
1592 # Sets up the package, creating standard DESTROY and dump methods
1593 # (unless already defined). The dump method is used in debugging by
1595 # The optional parameters are:
1596 # a) a reference to a hash, that gets populated by later
1597 # set_access() calls with one of the accesses being
1598 # 'constructor'. The caller can then refer to this, but it is
1599 # not otherwise used by these two routines.
1600 # b) a reference to a callback routine to call during destruction
1601 # of the object, before any fields are actually destroyed
1604 my $constructor_ref = delete $args{'Constructor_Fields'};
1605 my $destroy_callback = delete $args{'Destroy_Callback'};
1606 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1609 my $package = (caller)[0];
1611 $package_fields{$package} = \%fields;
1612 $constructor_fields{$package} = $constructor_ref;
1614 unless ($package->can('DESTROY')) {
1615 my $destroy_name = "${package}::DESTROY";
1618 # Use typeglob to give the anonymous subroutine the name we want
1619 *$destroy_name = sub {
1621 my $addr = do { no overloading; pack 'J', $self; };
1623 $self->$destroy_callback if $destroy_callback;
1624 foreach my $field (keys %{$package_fields{$package}}) {
1625 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1626 delete $package_fields{$package}{$field}{$addr};
1632 unless ($package->can('dump')) {
1633 my $dump_name = "${package}::dump";
1637 return dump_inside_out($self, $package_fields{$package}, @_);
1644 # Arrange for the input field to be garbage collected when no longer
1645 # needed. Also, creates standard accessor functions for the field
1646 # based on the optional parameters-- none if none of these parameters:
1647 # 'addable' creates an 'add_NAME()' accessor function.
1648 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1650 # 'settable' creates a 'set_NAME()' accessor function.
1651 # 'constructor' doesn't create an accessor function, but adds the
1652 # field to the hash that was previously passed to
1654 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1655 # 'add' etc. all mean 'addable'.
1656 # The read accessor function will work on both array and scalar
1657 # values. If another accessor in the parameter list is 'a', the read
1658 # access assumes an array. You can also force it to be array access
1659 # by specifying 'readable_array' instead of 'readable'
1661 # A sort-of 'protected' access can be set-up by preceding the addable,
1662 # readable or settable with some initial portion of 'protected_' (but,
1663 # the underscore is required), like 'p_a', 'pro_set', etc. The
1664 # "protection" is only by convention. All that happens is that the
1665 # accessor functions' names begin with an underscore. So instead of
1666 # calling set_foo, the call is _set_foo. (Real protection could be
1667 # accomplished by having a new subroutine, end_package, called at the
1668 # end of each package, and then storing the __LINE__ ranges and
1669 # checking them on every accessor. But that is way overkill.)
1671 # We create anonymous subroutines as the accessors and then use
1672 # typeglobs to assign them to the proper package and name
1674 my $name = shift; # Name of the field
1675 my $field = shift; # Reference to the inside-out hash containing the
1678 my $package = (caller)[0];
1680 if (! exists $package_fields{$package}) {
1681 croak "$0: Must call 'setup_package' before 'set_access'";
1684 # Stash the field so DESTROY can get it.
1685 $package_fields{$package}{$name} = $field;
1687 # Remaining arguments are the accessors. For each...
1688 foreach my $access (@_) {
1689 my $access = lc $access;
1693 # Match the input as far as it goes.
1694 if ($access =~ /^(p[^_]*)_/) {
1696 if (substr('protected_', 0, length $protected)
1700 # Add 1 for the underscore not included in $protected
1701 $access = substr($access, length($protected) + 1);
1709 if (substr('addable', 0, length $access) eq $access) {
1710 my $subname = "${package}::${protected}add_$name";
1713 # add_ accessor. Don't add if already there, which we
1714 # determine using 'eq' for scalars and '==' otherwise.
1717 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1720 my $addr = do { no overloading; pack 'J', $self; };
1721 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1723 return if grep { $value == $_ } @{$field->{$addr}};
1726 return if grep { $value eq $_ } @{$field->{$addr}};
1728 push @{$field->{$addr}}, $value;
1732 elsif (substr('constructor', 0, length $access) eq $access) {
1734 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1737 $constructor_fields{$package}{$name} = $field;
1740 elsif (substr('readable_array', 0, length $access) eq $access) {
1742 # Here has read access. If one of the other parameters for
1743 # access is array, or this one specifies array (by being more
1744 # than just 'readable_'), then create a subroutine that
1745 # assumes the data is an array. Otherwise just a scalar
1746 my $subname = "${package}::${protected}$name";
1747 if (grep { /^a/i } @_
1748 or length($access) > length('readable_'))
1753 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1754 my $addr = do { no overloading; pack 'J', $_[0]; };
1755 if (ref $field->{$addr} ne 'ARRAY') {
1756 my $type = ref $field->{$addr};
1757 $type = 'scalar' unless $type;
1758 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1761 return scalar @{$field->{$addr}} unless wantarray;
1763 # Make a copy; had problems with caller modifying the
1764 # original otherwise
1765 my @return = @{$field->{$addr}};
1771 # Here not an array value, a simpler function.
1775 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1777 return $field->{pack 'J', $_[0]};
1781 elsif (substr('settable', 0, length $access) eq $access) {
1782 my $subname = "${package}::${protected}set_$name";
1787 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1788 Carp::carp_extra_args(\@_) if @_ > 2;
1790 # $self is $_[0]; $value is $_[1]
1792 $field->{pack 'J', $_[0]} = $_[1];
1797 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1806 # All input files use this object, which stores various attributes about them,
1807 # and provides for convenient, uniform handling. The run method wraps the
1808 # processing. It handles all the bookkeeping of opening, reading, and closing
1809 # the file, returning only significant input lines.
1811 # Each object gets a handler which processes the body of the file, and is
1812 # called by run(). Most should use the generic, default handler, which has
1813 # code scrubbed to handle things you might not expect. A handler should
1814 # basically be a while(next_line()) {...} loop.
1816 # You can also set up handlers to
1817 # 1) call before the first line is read for pre processing
1818 # 2) call to adjust each line of the input before the main handler gets them
1819 # 3) call upon EOF before the main handler exits its loop
1820 # 4) call at the end for post processing
1822 # $_ is used to store the input line, and is to be filtered by the
1823 # each_line_handler()s. So, if the format of the line is not in the desired
1824 # format for the main handler, these are used to do that adjusting. They can
1825 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1826 # so the $_ output of one is used as the input to the next. None of the other
1827 # handlers are stackable, but could easily be changed to be so.
1829 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1830 # which insert the parameters as lines to be processed before the next input
1831 # file line is read. This allows the EOF handler to flush buffers, for
1832 # example. The difference between the two routines is that the lines inserted
1833 # by insert_lines() are subjected to the each_line_handler()s. (So if you
1834 # called it from such a handler, you would get infinite recursion.) Lines
1835 # inserted by insert_adjusted_lines() go directly to the main handler without
1836 # any adjustments. If the post-processing handler calls any of these, there
1837 # will be no effect. Some error checking for these conditions could be added,
1838 # but it hasn't been done.
1840 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1841 # to prevent further processing of the line. This routine will output the
1842 # message as a warning once, and then keep a count of the lines that have the
1843 # same message, and output that count at the end of the file's processing.
1844 # This keeps the number of messages down to a manageable amount.
1846 # get_missings() should be called to retrieve any @missing input lines.
1847 # Messages will be raised if this isn't done if the options aren't to ignore
1850 sub trace { return main::trace(@_); }
1853 # Keep track of fields that are to be put into the constructor.
1854 my %constructor_fields;
1856 main::setup_package(Constructor_Fields => \%constructor_fields);
1858 my %file; # Input file name, required
1859 main::set_access('file', \%file, qw{ c r });
1861 my %first_released; # Unicode version file was first released in, required
1862 main::set_access('first_released', \%first_released, qw{ c r });
1864 my %handler; # Subroutine to process the input file, defaults to
1865 # 'process_generic_property_file'
1866 main::set_access('handler', \%handler, qw{ c });
1869 # name of property this file is for. defaults to none, meaning not
1870 # applicable, or is otherwise determinable, for example, from each line.
1871 main::set_access('property', \%property, qw{ c });
1874 # If this is true, the file is optional. If not present, no warning is
1875 # output. If it is present, the string given by this parameter is
1876 # evaluated, and if false the file is not processed.
1877 main::set_access('optional', \%optional, 'c', 'r');
1880 # This is used for debugging, to skip processing of all but a few input
1881 # files. Add 'non_skip => 1' to the constructor for those files you want
1882 # processed when you set the $debug_skip global.
1883 main::set_access('non_skip', \%non_skip, 'c');
1886 # This is used to skip processing of this input file semi-permanently.
1887 # It is used for files that we aren't planning to process anytime soon,
1888 # but want to allow to be in the directory and not raise a message that we
1889 # are not handling. Mostly for test files. This is in contrast to the
1890 # non_skip element, which is supposed to be used very temporarily for
1891 # debugging. Sets 'optional' to 1
1892 main::set_access('skip', \%skip, 'c');
1894 my %each_line_handler;
1895 # list of subroutines to look at and filter each non-comment line in the
1896 # file. defaults to none. The subroutines are called in order, each is
1897 # to adjust $_ for the next one, and the final one adjusts it for
1899 main::set_access('each_line_handler', \%each_line_handler, 'c');
1901 my %has_missings_defaults;
1902 # ? Are there lines in the file giving default values for code points
1903 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1904 # the norm, but IGNORED means it has such lines, but the handler doesn't
1905 # use them. Having these three states allows us to catch changes to the
1906 # UCD that this program should track
1907 main::set_access('has_missings_defaults',
1908 \%has_missings_defaults, qw{ c r });
1911 # Subroutine to call before doing anything else in the file. If undef, no
1912 # such handler is called.
1913 main::set_access('pre_handler', \%pre_handler, qw{ c });
1916 # Subroutine to call upon getting an EOF on the input file, but before
1917 # that is returned to the main handler. This is to allow buffers to be
1918 # flushed. The handler is expected to call insert_lines() or
1919 # insert_adjusted() with the buffered material
1920 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1923 # Subroutine to call after all the lines of the file are read in and
1924 # processed. If undef, no such handler is called.
1925 main::set_access('post_handler', \%post_handler, qw{ c });
1927 my %progress_message;
1928 # Message to print to display progress in lieu of the standard one
1929 main::set_access('progress_message', \%progress_message, qw{ c });
1932 # cache open file handle, internal. Is undef if file hasn't been
1933 # processed at all, empty if has;
1934 main::set_access('handle', \%handle);
1937 # cache of lines added virtually to the file, internal
1938 main::set_access('added_lines', \%added_lines);
1941 # cache of errors found, internal
1942 main::set_access('errors', \%errors);
1945 # storage of '@missing' defaults lines
1946 main::set_access('missings', \%missings);
1951 my $self = bless \do{ my $anonymous_scalar }, $class;
1952 my $addr = do { no overloading; pack 'J', $self; };
1955 $handler{$addr} = \&main::process_generic_property_file;
1956 $non_skip{$addr} = 0;
1958 $has_missings_defaults{$addr} = $NO_DEFAULTS;
1959 $handle{$addr} = undef;
1960 $added_lines{$addr} = [ ];
1961 $each_line_handler{$addr} = [ ];
1962 $errors{$addr} = { };
1963 $missings{$addr} = [ ];
1965 # Two positional parameters.
1966 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1967 $file{$addr} = main::internal_file_to_platform(shift);
1968 $first_released{$addr} = shift;
1970 # The rest of the arguments are key => value pairs
1971 # %constructor_fields has been set up earlier to list all possible
1972 # ones. Either set or push, depending on how the default has been set
1975 foreach my $key (keys %args) {
1976 my $argument = $args{$key};
1978 # Note that the fields are the lower case of the constructor keys
1979 my $hash = $constructor_fields{lc $key};
1980 if (! defined $hash) {
1981 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
1984 if (ref $hash->{$addr} eq 'ARRAY') {
1985 if (ref $argument eq 'ARRAY') {
1986 foreach my $argument (@{$argument}) {
1987 next if ! defined $argument;
1988 push @{$hash->{$addr}}, $argument;
1992 push @{$hash->{$addr}}, $argument if defined $argument;
1996 $hash->{$addr} = $argument;
2001 # If the file has a property for it, it means that the property is not
2002 # listed in the file's entries. So add a handler to the list of line
2003 # handlers to insert the property name into the lines, to provide a
2004 # uniform interface to the final processing subroutine.
2005 # the final code doesn't have to worry about that.
2006 if ($property{$addr}) {
2007 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2010 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2011 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2014 $optional{$addr} = 1 if $skip{$addr};
2022 qw("") => "_operator_stringify",
2023 "." => \&main::_operator_dot,
2026 sub _operator_stringify {
2029 return __PACKAGE__ . " object for " . $self->file;
2032 # flag to make sure extracted files are processed early
2033 my $seen_non_extracted_non_age = 0;
2036 # Process the input object $self. This opens and closes the file and
2037 # calls all the handlers for it. Currently, this can only be called
2038 # once per file, as it destroy's the EOF handler
2041 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2043 my $addr = do { no overloading; pack 'J', $self; };
2045 my $file = $file{$addr};
2047 # Don't process if not expecting this file (because released later
2048 # than this Unicode version), and isn't there. This means if someone
2049 # copies it into an earlier version's directory, we will go ahead and
2051 return if $first_released{$addr} gt $v_version && ! -e $file;
2053 # If in debugging mode and this file doesn't have the non-skip
2054 # flag set, and isn't one of the critical files, skip it.
2056 && $first_released{$addr} ne v0
2057 && ! $non_skip{$addr})
2059 print "Skipping $file in debugging\n" if $verbosity;
2063 # File could be optional
2064 if ($optional{$addr}) {
2065 return unless -e $file;
2066 my $result = eval $optional{$addr};
2067 if (! defined $result) {
2068 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2073 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2079 if (! defined $file || ! -e $file) {
2081 # If the file doesn't exist, see if have internal data for it
2082 # (based on first_released being 0).
2083 if ($first_released{$addr} eq v0) {
2084 $handle{$addr} = 'pretend_is_open';
2087 if (! $optional{$addr} # File could be optional
2088 && $v_version ge $first_released{$addr})
2090 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2097 # Here, the file exists. Some platforms may change the case of
2099 if ($seen_non_extracted_non_age) {
2100 if ($file =~ /$EXTRACTED/i) {
2101 Carp::my_carp_bug(join_lines(<<END
2102 $file should be processed just after the 'Prop...Alias' files, and before
2103 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2104 have subtle problems
2109 elsif ($EXTRACTED_DIR
2110 && $first_released{$addr} ne v0
2111 && $file !~ /$EXTRACTED/i
2112 && lc($file) ne 'dage.txt')
2114 # We don't set this (by the 'if' above) if we have no
2115 # extracted directory, so if running on an early version,
2116 # this test won't work. Not worth worrying about.
2117 $seen_non_extracted_non_age = 1;
2120 # And mark the file as having being processed, and warn if it
2121 # isn't a file we are expecting. As we process the files,
2122 # they are deleted from the hash, so any that remain at the
2123 # end of the program are files that we didn't process.
2124 my $fkey = File::Spec->rel2abs($file);
2125 my $expecting = delete $potential_files{$fkey};
2126 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
2127 Carp::my_carp("Was not expecting '$file'.") if
2129 && ! defined $handle{$addr};
2131 # Having deleted from expected files, we can quit if not to do
2132 # anything. Don't print progress unless really want verbosity
2134 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2138 # Open the file, converting the slashes used in this program
2139 # into the proper form for the OS
2141 if (not open $file_handle, "<", $file) {
2142 Carp::my_carp("Can't open $file. Skipping: $!");
2145 $handle{$addr} = $file_handle; # Cache the open file handle
2148 if ($verbosity >= $PROGRESS) {
2149 if ($progress_message{$addr}) {
2150 print "$progress_message{$addr}\n";
2153 # If using a virtual file, say so.
2154 print "Processing ", (-e $file)
2156 : "substitute $file",
2162 # Call any special handler for before the file.
2163 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2165 # Then the main handler
2166 &{$handler{$addr}}($self);
2168 # Then any special post-file handler.
2169 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2171 # If any errors have been accumulated, output the counts (as the first
2172 # error message in each class was output when it was encountered).
2173 if ($errors{$addr}) {
2176 foreach my $error (keys %{$errors{$addr}}) {
2177 $total += $errors{$addr}->{$error};
2178 delete $errors{$addr}->{$error};
2183 = "A total of $total lines had errors in $file. ";
2185 $message .= ($types == 1)
2186 ? '(Only the first one was displayed.)'
2187 : '(Only the first of each type was displayed.)';
2188 Carp::my_carp($message);
2192 if (@{$missings{$addr}}) {
2193 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2196 # If a real file handle, close it.
2197 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2199 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2200 # the file, as opposed to undef
2205 # Sets $_ to be the next logical input line, if any. Returns non-zero
2206 # if such a line exists. 'logical' means that any lines that have
2207 # been added via insert_lines() will be returned in $_ before the file
2211 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2213 my $addr = do { no overloading; pack 'J', $self; };
2215 # Here the file is open (or if the handle is not a ref, is an open
2216 # 'virtual' file). Get the next line; any inserted lines get priority
2217 # over the file itself.
2221 while (1) { # Loop until find non-comment, non-empty line
2222 #local $to_trace = 1 if main::DEBUG;
2223 my $inserted_ref = shift @{$added_lines{$addr}};
2224 if (defined $inserted_ref) {
2225 ($adjusted, $_) = @{$inserted_ref};
2226 trace $adjusted, $_ if main::DEBUG && $to_trace;
2227 return 1 if $adjusted;
2230 last if ! ref $handle{$addr}; # Don't read unless is real file
2231 last if ! defined ($_ = readline $handle{$addr});
2234 trace $_ if main::DEBUG && $to_trace;
2236 # See if this line is the comment line that defines what property
2237 # value that code points that are not listed in the file should
2238 # have. The format or existence of these lines is not guaranteed
2239 # by Unicode since they are comments, but the documentation says
2240 # that this was added for machine-readability, so probably won't
2241 # change. This works starting in Unicode Version 5.0. They look
2244 # @missing: 0000..10FFFF; Not_Reordered
2245 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2246 # @missing: 0000..10FFFF; ; NaN
2248 # Save the line for a later get_missings() call.
2249 if (/$missing_defaults_prefix/) {
2250 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2251 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2253 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2254 my @defaults = split /\s* ; \s*/x, $_;
2256 # The first field is the @missing, which ends in a
2257 # semi-colon, so can safely shift.
2260 # Some of these lines may have empty field placeholders
2261 # which get in the way. An example is:
2262 # @missing: 0000..10FFFF; ; NaN
2263 # Remove them. Process starting from the top so the
2264 # splice doesn't affect things still to be looked at.
2265 for (my $i = @defaults - 1; $i >= 0; $i--) {
2266 next if $defaults[$i] ne "";
2267 splice @defaults, $i, 1;
2270 # What's left should be just the property (maybe) and the
2271 # default. Having only one element means it doesn't have
2275 if (@defaults >= 1) {
2276 if (@defaults == 1) {
2277 $default = $defaults[0];
2280 $property = $defaults[0];
2281 $default = $defaults[1];
2287 || ($default =~ /^</
2288 && $default !~ /^<code *point>$/i
2289 && $default !~ /^<none>$/i))
2291 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2295 # If the property is missing from the line, it should
2296 # be the one for the whole file
2297 $property = $property{$addr} if ! defined $property;
2299 # Change <none> to the null string, which is what it
2300 # really means. If the default is the code point
2301 # itself, set it to <code point>, which is what
2302 # Unicode uses (but sometimes they've forgotten the
2304 if ($default =~ /^<none>$/i) {
2307 elsif ($default =~ /^<code *point>$/i) {
2308 $default = $CODE_POINT;
2311 # Store them as a sub-arrays with both components.
2312 push @{$missings{$addr}}, [ $default, $property ];
2316 # There is nothing for the caller to process on this comment
2321 # Remove comments and trailing space, and skip this line if the
2327 # Call any handlers for this line, and skip further processing of
2328 # the line if the handler sets the line to null.
2329 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2334 # Here the line is ok. return success.
2336 } # End of looping through lines.
2338 # If there is an EOF handler, call it (only once) and if it generates
2339 # more lines to process go back in the loop to handle them.
2340 if ($eof_handler{$addr}) {
2341 &{$eof_handler{$addr}}($self);
2342 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2343 goto LINE if $added_lines{$addr};
2346 # Return failure -- no more lines.
2351 # Not currently used, not fully tested.
2353 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2354 # # record. Not callable from an each_line_handler(), nor does it call
2355 # # an each_line_handler() on the line.
2358 # my $addr = do { no overloading; pack 'J', $self; };
2360 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2361 # my ($adjusted, $line) = @{$inserted_ref};
2362 # next if $adjusted;
2364 # # Remove comments and trailing space, and return a non-empty
2367 # $line =~ s/\s+$//;
2368 # return $line if $line ne "";
2371 # return if ! ref $handle{$addr}; # Don't read unless is real file
2372 # while (1) { # Loop until find non-comment, non-empty line
2373 # local $to_trace = 1 if main::DEBUG;
2374 # trace $_ if main::DEBUG && $to_trace;
2375 # return if ! defined (my $line = readline $handle{$addr});
2377 # push @{$added_lines{$addr}}, [ 0, $line ];
2380 # $line =~ s/\s+$//;
2381 # return $line if $line ne "";
2389 # Lines can be inserted so that it looks like they were in the input
2390 # file at the place it was when this routine is called. See also
2391 # insert_adjusted_lines(). Lines inserted via this routine go through
2392 # any each_line_handler()
2396 # Each inserted line is an array, with the first element being 0 to
2397 # indicate that this line hasn't been adjusted, and needs to be
2400 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2404 sub insert_adjusted_lines {
2405 # Lines can be inserted so that it looks like they were in the input
2406 # file at the place it was when this routine is called. See also
2407 # insert_lines(). Lines inserted via this routine are already fully
2408 # adjusted, ready to be processed; each_line_handler()s handlers will
2409 # not be called. This means this is not a completely general
2410 # facility, as only the last each_line_handler on the stack should
2411 # call this. It could be made more general, by passing to each of the
2412 # line_handlers their position on the stack, which they would pass on
2413 # to this routine, and that would replace the boolean first element in
2414 # the anonymous array pushed here, so that the next_line routine could
2415 # use that to call only those handlers whose index is after it on the
2416 # stack. But this is overkill for what is needed now.
2419 trace $_[0] if main::DEBUG && $to_trace;
2421 # Each inserted line is an array, with the first element being 1 to
2422 # indicate that this line has been adjusted
2424 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2429 # Returns the stored up @missings lines' values, and clears the list.
2430 # The values are in an array, consisting of the default in the first
2431 # element, and the property in the 2nd. However, since these lines
2432 # can be stacked up, the return is an array of all these arrays.
2435 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2437 my $addr = do { no overloading; pack 'J', $self; };
2439 # If not accepting a list return, just return the first one.
2440 return shift @{$missings{$addr}} unless wantarray;
2442 my @return = @{$missings{$addr}};
2443 undef @{$missings{$addr}};
2447 sub _insert_property_into_line {
2448 # Add a property field to $_, if this file requires it.
2451 my $addr = do { no overloading; pack 'J', $self; };
2452 my $property = $property{$addr};
2453 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2455 $_ =~ s/(;|$)/; $property$1/;
2460 # Output consistent error messages, using either a generic one, or the
2461 # one given by the optional parameter. To avoid gazillions of the
2462 # same message in case the syntax of a file is way off, this routine
2463 # only outputs the first instance of each message, incrementing a
2464 # count so the totals can be output at the end of the file.
2467 my $message = shift;
2468 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2470 my $addr = do { no overloading; pack 'J', $self; };
2472 $message = 'Unexpected line' unless $message;
2474 # No trailing punctuation so as to fit with our addenda.
2475 $message =~ s/[.:;,]$//;
2477 # If haven't seen this exact message before, output it now. Otherwise
2478 # increment the count of how many times it has occurred
2479 unless ($errors{$addr}->{$message}) {
2480 Carp::my_carp("$message in '$_' in "
2482 . " at line $.. Skipping this line;");
2483 $errors{$addr}->{$message} = 1;
2486 $errors{$addr}->{$message}++;
2489 # Clear the line to prevent any further (meaningful) processing of it.
2496 package Multi_Default;
2498 # Certain properties in early versions of Unicode had more than one possible
2499 # default for code points missing from the files. In these cases, one
2500 # default applies to everything left over after all the others are applied,
2501 # and for each of the others, there is a description of which class of code
2502 # points applies to it. This object helps implement this by storing the
2503 # defaults, and for all but that final default, an eval string that generates
2504 # the class that it applies to.
2509 main::setup_package();
2512 # The defaults structure for the classes
2513 main::set_access('class_defaults', \%class_defaults);
2516 # The default that applies to everything left over.
2517 main::set_access('other_default', \%other_default, 'r');
2521 # The constructor is called with default => eval pairs, terminated by
2522 # the left-over default. e.g.
2523 # Multi_Default->new(
2524 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2526 # 'R' => 'some other expression that evaluates to code points',
2534 my $self = bless \do{my $anonymous_scalar}, $class;
2535 my $addr = do { no overloading; pack 'J', $self; };
2538 my $default = shift;
2540 $class_defaults{$addr}->{$default} = $eval;
2543 $other_default{$addr} = shift;
2548 sub get_next_defaults {
2549 # Iterates and returns the next class of defaults.
2551 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2553 my $addr = do { no overloading; pack 'J', $self; };
2555 return each %{$class_defaults{$addr}};
2561 # An alias is one of the names that a table goes by. This class defines them
2562 # including some attributes. Everything is currently setup in the
2568 main::setup_package();
2571 main::set_access('name', \%name, 'r');
2574 # Determined by the constructor code if this name should match loosely or
2575 # not. The constructor parameters can override this, but it isn't fully
2576 # implemented, as should have ability to override Unicode one's via
2577 # something like a set_loose_match()
2578 main::set_access('loose_match', \%loose_match, 'r');
2581 # Some aliases should not get their own entries because they are covered
2582 # by a wild-card, and some we want to discourage use of. Binary
2583 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2586 # Aliases have a status, like deprecated, or even suppressed (which means
2587 # they don't appear in documentation). Enum
2588 main::set_access('status', \%status, 'r');
2591 # Similarly, some aliases should not be considered as usable ones for
2592 # external use, such as file names, or we don't want documentation to
2593 # recommend them. Boolean
2594 main::set_access('externally_ok', \%externally_ok, 'r');
2599 my $self = bless \do { my $anonymous_scalar }, $class;
2600 my $addr = do { no overloading; pack 'J', $self; };
2602 $name{$addr} = shift;
2603 $loose_match{$addr} = shift;
2604 $make_pod_entry{$addr} = shift;
2605 $externally_ok{$addr} = shift;
2606 $status{$addr} = shift;
2608 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2610 # Null names are never ok externally
2611 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2619 # A range is the basic unit for storing code points, and is described in the
2620 # comments at the beginning of the program. Each range has a starting code
2621 # point; an ending code point (not less than the starting one); a value
2622 # that applies to every code point in between the two end-points, inclusive;
2623 # and an enum type that applies to the value. The type is for the user's
2624 # convenience, and has no meaning here, except that a non-zero type is
2625 # considered to not obey the normal Unicode rules for having standard forms.
2627 # The same structure is used for both map and match tables, even though in the
2628 # latter, the value (and hence type) is irrelevant and could be used as a
2629 # comment. In map tables, the value is what all the code points in the range
2630 # map to. Type 0 values have the standardized version of the value stored as
2631 # well, so as to not have to recalculate it a lot.
2633 sub trace { return main::trace(@_); }
2637 main::setup_package();
2640 main::set_access('start', \%start, 'r', 's');
2643 main::set_access('end', \%end, 'r', 's');
2646 main::set_access('value', \%value, 'r');
2649 main::set_access('type', \%type, 'r');
2652 # The value in internal standard form. Defined only if the type is 0.
2653 main::set_access('standard_form', \%standard_form);
2655 # Note that if these fields change, the dump() method should as well
2658 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2661 my $self = bless \do { my $anonymous_scalar }, $class;
2662 my $addr = do { no overloading; pack 'J', $self; };
2664 $start{$addr} = shift;
2665 $end{$addr} = shift;
2669 my $value = delete $args{'Value'}; # Can be 0
2670 $value = "" unless defined $value;
2671 $value{$addr} = $value;
2673 $type{$addr} = delete $args{'Type'} || 0;
2675 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2677 if (! $type{$addr}) {
2678 $standard_form{$addr} = main::standardize($value);
2686 qw("") => "_operator_stringify",
2687 "." => \&main::_operator_dot,
2690 sub _operator_stringify {
2692 my $addr = do { no overloading; pack 'J', $self; };
2694 # Output it like '0041..0065 (value)'
2695 my $return = sprintf("%04X", $start{$addr})
2697 . sprintf("%04X", $end{$addr});
2698 my $value = $value{$addr};
2699 my $type = $type{$addr};
2701 $return .= "$value";
2702 $return .= ", Type=$type" if $type != 0;
2709 # The standard form is the value itself if the standard form is
2710 # undefined (that is if the value is special)
2713 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2715 my $addr = do { no overloading; pack 'J', $self; };
2717 return $standard_form{$addr} if defined $standard_form{$addr};
2718 return $value{$addr};
2722 # Human, not machine readable. For machine readable, comment out this
2723 # entire routine and let the standard one take effect.
2726 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2728 my $addr = do { no overloading; pack 'J', $self; };
2730 my $return = $indent
2731 . sprintf("%04X", $start{$addr})
2733 . sprintf("%04X", $end{$addr})
2734 . " '$value{$addr}';";
2735 if (! defined $standard_form{$addr}) {
2736 $return .= "(type=$type{$addr})";
2738 elsif ($standard_form{$addr} ne $value{$addr}) {
2739 $return .= "(standard '$standard_form{$addr}')";
2745 package _Range_List_Base;
2747 # Base class for range lists. A range list is simply an ordered list of
2748 # ranges, so that the ranges with the lowest starting numbers are first in it.
2750 # When a new range is added that is adjacent to an existing range that has the
2751 # same value and type, it merges with it to form a larger range.
2753 # Ranges generally do not overlap, except that there can be multiple entries
2754 # of single code point ranges. This is because of NameAliases.txt.
2756 # In this program, there is a standard value such that if two different
2757 # values, have the same standard value, they are considered equivalent. This
2758 # value was chosen so that it gives correct results on Unicode data
2760 # There are a number of methods to manipulate range lists, and some operators
2761 # are overloaded to handle them.
2763 sub trace { return main::trace(@_); }
2769 main::setup_package();
2772 # The list of ranges
2773 main::set_access('ranges', \%ranges, 'readable_array');
2776 # The highest code point in the list. This was originally a method, but
2777 # actual measurements said it was used a lot.
2778 main::set_access('max', \%max, 'r');
2780 my %each_range_iterator;
2781 # Iterator position for each_range()
2782 main::set_access('each_range_iterator', \%each_range_iterator);
2785 # Name of parent this is attached to, if any. Solely for better error
2787 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2789 my %_search_ranges_cache;
2790 # A cache of the previous result from _search_ranges(), for better
2792 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2798 # Optional initialization data for the range list.
2799 my $initialize = delete $args{'Initialize'};
2803 # Use _union() to initialize. _union() returns an object of this
2804 # class, which means that it will call this constructor recursively.
2805 # But it won't have this $initialize parameter so that it won't
2806 # infinitely loop on this.
2807 return _union($class, $initialize, %args) if defined $initialize;
2809 $self = bless \do { my $anonymous_scalar }, $class;
2810 my $addr = do { no overloading; pack 'J', $self; };
2812 # Optional parent object, only for debug info.
2813 $owner_name_of{$addr} = delete $args{'Owner'};
2814 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2816 # Stringify, in case it is an object.
2817 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2819 # This is used only for error messages, and so a colon is added
2820 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2822 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2824 # Max is initialized to a negative value that isn't adjacent to 0,
2828 $_search_ranges_cache{$addr} = 0;
2829 $ranges{$addr} = [];
2836 qw("") => "_operator_stringify",
2837 "." => \&main::_operator_dot,
2840 sub _operator_stringify {
2842 my $addr = do { no overloading; pack 'J', $self; };
2844 return "Range_List attached to '$owner_name_of{$addr}'"
2845 if $owner_name_of{$addr};
2846 return "anonymous Range_List " . \$self;
2850 # Returns the union of the input code points. It can be called as
2851 # either a constructor or a method. If called as a method, the result
2852 # will be a new() instance of the calling object, containing the union
2853 # of that object with the other parameter's code points; if called as
2854 # a constructor, the first parameter gives the class the new object
2855 # should be, and the second parameter gives the code points to go into
2857 # In either case, there are two parameters looked at by this routine;
2858 # any additional parameters are passed to the new() constructor.
2860 # The code points can come in the form of some object that contains
2861 # ranges, and has a conventionally named method to access them; or
2862 # they can be an array of individual code points (as integers); or
2863 # just a single code point.
2865 # If they are ranges, this routine doesn't make any effort to preserve
2866 # the range values of one input over the other. Therefore this base
2867 # class should not allow _union to be called from other than
2868 # initialization code, so as to prevent two tables from being added
2869 # together where the range values matter. The general form of this
2870 # routine therefore belongs in a derived class, but it was moved here
2871 # to avoid duplication of code. The failure to overload this in this
2872 # class keeps it safe.
2876 my @args; # Arguments to pass to the constructor
2880 # If a method call, will start the union with the object itself, and
2881 # the class of the new object will be the same as self.
2888 # Add the other required parameter.
2890 # Rest of parameters are passed on to the constructor
2892 # Accumulate all records from both lists.
2894 for my $arg (@args) {
2895 #local $to_trace = 0 if main::DEBUG;
2896 trace "argument = $arg" if main::DEBUG && $to_trace;
2897 if (! defined $arg) {
2899 if (defined $self) {
2901 $message .= $owner_name_of{pack 'J', $self};
2903 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2906 $arg = [ $arg ] if ! ref $arg;
2907 my $type = ref $arg;
2908 if ($type eq 'ARRAY') {
2909 foreach my $element (@$arg) {
2910 push @records, Range->new($element, $element);
2913 elsif ($arg->isa('Range')) {
2914 push @records, $arg;
2916 elsif ($arg->can('ranges')) {
2917 push @records, $arg->ranges;
2921 if (defined $self) {
2923 $message .= $owner_name_of{pack 'J', $self};
2925 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2930 # Sort with the range containing the lowest ordinal first, but if
2931 # two ranges start at the same code point, sort with the bigger range
2932 # of the two first, because it takes fewer cycles.
2933 @records = sort { ($a->start <=> $b->start)
2935 # if b is shorter than a, b->end will be
2936 # less than a->end, and we want to select
2937 # a, so want to return -1
2938 ($b->end <=> $a->end)
2941 my $new = $class->new(@_);
2943 # Fold in records so long as they add new information.
2944 for my $set (@records) {
2945 my $start = $set->start;
2946 my $end = $set->end;
2947 my $value = $set->value;
2948 if ($start > $new->max) {
2949 $new->_add_delete('+', $start, $end, $value);
2951 elsif ($end > $new->max) {
2952 $new->_add_delete('+', $new->max +1, $end, $value);
2959 sub range_count { # Return the number of ranges in the range list
2961 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2964 return scalar @{$ranges{pack 'J', $self}};
2968 # Returns the minimum code point currently in the range list, or if
2969 # the range list is empty, 2 beyond the max possible. This is a
2970 # method because used so rarely, that not worth saving between calls,
2971 # and having to worry about changing it as ranges are added and
2975 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2977 my $addr = do { no overloading; pack 'J', $self; };
2979 # If the range list is empty, return a large value that isn't adjacent
2980 # to any that could be in the range list, for simpler tests
2981 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2982 return $ranges{$addr}->[0]->start;
2986 # Boolean: Is argument in the range list? If so returns $i such that:
2987 # range[$i]->end < $codepoint <= range[$i+1]->end
2988 # which is one beyond what you want; this is so that the 0th range
2989 # doesn't return false
2991 my $codepoint = shift;
2992 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2994 my $i = $self->_search_ranges($codepoint);
2995 return 0 unless defined $i;
2997 # The search returns $i, such that
2998 # range[$i-1]->end < $codepoint <= range[$i]->end
2999 # So is in the table if and only iff it is at least the start position
3002 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3006 sub containing_range {
3007 # Returns the range object that contains the code point, undef if none
3010 my $codepoint = shift;
3011 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3013 my $i = $self->contains($codepoint);
3016 # contains() returns 1 beyond where we should look
3018 return $ranges{pack 'J', $self}->[$i-1];
3022 # Returns the value associated with the code point, undef if none
3025 my $codepoint = shift;
3026 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3028 my $range = $self->containing_range($codepoint);
3029 return unless defined $range;
3031 return $range->value;
3035 # Returns the type of the range containing the code point, undef if
3036 # the code point is not in the table
3039 my $codepoint = shift;
3040 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3042 my $range = $self->containing_range($codepoint);
3043 return unless defined $range;
3045 return $range->type;
3048 sub _search_ranges {
3049 # Find the range in the list which contains a code point, or where it
3050 # should go if were to add it. That is, it returns $i, such that:
3051 # range[$i-1]->end < $codepoint <= range[$i]->end
3052 # Returns undef if no such $i is possible (e.g. at end of table), or
3053 # if there is an error.
3056 my $code_point = shift;
3057 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3059 my $addr = do { no overloading; pack 'J', $self; };
3061 return if $code_point > $max{$addr};
3062 my $r = $ranges{$addr}; # The current list of ranges
3063 my $range_list_size = scalar @$r;
3066 use integer; # want integer division
3068 # Use the cached result as the starting guess for this one, because,
3069 # an experiment on 5.1 showed that 90% of the time the cache was the
3070 # same as the result on the next call (and 7% it was one less).
3071 $i = $_search_ranges_cache{$addr};
3072 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3073 # from an intervening deletion
3074 #local $to_trace = 1 if main::DEBUG;
3075 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3076 return $i if $code_point <= $r->[$i]->end
3077 && ($i == 0 || $r->[$i-1]->end < $code_point);
3079 # Here the cache doesn't yield the correct $i. Try adding 1.
3080 if ($i < $range_list_size - 1
3081 && $r->[$i]->end < $code_point &&
3082 $code_point <= $r->[$i+1]->end)
3085 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3086 $_search_ranges_cache{$addr} = $i;
3090 # Here, adding 1 also didn't work. We do a binary search to
3091 # find the correct position, starting with current $i
3093 my $upper = $range_list_size - 1;
3095 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3097 if ($code_point <= $r->[$i]->end) {
3099 # Here we have met the upper constraint. We can quit if we
3100 # also meet the lower one.
3101 last if $i == 0 || $r->[$i-1]->end < $code_point;
3103 $upper = $i; # Still too high.
3108 # Here, $r[$i]->end < $code_point, so look higher up.
3112 # Split search domain in half to try again.
3113 my $temp = ($upper + $lower) / 2;
3115 # No point in continuing unless $i changes for next time
3119 # We can't reach the highest element because of the averaging.
3120 # So if one below the upper edge, force it there and try one
3122 if ($i == $range_list_size - 2) {
3124 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3125 $i = $range_list_size - 1;
3127 # Change $lower as well so if fails next time through,
3128 # taking the average will yield the same $i, and we will
3129 # quit with the error message just below.
3133 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3137 } # End of while loop
3139 if (main::DEBUG && $to_trace) {
3140 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3141 trace "i= [ $i ]", $r->[$i];
3142 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3145 # Here we have found the offset. Cache it as a starting point for the
3147 $_search_ranges_cache{$addr} = $i;
3152 # Add, replace or delete ranges to or from a list. The $type
3153 # parameter gives which:
3154 # '+' => insert or replace a range, returning a list of any changed
3156 # '-' => delete a range, returning a list of any deleted ranges.
3158 # The next three parameters give respectively the start, end, and
3159 # value associated with the range. 'value' should be null unless the
3162 # The range list is kept sorted so that the range with the lowest
3163 # starting position is first in the list, and generally, adjacent
3164 # ranges with the same values are merged into a single larger one (see
3165 # exceptions below).
3167 # There are more parameters; all are key => value pairs:
3168 # Type gives the type of the value. It is only valid for '+'.
3169 # All ranges have types; if this parameter is omitted, 0 is
3170 # assumed. Ranges with type 0 are assumed to obey the
3171 # Unicode rules for casing, etc; ranges with other types are
3172 # not. Otherwise, the type is arbitrary, for the caller's
3173 # convenience, and looked at only by this routine to keep
3174 # adjacent ranges of different types from being merged into
3175 # a single larger range, and when Replace =>
3176 # $IF_NOT_EQUIVALENT is specified (see just below).
3177 # Replace determines what to do if the range list already contains
3178 # ranges which coincide with all or portions of the input
3179 # range. It is only valid for '+':
3180 # => $NO means that the new value is not to replace
3181 # any existing ones, but any empty gaps of the
3182 # range list coinciding with the input range
3183 # will be filled in with the new value.
3184 # => $UNCONDITIONALLY means to replace the existing values with
3185 # this one unconditionally. However, if the
3186 # new and old values are identical, the
3187 # replacement is skipped to save cycles
3188 # => $IF_NOT_EQUIVALENT means to replace the existing values
3189 # with this one if they are not equivalent.
3190 # Ranges are equivalent if their types are the
3191 # same, and they are the same string; or if
3192 # both are type 0 ranges, if their Unicode
3193 # standard forms are identical. In this last
3194 # case, the routine chooses the more "modern"
3195 # one to use. This is because some of the
3196 # older files are formatted with values that
3197 # are, for example, ALL CAPs, whereas the
3198 # derived files have a more modern style,
3199 # which looks better. By looking for this
3200 # style when the pre-existing and replacement
3201 # standard forms are the same, we can move to
3203 # => $MULTIPLE means that if this range duplicates an
3204 # existing one, but has a different value,
3205 # don't replace the existing one, but insert
3206 # this, one so that the same range can occur
3208 # => anything else is the same as => $IF_NOT_EQUIVALENT
3210 # "same value" means identical for non-type-0 ranges, and it means
3211 # having the same standard forms for type-0 ranges.
3213 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3216 my $operation = shift; # '+' for add/replace; '-' for delete;
3223 $value = "" if not defined $value; # warning: $value can be "0"
3225 my $replace = delete $args{'Replace'};
3226 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3228 my $type = delete $args{'Type'};
3229 $type = 0 unless defined $type;
3231 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3233 my $addr = do { no overloading; pack 'J', $self; };
3235 if ($operation ne '+' && $operation ne '-') {
3236 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3239 unless (defined $start && defined $end) {
3240 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3243 unless ($end >= $start) {
3244 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3247 #local $to_trace = 1 if main::DEBUG;
3249 if ($operation eq '-') {
3250 if ($replace != $IF_NOT_EQUIVALENT) {
3251 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3252 $replace = $IF_NOT_EQUIVALENT;
3255 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3259 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3264 my $r = $ranges{$addr}; # The current list of ranges
3265 my $range_list_size = scalar @$r; # And its size
3266 my $max = $max{$addr}; # The current high code point in
3267 # the list of ranges
3269 # Do a special case requiring fewer machine cycles when the new range
3270 # starts after the current highest point. The Unicode input data is
3271 # structured so this is common.
3272 if ($start > $max) {
3274 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3275 return if $operation eq '-'; # Deleting a non-existing range is a
3278 # If the new range doesn't logically extend the current final one
3279 # in the range list, create a new range at the end of the range
3280 # list. (max cleverly is initialized to a negative number not
3281 # adjacent to 0 if the range list is empty, so even adding a range
3282 # to an empty range list starting at 0 will have this 'if'
3284 if ($start > $max + 1 # non-adjacent means can't extend.
3285 || @{$r}[-1]->value ne $value # values differ, can't extend.
3286 || @{$r}[-1]->type != $type # types differ, can't extend.
3288 push @$r, Range->new($start, $end,
3294 # Here, the new range starts just after the current highest in
3295 # the range list, and they have the same type and value.
3296 # Extend the current range to incorporate the new one.
3297 @{$r}[-1]->set_end($end);
3300 # This becomes the new maximum.
3305 #local $to_trace = 0 if main::DEBUG;
3307 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3309 # Here, the input range isn't after the whole rest of the range list.
3310 # Most likely 'splice' will be needed. The rest of the routine finds
3311 # the needed splice parameters, and if necessary, does the splice.
3312 # First, find the offset parameter needed by the splice function for
3313 # the input range. Note that the input range may span multiple
3314 # existing ones, but we'll worry about that later. For now, just find
3315 # the beginning. If the input range is to be inserted starting in a
3316 # position not currently in the range list, it must (obviously) come
3317 # just after the range below it, and just before the range above it.
3318 # Slightly less obviously, it will occupy the position currently
3319 # occupied by the range that is to come after it. More formally, we
3320 # are looking for the position, $i, in the array of ranges, such that:
3322 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3324 # (The ordered relationships within existing ranges are also shown in
3325 # the equation above). However, if the start of the input range is
3326 # within an existing range, the splice offset should point to that
3327 # existing range's position in the list; that is $i satisfies a
3328 # somewhat different equation, namely:
3330 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3332 # More briefly, $start can come before or after r[$i]->start, and at
3333 # this point, we don't know which it will be. However, these
3334 # two equations share these constraints:
3336 # r[$i-1]->end < $start <= r[$i]->end
3338 # And that is good enough to find $i.
3340 my $i = $self->_search_ranges($start);
3342 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3346 # The search function returns $i such that:
3348 # r[$i-1]->end < $start <= r[$i]->end
3350 # That means that $i points to the first range in the range list
3351 # that could possibly be affected by this operation. We still don't
3352 # know if the start of the input range is within r[$i], or if it
3353 # points to empty space between r[$i-1] and r[$i].
3354 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3356 # Special case the insertion of data that is not to replace any
3358 if ($replace == $NO) { # If $NO, has to be operation '+'
3359 #local $to_trace = 1 if main::DEBUG;
3360 trace "Doesn't replace" if main::DEBUG && $to_trace;
3362 # Here, the new range is to take effect only on those code points
3363 # that aren't already in an existing range. This can be done by
3364 # looking through the existing range list and finding the gaps in
3365 # the ranges that this new range affects, and then calling this
3366 # function recursively on each of those gaps, leaving untouched
3367 # anything already in the list. Gather up a list of the changed
3368 # gaps first so that changes to the internal state as new ranges
3369 # are added won't be a problem.
3372 # First, if the starting point of the input range is outside an
3373 # existing one, there is a gap from there to the beginning of the
3374 # existing range -- add a span to fill the part that this new
3376 if ($start < $r->[$i]->start) {
3377 push @gap_list, Range->new($start,
3379 $r->[$i]->start - 1),
3381 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3384 # Then look through the range list for other gaps until we reach
3385 # the highest range affected by the input one.
3387 for ($j = $i+1; $j < $range_list_size; $j++) {
3388 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3389 last if $end < $r->[$j]->start;
3391 # If there is a gap between when this range starts and the
3392 # previous one ends, add a span to fill it. Note that just
3393 # because there are two ranges doesn't mean there is a
3394 # non-zero gap between them. It could be that they have
3395 # different values or types
3396 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3398 Range->new($r->[$j-1]->end + 1,
3399 $r->[$j]->start - 1,
3401 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3405 # Here, we have either found an existing range in the range list,
3406 # beyond the area affected by the input one, or we fell off the
3407 # end of the loop because the input range affects the whole rest
3408 # of the range list. In either case, $j is 1 higher than the
3409 # highest affected range. If $j == $i, it means that there are no
3410 # affected ranges, that the entire insertion is in the gap between
3411 # r[$i-1], and r[$i], which we already have taken care of before
3413 # On the other hand, if there are affected ranges, it might be
3414 # that there is a gap that needs filling after the final such
3415 # range to the end of the input range
3416 if ($r->[$j-1]->end < $end) {
3417 push @gap_list, Range->new(main::max($start,
3418 $r->[$j-1]->end + 1),
3421 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3424 # Call recursively to fill in all the gaps.
3425 foreach my $gap (@gap_list) {
3426 $self->_add_delete($operation,
3436 # Here, we have taken care of the case where $replace is $NO, which
3437 # means that whatever action we now take is done unconditionally. It
3438 # still could be that this call will result in a no-op, if duplicates
3439 # aren't allowed, and we are inserting a range that merely duplicates
3440 # data already in the range list; or also if deleting a non-existent
3442 # $i still points to the first potential affected range. Now find the
3443 # highest range affected, which will determine the length parameter to
3444 # splice. (The input range can span multiple existing ones.) While
3445 # we are looking through the range list, see also if this is an
3446 # insertion that will change the values of at least one of the
3447 # affected ranges. We don't need to do this check unless this is an
3448 # insertion of non-multiples, and also since this is a boolean, we
3449 # don't need to do it if have already determined that it will make a
3450 # change; just unconditionally change them. $cdm is created to be 1
3451 # if either of these is true. (The 'c' in the name comes from below)
3452 my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3453 my $j; # This will point to the highest affected range
3455 # For non-zero types, the standard form is the value itself;
3456 my $standard_form = ($type) ? $value : main::standardize($value);
3458 for ($j = $i; $j < $range_list_size; $j++) {
3459 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3461 # If find a range that it doesn't overlap into, we can stop
3463 last if $end < $r->[$j]->start;
3465 # Here, overlaps the range at $j. If the value's don't match,
3466 # and this is supposedly an insertion, it becomes a change
3467 # instead. This is what the 'c' stands for in $cdm.
3469 if ($r->[$j]->standard_form ne $standard_form) {
3474 # Here, the two values are essentially the same. If the
3475 # two are actually identical, replacing wouldn't change
3476 # anything so skip it.
3477 my $pre_existing = $r->[$j]->value;
3478 if ($pre_existing ne $value) {
3480 # Here the new and old standardized values are the
3481 # same, but the non-standardized values aren't. If
3482 # replacing unconditionally, then replace
3483 if( $replace == $UNCONDITIONALLY) {
3488 # Here, are replacing conditionally. Decide to
3489 # replace or not based on which appears to look
3490 # the "nicest". If one is mixed case and the
3491 # other isn't, choose the mixed case one.
3492 my $new_mixed = $value =~ /[A-Z]/
3493 && $value =~ /[a-z]/;
3494 my $old_mixed = $pre_existing =~ /[A-Z]/
3495 && $pre_existing =~ /[a-z]/;
3497 if ($old_mixed != $new_mixed) {
3498 $cdm = 1 if $new_mixed;
3499 if (main::DEBUG && $to_trace) {
3501 trace "Replacing $pre_existing with $value";
3504 trace "Retaining $pre_existing over $value";
3510 # Here casing wasn't different between the two.
3511 # If one has hyphens or underscores and the
3512 # other doesn't, choose the one with the
3514 my $new_punct = $value =~ /[-_]/;
3515 my $old_punct = $pre_existing =~ /[-_]/;
3517 if ($old_punct != $new_punct) {
3518 $cdm = 1 if $new_punct;
3519 if (main::DEBUG && $to_trace) {
3521 trace "Replacing $pre_existing with $value";
3524 trace "Retaining $pre_existing over $value";
3527 } # else existing one is just as "good";
3528 # retain it to save cycles.
3534 } # End of loop looking for highest affected range.
3536 # Here, $j points to one beyond the highest range that this insertion
3537 # affects (hence to beyond the range list if that range is the final
3538 # one in the range list).
3540 # The splice length is all the affected ranges. Get it before
3541 # subtracting, for efficiency, so we don't have to later add 1.
3542 my $length = $j - $i;
3544 $j--; # $j now points to the highest affected range.
3545 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3547 # If inserting a multiple record, this is where it goes, after all the
3548 # existing ones for this range. This implies an insertion, and no
3549 # change to any existing ranges. Note that $j can be -1 if this new
3550 # range doesn't actually duplicate any existing, and comes at the
3551 # beginning of the list, in which case we can handle it like any other
3552 # insertion, and is easier to do so.
3553 if ($replace == $MULTIPLE && $j >= 0) {
3555 # This restriction could be remedied with a little extra work, but
3556 # it won't hopefully ever be necessary
3557 if ($r->[$j]->start != $r->[$j]->end) {
3558 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
3562 # Don't add an exact duplicate, as it isn't really a multiple
3563 return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3565 trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3566 my @return = splice @$r,
3573 if (main::DEBUG && $to_trace) {
3574 trace "After splice:";
3575 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3576 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3577 trace "j =[", $j, "]", $r->[$j] if $j >= 0;
3578 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3579 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3580 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3585 # Here, have taken care of $NO and $MULTIPLE replaces.
3586 # $j points to the highest affected range. But it can be < $i or even
3587 # -1. These happen only if the insertion is entirely in the gap
3588 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3589 # above exited first time through with $end < $r->[$i]->start. (And
3590 # then we subtracted one from j) This implies also that $start <
3591 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3592 # $start, so the entire input range is in the gap.
3595 # Here the entire input range is in the gap before $i.
3597 if (main::DEBUG && $to_trace) {
3599 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3602 trace "Entire range is before $r->[$i]";
3605 return if $operation ne '+'; # Deletion of a non-existent range is
3610 # Here the entire input range is not in the gap before $i. There
3611 # is an affected one, and $j points to the highest such one.
3613 # At this point, here is the situation:
3614 # This is not an insertion of a multiple, nor of tentative ($NO)
3616 # $i points to the first element in the current range list that
3617 # may be affected by this operation. In fact, we know
3618 # that the range at $i is affected because we are in
3619 # the else branch of this 'if'
3620 # $j points to the highest affected range.
3622 # r[$i-1]->end < $start <= r[$i]->end
3624 # r[$i-1]->end < $start <= $end <= r[$j]->end
3627 # $cdm is a boolean which is set true if and only if this is a
3628 # change or deletion (multiple was handled above). In
3629 # other words, it could be renamed to be just $cd.
3631 # We now have enough information to decide if this call is a no-op
3632 # or not. It is a no-op if it is a deletion of a non-existent
3633 # range, or an insertion of already existing data.
3635 if (main::DEBUG && $to_trace && ! $cdm
3637 && $start >= $r->[$i]->start)
3641 return if ! $cdm # change or delete => not no-op
3642 && $i == $j # more than one affected range => not no-op
3644 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3645 # Further, $start and/or $end is >= r[$i]->start
3646 # The test below hence guarantees that
3647 # r[$i]->start < $start <= $end <= r[$i]->end
3648 # This means the input range is contained entirely in
3649 # the one at $i, so is a no-op
3650 && $start >= $r->[$i]->start;
3653 # Here, we know that some action will have to be taken. We have
3654 # calculated the offset and length (though adjustments may be needed)
3655 # for the splice. Now start constructing the replacement list.
3657 my $splice_start = $i;
3662 # See if should extend any adjacent ranges.
3663 if ($operation eq '-') { # Don't extend deletions
3664 $extends_below = $extends_above = 0;
3666 else { # Here, should extend any adjacent ranges. See if there are
3668 $extends_below = ($i > 0
3669 # can't extend unless adjacent
3670 && $r->[$i-1]->end == $start -1
3671 # can't extend unless are same standard value
3672 && $r->[$i-1]->standard_form eq $standard_form
3673 # can't extend unless share type
3674 && $r->[$i-1]->type == $type);
3675 $extends_above = ($j+1 < $range_list_size
3676 && $r->[$j+1]->start == $end +1
3677 && $r->[$j+1]->standard_form eq $standard_form
3678 && $r->[$j-1]->type == $type);
3680 if ($extends_below && $extends_above) { # Adds to both
3681 $splice_start--; # start replace at element below
3682 $length += 2; # will replace on both sides
3683 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3685 # The result will fill in any gap, replacing both sides, and
3686 # create one large range.
3687 @replacement = Range->new($r->[$i-1]->start,
3694 # Here we know that the result won't just be the conglomeration of
3695 # a new range with both its adjacent neighbors. But it could
3696 # extend one of them.
3698 if ($extends_below) {
3700 # Here the new element adds to the one below, but not to the
3701 # one above. If inserting, and only to that one range, can
3702 # just change its ending to include the new one.
3703 if ($length == 0 && ! $cdm) {
3704 $r->[$i-1]->set_end($end);
3705 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3709 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3710 $splice_start--; # start replace at element below
3711 $length++; # will replace the element below
3712 $start = $r->[$i-1]->start;
3715 elsif ($extends_above) {
3717 # Here the new element adds to the one above, but not below.
3718 # Mirror the code above
3719 if ($length == 0 && ! $cdm) {
3720 $r->[$j+1]->set_start($start);
3721 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3725 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3726 $length++; # will replace the element above
3727 $end = $r->[$j+1]->end;
3731 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3733 # Finally, here we know there will have to be a splice.
3734 # If the change or delete affects only the highest portion of the
3735 # first affected range, the range will have to be split. The
3736 # splice will remove the whole range, but will replace it by a new
3737 # range containing just the unaffected part. So, in this case,
3738 # add to the replacement list just this unaffected portion.
3739 if (! $extends_below
3740 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3743 Range->new($r->[$i]->start,
3745 Value => $r->[$i]->value,
3746 Type => $r->[$i]->type);
3749 # In the case of an insert or change, but not a delete, we have to
3750 # put in the new stuff; this comes next.
3751 if ($operation eq '+') {
3752 push @replacement, Range->new($start,
3758 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3759 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3761 # And finally, if we're changing or deleting only a portion of the
3762 # highest affected range, it must be split, as the lowest one was.
3763 if (! $extends_above
3764 && $j >= 0 # Remember that j can be -1 if before first
3766 && $end >= $r->[$j]->start
3767 && $end < $r->[$j]->end)
3770 Range->new($end + 1,
3772 Value => $r->[$j]->value,
3773 Type => $r->[$j]->type);
3777 # And do the splice, as calculated above
3778 if (main::DEBUG && $to_trace) {
3779 trace "replacing $length element(s) at $i with ";
3780 foreach my $replacement (@replacement) {
3781 trace " $replacement";
3783 trace "Before splice:";
3784 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3785 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3786 trace "i =[", $i, "]", $r->[$i];
3787 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3788 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3791 my @return = splice @$r, $splice_start, $length, @replacement;
3793 if (main::DEBUG && $to_trace) {
3794 trace "After splice:";
3795 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3796 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3797 trace "i =[", $i, "]", $r->[$i];
3798 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3799 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3800 trace "removed @return";
3803 # An actual deletion could have changed the maximum in the list.
3804 # There was no deletion if the splice didn't return something, but
3805 # otherwise recalculate it. This is done too rarely to worry about
3807 if ($operation eq '-' && @return) {
3808 $max{$addr} = $r->[-1]->end;
3813 sub reset_each_range { # reset the iterator for each_range();
3815 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3818 undef $each_range_iterator{pack 'J', $self};
3823 # Iterate over each range in a range list. Results are undefined if
3824 # the range list is changed during the iteration.
3827 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3829 my $addr = do { no overloading; pack 'J', $self; };
3831 return if $self->is_empty;
3833 $each_range_iterator{$addr} = -1
3834 if ! defined $each_range_iterator{$addr};
3835 $each_range_iterator{$addr}++;
3836 return $ranges{$addr}->[$each_range_iterator{$addr}]
3837 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3838 undef $each_range_iterator{$addr};
3842 sub count { # Returns count of code points in range list
3844 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3846 my $addr = do { no overloading; pack 'J', $self; };
3849 foreach my $range (@{$ranges{$addr}}) {
3850 $count += $range->end - $range->start + 1;
3855 sub delete_range { # Delete a range
3860 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3862 return $self->_add_delete('-', $start, $end, "");
3865 sub is_empty { # Returns boolean as to if a range list is empty
3867 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3870 return scalar @{$ranges{pack 'J', $self}} == 0;
3874 # Quickly returns a scalar suitable for separating tables into
3875 # buckets, i.e. it is a hash function of the contents of a table, so
3876 # there are relatively few conflicts.
3879 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3881 my $addr = do { no overloading; pack 'J', $self; };
3883 # These are quickly computable. Return looks like 'min..max;count'
3884 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3886 } # End closure for _Range_List_Base
3889 use base '_Range_List_Base';
3891 # A Range_List is a range list for match tables; i.e. the range values are
3892 # not significant. Thus a number of operations can be safely added to it,
3893 # such as inversion, intersection. Note that union is also an unsafe
3894 # operation when range values are cared about, and that method is in the base
3895 # class, not here. But things are set up so that that method is callable only
3896 # during initialization. Only in this derived class, is there an operation
3897 # that combines two tables. A Range_Map can thus be used to initialize a
3898 # Range_List, and its mappings will be in the list, but are not significant to
3901 sub trace { return main::trace(@_); }
3907 '+' => sub { my $self = shift;
3910 return $self->_union($other)
3912 '&' => sub { my $self = shift;
3915 return $self->_intersect($other, 0);
3922 # Returns a new Range_List that gives all code points not in $self.
3926 my $new = Range_List->new;
3928 # Go through each range in the table, finding the gaps between them
3929 my $max = -1; # Set so no gap before range beginning at 0
3930 for my $range ($self->ranges) {
3931 my $start = $range->start;
3932 my $end = $range->end;
3934 # If there is a gap before this range, the inverse will contain
3936 if ($start > $max + 1) {
3937 $new->add_range($max + 1, $start - 1);
3942 # And finally, add the gap from the end of the table to the max
3943 # possible code point
3944 if ($max < $LAST_UNICODE_CODEPOINT) {
3945 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3951 # Returns a new Range_List with the argument deleted from it. The
3952 # argument can be a single code point, a range, or something that has
3953 # a range, with the _range_list() method on it returning them
3957 my $reversed = shift;
3958 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3961 Carp::my_carp_bug("Can't cope with a "
3963 . " being the second parameter in a '-'. Subtraction ignored.");
3967 my $new = Range_List->new(Initialize => $self);
3969 if (! ref $other) { # Single code point
3970 $new->delete_range($other, $other);
3972 elsif ($other->isa('Range')) {
3973 $new->delete_range($other->start, $other->end);
3975 elsif ($other->can('_range_list')) {
3976 foreach my $range ($other->_range_list->ranges) {
3977 $new->delete_range($range->start, $range->end);
3981 Carp::my_carp_bug("Can't cope with a "
3983 . " argument to '-'. Subtraction ignored."
3992 # Returns either a boolean giving whether the two inputs' range lists
3993 # intersect (overlap), or a new Range_List containing the intersection
3994 # of the two lists. The optional final parameter being true indicates
3995 # to do the check instead of the intersection.
3997 my $a_object = shift;
3998 my $b_object = shift;
3999 my $check_if_overlapping = shift;
4000 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4001 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4003 if (! defined $b_object) {
4005 $message .= $a_object->_owner_name_of if defined $a_object;
4006 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4010 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4011 # Thus the intersection could be much more simply be written:
4012 # return ~(~$a_object + ~$b_object);
4013 # But, this is slower, and when taking the inverse of a large
4014 # range_size_1 table, back when such tables were always stored that
4015 # way, it became prohibitively slow, hence the code was changed to the
4018 if ($b_object->isa('Range')) {
4019 $b_object = Range_List->new(Initialize => $b_object,
4020 Owner => $a_object->_owner_name_of);
4022 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4024 my @a_ranges = $a_object->ranges;
4025 my @b_ranges = $b_object->ranges;
4027 #local $to_trace = 1 if main::DEBUG;
4028 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4030 # Start with the first range in each list
4032 my $range_a = $a_ranges[$a_i];
4034 my $range_b = $b_ranges[$b_i];
4036 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4037 if ! $check_if_overlapping;
4039 # If either list is empty, there is no intersection and no overlap
4040 if (! defined $range_a || ! defined $range_b) {
4041 return $check_if_overlapping ? 0 : $new;
4043 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4045 # Otherwise, must calculate the intersection/overlap. Start with the
4046 # very first code point in each list
4047 my $a = $range_a->start;
4048 my $b = $range_b->start;
4050 # Loop through all the ranges of each list; in each iteration, $a and
4051 # $b are the current code points in their respective lists
4054 # If $a and $b are the same code point, ...
4057 # it means the lists overlap. If just checking for overlap
4058 # know the answer now,
4059 return 1 if $check_if_overlapping;
4061 # The intersection includes this code point plus anything else
4062 # common to both current ranges.
4064 my $end = main::min($range_a->end, $range_b->end);
4065 if (! $check_if_overlapping) {
4066 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4067 $new->add_range($start, $end);
4070 # Skip ahead to the end of the current intersect
4073 # If the current intersect ends at the end of either range (as
4074 # it must for at least one of them), the next possible one
4075 # will be the beginning code point in it's list's next range.
4076 if ($a == $range_a->end) {
4077 $range_a = $a_ranges[++$a_i];
4078 last unless defined $range_a;
4079 $a = $range_a->start;
4081 if ($b == $range_b->end) {
4082 $range_b = $b_ranges[++$b_i];
4083 last unless defined $range_b;
4084 $b = $range_b->start;
4087 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4091 # Not equal, but if the range containing $a encompasses $b,
4092 # change $a to be the middle of the range where it does equal
4093 # $b, so the next iteration will get the intersection
4094 if ($range_a->end >= $b) {
4099 # Here, the current range containing $a is entirely below
4100 # $b. Go try to find a range that could contain $b.
4101 $a_i = $a_object->_search_ranges($b);
4103 # If no range found, quit.
4104 last unless defined $a_i;
4106 # The search returns $a_i, such that
4107 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4108 # Set $a to the beginning of this new range, and repeat.
4109 $range_a = $a_ranges[$a_i];
4110 $a = $range_a->start;
4113 else { # Here, $b < $a.
4115 # Mirror image code to the leg just above
4116 if ($range_b->end >= $a) {
4120 $b_i = $b_object->_search_ranges($a);
4121 last unless defined $b_i;
4122 $range_b = $b_ranges[$b_i];
4123 $b = $range_b->start;
4126 } # End of looping through ranges.
4128 # Intersection fully computed, or now know that there is no overlap
4129 return $check_if_overlapping ? 0 : $new;
4133 # Returns boolean giving whether the two arguments overlap somewhere
4137 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4139 return $self->_intersect($other, 1);
4143 # Add a range to the list.
4148 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4150 return $self->_add_delete('+', $start, $end, "");
4153 sub matches_identically_to {
4154 # Return a boolean as to whether or not two Range_Lists match identical
4155 # sets of code points.
4159 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4161 # These are ordered in increasing real time to figure out (at least
4162 # until a patch changes that and doesn't change this)
4163 return 0 if $self->max != $other->max;
4164 return 0 if $self->min != $other->min;
4165 return 0 if $self->range_count != $other->range_count;
4166 return 0 if $self->count != $other->count;
4168 # Here they could be identical because all the tests above passed.
4169 # The loop below is somewhat simpler since we know they have the same
4170 # number of elements. Compare range by range, until reach the end or
4171 # find something that differs.
4172 my @a_ranges = $self->ranges;
4173 my @b_ranges = $other->ranges;
4174 for my $i (0 .. @a_ranges - 1) {
4175 my $a = $a_ranges[$i];
4176 my $b = $b_ranges[$i];
4177 trace "self $a; other $b" if main::DEBUG && $to_trace;
4178 return 0 if $a->start != $b->start || $a->end != $b->end;
4183 sub is_code_point_usable {
4184 # This used only for making the test script. See if the input
4185 # proposed trial code point is one that Perl will handle. If second
4186 # parameter is 0, it won't select some code points for various
4187 # reasons, noted below.
4190 my $try_hard = shift;
4191 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4193 return 0 if $code < 0; # Never use a negative
4195 # shun null. I'm (khw) not sure why this was done, but NULL would be
4196 # the character very frequently used.
4197 return $try_hard if $code == 0x0000;
4199 return 0 if $try_hard; # XXX Temporary until fix utf8.c
4201 # shun non-character code points.
4202 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4203 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4205 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
4206 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4211 sub get_valid_code_point {
4212 # Return a code point that's part of the range list. Returns nothing
4213 # if the table is empty or we can't find a suitable code point. This
4214 # used only for making the test script.
4217 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4219 my $addr = do { no overloading; pack 'J', $self; };
4221 # On first pass, don't choose less desirable code points; if no good
4222 # one is found, repeat, allowing a less desirable one to be selected.
4223 for my $try_hard (0, 1) {
4225 # Look through all the ranges for a usable code point.
4226 for my $set ($self->ranges) {
4228 # Try the edge cases first, starting with the end point of the
4230 my $end = $set->end;
4231 return $end if is_code_point_usable($end, $try_hard);
4233 # End point didn't, work. Start at the beginning and try
4234 # every one until find one that does work.
4235 for my $trial ($set->start .. $end - 1) {
4236 return $trial if is_code_point_usable($trial, $try_hard);
4240 return (); # If none found, give up.
4243 sub get_invalid_code_point {
4244 # Return a code point that's not part of the table. Returns nothing
4245 # if the table covers all code points or a suitable code point can't
4246 # be found. This used only for making the test script.
4249 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4251 # Just find a valid code point of the inverse, if any.
4252 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4254 } # end closure for Range_List
4257 use base '_Range_List_Base';
4259 # A Range_Map is a range list in which the range values (called maps) are
4260 # significant, and hence shouldn't be manipulated by our other code, which
4261 # could be ambiguous or lose things. For example, in taking the union of two
4262 # lists, which share code points, but which have differing values, which one
4263 # has precedence in the union?
4264 # It turns out that these operations aren't really necessary for map tables,
4265 # and so this class was created to make sure they aren't accidentally
4271 # Add a range containing a mapping value to the list
4274 # Rest of parameters passed on
4276 return $self->_add_delete('+', @_);
4280 # Adds entry to a range list which can duplicate an existing entry
4283 my $code_point = shift;
4285 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4287 return $self->add_map($code_point, $code_point,
4288 $value, Replace => $MULTIPLE);
4290 } # End of closure for package Range_Map
4292 package _Base_Table;
4294 # A table is the basic data structure that gets written out into a file for
4295 # use by the Perl core. This is the abstract base class implementing the
4296 # common elements from the derived ones. A list of the methods to be
4297 # furnished by an implementing class is just after the constructor.
4299 sub standardize { return main::standardize($_[0]); }
4300 sub trace { return main::trace(@_); }
4304 main::setup_package();
4307 # Object containing the ranges of the table.
4308 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4311 # The full table name.
4312 main::set_access('full_name', \%full_name, 'r');
4315 # The table name, almost always shorter
4316 main::set_access('name', \%name, 'r');
4319 # The shortest of all the aliases for this table, with underscores removed
4320 main::set_access('short_name', \%short_name);
4322 my %nominal_short_name_length;
4323 # The length of short_name before removing underscores
4324 main::set_access('nominal_short_name_length',
4325 \%nominal_short_name_length);
4328 # The complete name, including property.
4329 main::set_access('complete_name', \%complete_name, 'r');
4332 # Parent property this table is attached to.
4333 main::set_access('property', \%property, 'r');
4336 # Ordered list of aliases of the table's name. The first ones in the list
4337 # are output first in comments
4338 main::set_access('aliases', \%aliases, 'readable_array');
4341 # A comment associated with the table for human readers of the files
4342 main::set_access('comment', \%comment, 's');
4345 # A comment giving a short description of the table's meaning for human
4346 # readers of the files.
4347 main::set_access('description', \%description, 'readable_array');
4350 # A comment giving a short note about the table for human readers of the
4352 main::set_access('note', \%note, 'readable_array');
4355 # Boolean; if set means any file that contains this table is marked as for
4356 # internal-only use.
4357 main::set_access('internal_only', \%internal_only);
4359 my %find_table_from_alias;
4360 # The parent property passes this pointer to a hash which this class adds
4361 # all its aliases to, so that the parent can quickly take an alias and
4363 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4366 # After this table is made equivalent to another one; we shouldn't go
4367 # changing the contents because that could mean it's no longer equivalent
4368 main::set_access('locked', \%locked, 'r');
4371 # This gives the final path to the file containing the table. Each
4372 # directory in the path is an element in the array
4373 main::set_access('file_path', \%file_path, 'readable_array');
4376 # What is the table's status, normal, $OBSOLETE, etc. Enum
4377 main::set_access('status', \%status, 'r');
4380 # A comment about its being obsolete, or whatever non normal status it has
4381 main::set_access('status_info', \%status_info, 'r');
4384 # Is the table to be output with each range only a single code point?
4385 # This is done to avoid breaking existing code that may have come to rely
4386 # on this behavior in previous versions of this program.)
4387 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4390 # A boolean set iff this table is a Perl extension to the Unicode
4392 main::set_access('perl_extension', \%perl_extension, 'r');
4394 my %output_range_counts;
4395 # A boolean set iff this table is to have comments written in the
4396 # output file that contain the number of code points in the range.
4397 # The constructor can override the global flag of the same name.
4398 main::set_access('output_range_counts', \%output_range_counts, 'r');
4401 # The format of the entries of the table. This is calculated from the
4402 # data in the table (or passed in the constructor). This is an enum e.g.,
4404 main::set_access('format', \%format, 'r', 'p_s');
4407 # All arguments are key => value pairs, which you can see below, most
4408 # of which match fields documented above. Otherwise: Pod_Entry,
4409 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4410 # documented in the Alias package
4412 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4416 my $self = bless \do { my $anonymous_scalar }, $class;
4417 my $addr = do { no overloading; pack 'J', $self; };
4421 $name{$addr} = delete $args{'Name'};
4422 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4423 $full_name{$addr} = delete $args{'Full_Name'};
4424 my $complete_name = $complete_name{$addr}
4425 = delete $args{'Complete_Name'};
4426 $format{$addr} = delete $args{'Format'};
4427 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
4428 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4429 $property{$addr} = delete $args{'_Property'};
4430 $range_list{$addr} = delete $args{'_Range_List'};
4431 $status{$addr} = delete $args{'Status'} || $NORMAL;
4432 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4433 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4435 my $description = delete $args{'Description'};
4436 my $externally_ok = delete $args{'Externally_Ok'};
4437 my $loose_match = delete $args{'Fuzzy'};
4438 my $note = delete $args{'Note'};
4439 my $make_pod_entry = delete $args{'Pod_Entry'};
4440 my $perl_extension = delete $args{'Perl_Extension'};
4442 # Shouldn't have any left over
4443 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4445 # Can't use || above because conceivably the name could be 0, and
4446 # can't use // operator in case this program gets used in Perl 5.8
4447 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4448 $output_range_counts{$addr} = $output_range_counts if
4449 ! defined $output_range_counts{$addr};
4451 $aliases{$addr} = [ ];
4452 $comment{$addr} = [ ];
4453 $description{$addr} = [ ];
4455 $file_path{$addr} = [ ];
4456 $locked{$addr} = "";
4458 push @{$description{$addr}}, $description if $description;
4459 push @{$note{$addr}}, $note if $note;
4461 if ($status{$addr} eq $PLACEHOLDER) {
4463 # A placeholder table doesn't get documented, is a perl extension,
4464 # and quite likely will be empty
4465 $make_pod_entry = 0 if ! defined $make_pod_entry;
4466 $perl_extension = 1 if ! defined $perl_extension;
4467 push @tables_that_may_be_empty, $complete_name{$addr};
4469 elsif (! $status{$addr}) {
4471 # If hasn't set its status already, see if it is on one of the
4472 # lists of properties or tables that have particular statuses; if
4473 # not, is normal. The lists are prioritized so the most serious
4474 # ones are checked first
4475 if (exists $why_suppressed{$complete_name}
4476 # Don't suppress if overriden
4477 && ! grep { $_ eq $complete_name{$addr} }
4478 @output_mapped_properties)
4480 $status{$addr} = $SUPPRESSED;
4482 elsif (exists $why_deprecated{$complete_name}) {
4483 $status{$addr} = $DEPRECATED;
4485 elsif (exists $why_stabilized{$complete_name}) {
4486 $status{$addr} = $STABILIZED;
4488 elsif (exists $why_obsolete{$complete_name}) {
4489 $status{$addr} = $OBSOLETE;
4492 # Existence above doesn't necessarily mean there is a message
4493 # associated with it. Use the most serious message.
4494 if ($status{$addr}) {
4495 if ($why_suppressed{$complete_name}) {
4497 = $why_suppressed{$complete_name};
4499 elsif ($why_deprecated{$complete_name}) {
4501 = $why_deprecated{$complete_name};
4503 elsif ($why_stabilized{$complete_name}) {
4505 = $why_stabilized{$complete_name};
4507 elsif ($why_obsolete{$complete_name}) {
4509 = $why_obsolete{$complete_name};
4514 $perl_extension{$addr} = $perl_extension || 0;
4516 # By convention what typically gets printed only or first is what's
4517 # first in the list, so put the full name there for good output
4518 # clarity. Other routines rely on the full name being first on the
4520 $self->add_alias($full_name{$addr},
4521 Externally_Ok => $externally_ok,
4522 Fuzzy => $loose_match,
4523 Pod_Entry => $make_pod_entry,
4524 Status => $status{$addr},
4527 # Then comes the other name, if meaningfully different.
4528 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4529 $self->add_alias($name{$addr},
4530 Externally_Ok => $externally_ok,
4531 Fuzzy => $loose_match,
4532 Pod_Entry => $make_pod_entry,
4533 Status => $status{$addr},
4540 # Here are the methods that are required to be defined by any derived
4543 handle_special_range
4547 # write() knows how to write out normal ranges, but it calls
4548 # handle_special_range() when it encounters a non-normal one.
4549 # append_to_body() is called by it after it has handled all
4550 # ranges to add anything after the main portion of the table.
4551 # And finally, pre_body() is called after all this to build up
4552 # anything that should appear before the main portion of the
4553 # table. Doing it this way allows things in the middle to
4554 # affect what should appear before the main portion of the
4559 Carp::my_carp_bug( __LINE__
4560 . ": Must create method '$sub()' for "
4568 "." => \&main::_operator_dot,
4569 '!=' => \&main::_operator_not_equal,
4570 '==' => \&main::_operator_equal,
4574 # Returns the array of ranges associated with this table.
4577 return $range_list{pack 'J', shift}->ranges;
4581 # Add a synonym for this table.
4583 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4586 my $name = shift; # The name to add.
4587 my $pointer = shift; # What the alias hash should point to. For
4588 # map tables, this is the parent property;
4589 # for match tables, it is the table itself.
4592 my $loose_match = delete $args{'Fuzzy'};
4594 my $make_pod_entry = delete $args{'Pod_Entry'};
4595 $make_pod_entry = $YES unless defined $make_pod_entry;
4597 my $externally_ok = delete $args{'Externally_Ok'};
4598 $externally_ok = 1 unless defined $externally_ok;
4600 my $status = delete $args{'Status'};
4601 $status = $NORMAL unless defined $status;
4603 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4605 # Capitalize the first letter of the alias unless it is one of the CJK
4606 # ones which specifically begins with a lower 'k'. Do this because
4607 # Unicode has varied whether they capitalize first letters or not, and
4608 # have later changed their minds and capitalized them, but not the
4609 # other way around. So do it always and avoid changes from release to
4611 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4613 my $addr = do { no overloading; pack 'J', $self; };
4615 # Figure out if should be loosely matched if not already specified.
4616 if (! defined $loose_match) {
4618 # Is a loose_match if isn't null, and doesn't begin with an
4619 # underscore and isn't just a number
4621 && substr($name, 0, 1) ne '_'
4622 && $name !~ qr{^[0-9_.+-/]+$})
4631 # If this alias has already been defined, do nothing.
4632 return if defined $find_table_from_alias{$addr}->{$name};
4634 # That includes if it is standardly equivalent to an existing alias,
4635 # in which case, add this name to the list, so won't have to search
4637 my $standard_name = main::standardize($name);
4638 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4639 $find_table_from_alias{$addr}->{$name}
4640 = $find_table_from_alias{$addr}->{$standard_name};
4644 # Set the index hash for this alias for future quick reference.
4645 $find_table_from_alias{$addr}->{$name} = $pointer;
4646 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4647 local $to_trace = 0 if main::DEBUG;
4648 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4649 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4652 # Put the new alias at the end of the list of aliases unless the final
4653 # element begins with an underscore (meaning it is for internal perl
4654 # use) or is all numeric, in which case, put the new one before that
4655 # one. This floats any all-numeric or underscore-beginning aliases to
4656 # the end. This is done so that they are listed last in output lists,
4657 # to encourage the user to use a better name (either more descriptive
4658 # or not an internal-only one) instead. This ordering is relied on
4659 # implicitly elsewhere in this program, like in short_name()
4660 my $list = $aliases{$addr};
4661 my $insert_position = (@$list == 0
4662 || (substr($list->[-1]->name, 0, 1) ne '_'
4663 && $list->[-1]->name =~ /\D/))
4669 Alias->new($name, $loose_match, $make_pod_entry,
4670 $externally_ok, $status);
4672 # This name may be shorter than any existing ones, so clear the cache
4673 # of the shortest, so will have to be recalculated.
4675 undef $short_name{pack 'J', $self};
4680 # Returns a name suitable for use as the base part of a file name.
4681 # That is, shorter wins. It can return undef if there is no suitable
4682 # name. The name has all non-essential underscores removed.
4684 # The optional second parameter is a reference to a scalar in which
4685 # this routine will store the length the returned name had before the
4686 # underscores were removed, or undef if the return is undef.
4688 # The shortest name can change if new aliases are added. So using
4689 # this should be deferred until after all these are added. The code
4690 # that does that should clear this one's cache.
4691 # Any name with alphabetics is preferred over an all numeric one, even
4695 my $nominal_length_ptr = shift;
4696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4698 my $addr = do { no overloading; pack 'J', $self; };
4700 # For efficiency, don't recalculate, but this means that adding new
4701 # aliases could change what the shortest is, so the code that does
4702 # that needs to undef this.
4703 if (defined $short_name{$addr}) {
4704 if ($nominal_length_ptr) {
4705 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4707 return $short_name{$addr};
4710 # Look at each alias
4711 foreach my $alias ($self->aliases()) {
4713 # Don't use an alias that isn't ok to use for an external name.
4714 next if ! $alias->externally_ok;
4716 my $name = main::Standardize($alias->name);
4717 trace $self, $name if main::DEBUG && $to_trace;
4719 # Take the first one, or a shorter one that isn't numeric. This
4720 # relies on numeric aliases always being last in the array
4721 # returned by aliases(). Any alpha one will have precedence.
4722 if (! defined $short_name{$addr}
4724 && length($name) < length($short_name{$addr})))
4726 # Remove interior underscores.
4727 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4729 $nominal_short_name_length{$addr} = length $name;
4733 # If no suitable external name return undef
4734 if (! defined $short_name{$addr}) {
4735 $$nominal_length_ptr = undef if $nominal_length_ptr;
4739 # Don't allow a null external name.
4740 if ($short_name{$addr} eq "") {
4741 $short_name{$addr} = '_';
4742 $nominal_short_name_length{$addr} = 1;
4745 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4747 if ($nominal_length_ptr) {
4748 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4750 return $short_name{$addr};
4754 # Returns the external name that this table should be known by. This
4755 # is usually the short_name, but not if the short_name is undefined.
4758 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4760 my $short = $self->short_name;
4761 return $short if defined $short;
4766 sub add_description { # Adds the parameter as a short description.
4769 my $description = shift;
4771 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4774 push @{$description{pack 'J', $self}}, $description;
4779 sub add_note { # Adds the parameter as a short note.
4784 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4787 push @{$note{pack 'J', $self}}, $note;
4792 sub add_comment { # Adds the parameter as a comment.
4795 my $comment = shift;
4796 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4801 push @{$comment{pack 'J', $self}}, $comment;
4807 # Return the current comment for this table. If called in list
4808 # context, returns the array of comments. In scalar, returns a string
4809 # of each element joined together with a period ending each.
4812 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4814 my $addr = do { no overloading; pack 'J', $self; };
4815 my @list = @{$comment{$addr}};
4816 return @list if wantarray;
4818 foreach my $sentence (@list) {
4819 $return .= '. ' if $return;
4820 $return .= $sentence;
4823 $return .= '.' if $return;
4828 # Initialize the table with the argument which is any valid
4829 # initialization for range lists.
4832 my $addr = do { no overloading; pack 'J', $self; };
4833 my $initialization = shift;
4834 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4836 # Replace the current range list with a new one of the same exact
4838 my $class = ref $range_list{$addr};
4839 $range_list{$addr} = $class->new(Owner => $self,
4840 Initialize => $initialization);
4846 # The header that is output for the table in the file it is written
4850 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4853 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4856 $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
4861 # Write a representation of the table to its file. It calls several
4862 # functions furnished by sub-classes of this abstract base class to
4863 # handle non-normal ranges, to add stuff before the table, and at its
4867 my $tab_stops = shift; # The number of tab stops over to put any
4869 my $suppress_value = shift; # Optional, if the value associated with
4870 # a range equals this one, don't write
4872 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4874 my $addr = do { no overloading; pack 'J', $self; };
4876 # Start with the header
4877 my @HEADER = $self->header;
4880 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4883 # Things discovered processing the main body of the document may
4884 # affect what gets output before it, therefore pre_body() isn't called
4885 # until after all other processing of the table is done.
4887 # The main body looks like a 'here' document. If annotating, get rid
4888 # of the comments before passing to the caller, as some callers, such
4889 # as charnames.pm, can't cope with them. (Outputting range counts
4890 # also introduces comments, but these don't show up in the tables that
4891 # can't cope with comments, and there aren't that many of them that
4892 # it's worth the extra real time to get rid of them).
4894 if ($output_names) {
4895 # Use the line below in Perls that don't have /r
4896 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
4897 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
4899 push @OUT, "return <<'END';\n";
4902 if ($range_list{$addr}->is_empty) {
4904 # This is a kludge for empty tables to silence a warning in
4905 # utf8.c, which can't really deal with empty tables, but it can
4906 # deal with a table that matches nothing, as the inverse of 'Any'
4908 push @OUT, "!utf8::IsAny\n";
4911 my $range_size_1 = $range_size_1{$addr};
4912 my $format; # Used only in $output_names option
4913 my $include_name; # Used only in $output_names option
4915 if ($output_names) {
4917 # if annotating each code point, must print 1 per line.
4918 # The variable could point to a subroutine, and we don't want
4919 # to lose that fact, so only set if not set already
4920 $range_size_1 = 1 if ! $range_size_1;
4922 $format = $self->format;
4924 # The name of the character is output only for tables that
4925 # don't already include the name in the output.
4926 my $property = $self->property;
4928 ! ($property == $perl_charname
4929 || $property == main::property_ref('Unicode_1_Name')
4930 || $property == main::property_ref('Name')
4931 || $property == main::property_ref('Name_Alias')
4935 # Output each range as part of the here document.
4937 for my $set ($range_list{$addr}->ranges) {
4938 if ($set->type != 0) {
4939 $self->handle_special_range($set);
4942 my $start = $set->start;
4943 my $end = $set->end;
4944 my $value = $set->value;
4946 # Don't output ranges whose value is the one to suppress
4947 next RANGE if defined $suppress_value
4948 && $value eq $suppress_value;
4950 # If there is a range and doesn't need a single point range
4952 if ($start != $end && ! $range_size_1) {
4953 push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4955 # Add a comment with the size of the range, if requested.
4956 # Expand Tabs to make sure they all start in the same
4957 # column, and then unexpand to use mostly tabs.
4958 if (! $output_range_counts{$addr}) {
4962 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4963 my $count = main::clarify_number($end - $start + 1);
4966 my $width = $tab_stops * 8 - 1;
4967 $OUT[-1] = sprintf("%-*s # [%s]\n",
4971 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4976 # Here to output a single code point per line
4978 # If not to annotate, use the simple formats
4979 if (! $output_names) {
4981 # Use any passed in subroutine to output.
4982 if (ref $range_size_1 eq 'CODE') {
4983 for my $i ($start .. $end) {
4984 push @OUT, &{$range_size_1}($i, $value);
4989 # Here, caller is ok with default output.
4990 for (my $i = $start; $i <= $end; $i++) {
4991 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4997 # Here, wants annotation.
4998 for (my $i = $start; $i <= $end; $i++) {
5000 # Get character information if don't have it already
5001 main::populate_char_info($i)
5002 if ! defined $viacode[$i];
5003 my $type = $annotate_char_type[$i];
5005 # Figure out if should output the next code points as part
5006 # of a range or not. If this is not in an annotation
5007 # range, then won't output as a range, so returns $i.
5008 # Otherwise use the end of the annotation range, but no
5009 # further than the maximum possible end point of the loop.
5010 my $range_end = main::min($annotate_ranges->value_of($i)
5014 # Use a range if it is a range, and either is one of the
5015 # special annotation ranges, or the range is at most 3
5016 # long. This last case causes the algorithmically named
5017 # code points to be output individually in spans of at
5018 # most 3, as they are the ones whose $type is > 0.
5019 if ($range_end != $i
5020 && ( $type < 0 || $range_end - $i > 2))
5022 # Here is to output a range. We don't allow a
5023 # caller-specified output format--just use the
5025 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5028 my $range_name = $viacode[$i];
5030 # For the code points which end in their hex value, we
5031 # eliminate that from the output annotation, and
5032 # capitalize only the first letter of each word.
5033 if ($type == $CP_IN_NAME) {
5034 my $hex = sprintf "%04X", $i;
5035 $range_name =~ s/-$hex$//;
5036 my @words = split " ", $range_name;
5037 for my $word (@words) {
5038 $word = ucfirst(lc($word)) if $word ne 'CJK';
5040 $range_name = join " ", @words;
5042 elsif ($type == $HANGUL_SYLLABLE) {
5043 $range_name = "Hangul Syllable";
5046 $OUT[-1] .= " $range_name" if $range_name;
5048 # Include the number of code points in the range
5049 my $count = main::clarify_number($range_end - $i + 1);
5050 $OUT[-1] .= " [$count]\n";
5052 # Skip to the end of the range
5055 else { # Not in a range.
5058 # When outputting the names of each character, use
5059 # the character itself if printable
5060 $comment .= "'" . chr($i) . "' " if $printable[$i];
5062 # To make it more readable, use a minimum indentation
5065 # Determine the annotation
5066 if ($format eq $DECOMP_STRING_FORMAT) {
5068 # This is very specialized, with the type of
5069 # decomposition beginning the line enclosed in
5070 # <...>, and the code points that the code point
5071 # decomposes to separated by blanks. Create two
5072 # strings, one of the printable characters, and
5073 # one of their official names.
5074 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5078 foreach my $to (split " ", $map) {
5079 $to = CORE::hex $to;
5080 $to_name .= " + " if $to_name;
5081 $to_chr .= chr($to);
5082 main::populate_char_info($to)
5083 if ! defined $viacode[$to];
5084 $to_name .= $viacode[$to];
5088 "=> '$to_chr'; $viacode[$i] => $to_name";
5089 $comment_indent = 25; # Determined by experiment
5093 # Assume that any table that has hex format is a
5094 # mapping of one code point to another.
5095 if ($format eq $HEX_FORMAT) {
5096 my $decimal_value = CORE::hex $value;
5097 main::populate_char_info($decimal_value)
5098 if ! defined $viacode[$decimal_value];
5100 . chr($decimal_value)
5101 . "'; " if $printable[$decimal_value];
5103 $comment .= $viacode[$i] if $include_name
5105 if ($format eq $HEX_FORMAT) {
5106 my $decimal_value = CORE::hex $value;
5107 $comment .= " => $viacode[$decimal_value]"
5108 if $viacode[$decimal_value];
5111 # If including the name, no need to indent, as the
5112 # name will already be way across the line.
5113 $comment_indent = ($include_name) ? 0 : 60;
5116 # Use any passed in routine to output the base part of
5118 if (ref $range_size_1 eq 'CODE') {
5119 my $base_part = &{$range_size_1}($i, $value);
5121 push @OUT, $base_part;
5124 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5127 # And add the annotation.
5128 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5130 $comment if $comment;
5134 } # End of loop through all the table's ranges
5137 # Add anything that goes after the main body, but within the here
5139 my $append_to_body = $self->append_to_body;
5140 push @OUT, $append_to_body if $append_to_body;
5142 # And finish the here document.
5145 # Done with the main portion of the body. Can now figure out what
5146 # should appear before it in the file.
5147 my $pre_body = $self->pre_body;
5148 push @HEADER, $pre_body, "\n" if $pre_body;
5150 # All these files have a .pl suffix
5151 $file_path{$addr}->[-1] .= '.pl';
5153 main::write($file_path{$addr},
5154 $output_names, # utf8 iff annotating
5160 sub set_status { # Set the table's status
5162 my $status = shift; # The status enum value
5163 my $info = shift; # Any message associated with it.
5164 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5166 my $addr = do { no overloading; pack 'J', $self; };
5168 $status{$addr} = $status;
5169 $status_info{$addr} = $info;
5174 # Don't allow changes to the table from now on. This stores a stack
5175 # trace of where it was called, so that later attempts to modify it
5176 # can immediately show where it got locked.
5179 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5181 my $addr = do { no overloading; pack 'J', $self; };
5183 $locked{$addr} = "";
5185 my $line = (caller(0))[2];
5188 # Accumulate the stack trace
5190 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5192 last unless defined $caller;
5194 $locked{$addr} .= " called from $caller() at line $line\n";
5195 $line = $caller_line;
5197 $locked{$addr} .= " called from main at line $line\n";
5202 sub carp_if_locked {
5203 # Return whether a table is locked or not, and, by the way, complain
5207 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5209 my $addr = do { no overloading; pack 'J', $self; };
5211 return 0 if ! $locked{$addr};
5212 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5216 sub set_file_path { # Set the final directory path for this table
5218 # Rest of parameters passed on
5221 @{$file_path{pack 'J', $self}} = @_;
5225 # Accessors for the range list stored in this table. First for
5234 matches_identically_to
5248 return $range_list{pack 'J', $self}->$sub(@_);
5252 # Then for ones that should fail if locked
5262 return if $self->carp_if_locked;
5264 return $range_list{pack 'J', $self}->$sub(@_);
5271 use base '_Base_Table';
5273 # A Map Table is a table that contains the mappings from code points to
5274 # values. There are two weird cases:
5275 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5276 # are written in the table's file at the end of the table nonetheless. It
5277 # requires specially constructed code to handle these; utf8.c can not read
5278 # these in, so they should not go in $map_directory. As of this writing,
5279 # the only case that these happen is for named sequences used in
5280 # charnames.pm. But this code doesn't enforce any syntax on these, so
5281 # something else could come along that uses it.
5282 # 2) Specials are anything that doesn't fit syntactically into the body of the
5283 # table. The ranges for these have a map type of non-zero. The code below
5284 # knows about and handles each possible type. In most cases, these are
5285 # written as part of the header.
5287 # A map table deliberately can't be manipulated at will unlike match tables.
5288 # This is because of the ambiguities having to do with what to do with
5289 # overlapping code points. And there just isn't a need for those things;
5290 # what one wants to do is just query, add, replace, or delete mappings, plus
5291 # write the final result.
5292 # However, there is a method to get the list of possible ranges that aren't in
5293 # this table to use for defaulting missing code point mappings. And,
5294 # map_add_or_replace_non_nulls() does allow one to add another table to this
5295 # one, but it is clearly very specialized, and defined that the other's
5296 # non-null values replace this one's if there is any overlap.
5298 sub trace { return main::trace(@_); }
5302 main::setup_package();
5305 # Many input files omit some entries; this gives what the mapping for the
5306 # missing entries should be
5307 main::set_access('default_map', \%default_map, 'r');
5309 my %anomalous_entries;
5310 # Things that go in the body of the table which don't fit the normal
5311 # scheme of things, like having a range. Not much can be done with these
5312 # once there except to output them. This was created to handle named
5314 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5315 main::set_access('anomalous_entries', # Append singular, read plural
5316 \%anomalous_entries,
5320 # This is a string, solely for documentation, indicating how one can get
5321 # access to this property via the Perl core.
5322 main::set_access('core_access', \%core_access, 'r', 's');
5325 # Boolean as to whether or not to write out this map table
5326 main::set_access('to_output_map', \%to_output_map, 's');
5335 # Optional initialization data for the table.
5336 my $initialize = delete $args{'Initialize'};
5338 my $core_access = delete $args{'Core_Access'};
5339 my $default_map = delete $args{'Default_Map'};
5340 my $property = delete $args{'_Property'};
5341 my $full_name = delete $args{'Full_Name'};
5342 # Rest of parameters passed on
5344 my $range_list = Range_Map->new(Owner => $property);
5346 my $self = $class->SUPER::new(
5348 Complete_Name => $full_name,
5349 Full_Name => $full_name,
5350 _Property => $property,
5351 _Range_List => $range_list,
5354 my $addr = do { no overloading; pack 'J', $self; };
5356 $anomalous_entries{$addr} = [];
5357 $core_access{$addr} = $core_access;
5358 $default_map{$addr} = $default_map;
5360 $self->initialize($initialize) if defined $initialize;
5367 qw("") => "_operator_stringify",
5370 sub _operator_stringify {
5373 my $name = $self->property->full_name;
5374 $name = '""' if $name eq "";
5375 return "Map table for Property '$name'";
5379 # Add a synonym for this table (which means the property itself)
5382 # Rest of parameters passed on.
5384 $self->SUPER::add_alias($name, $self->property, @_);
5389 # Add a range of code points to the list of specially-handled code
5390 # points. $MULTI_CP is assumed if the type of special is not passed
5399 my $type = delete $args{'Type'} || 0;
5400 # Rest of parameters passed on
5402 # Can't change the table if locked.
5403 return if $self->carp_if_locked;
5405 my $addr = do { no overloading; pack 'J', $self; };
5407 $self->_range_list->add_map($lower, $upper,
5414 sub append_to_body {
5415 # Adds to the written HERE document of the table's body any anomalous
5416 # entries in the table..
5419 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5421 my $addr = do { no overloading; pack 'J', $self; };
5423 return "" unless @{$anomalous_entries{$addr}};
5424 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5427 sub map_add_or_replace_non_nulls {
5428 # This adds the mappings in the table $other to $self. Non-null
5429 # mappings from $other override those in $self. It essentially merges
5430 # the two tables, with the second having priority except for null
5435 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5437 return if $self->carp_if_locked;
5439 if (! $other->isa(__PACKAGE__)) {
5440 Carp::my_carp_bug("$other should be a "
5448 my $addr = do { no overloading; pack 'J', $self; };
5449 my $other_addr = do { no overloading; pack 'J', $other; };
5451 local $to_trace = 0 if main::DEBUG;
5453 my $self_range_list = $self->_range_list;
5454 my $other_range_list = $other->_range_list;
5455 foreach my $range ($other_range_list->ranges) {
5456 my $value = $range->value;
5457 next if $value eq "";
5458 $self_range_list->_add_delete('+',
5462 Type => $range->type,
5463 Replace => $UNCONDITIONALLY);
5469 sub set_default_map {
5470 # Define what code points that are missing from the input files should
5475 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5477 my $addr = do { no overloading; pack 'J', $self; };
5479 # Convert the input to the standard equivalent, if any (won't have any
5480 # for $STRING properties)
5481 my $standard = $self->_find_table_from_alias->{$map};
5482 $map = $standard->name if defined $standard;
5484 # Warn if there already is a non-equivalent default map for this
5485 # property. Note that a default map can be a ref, which means that
5486 # what it actually means is delayed until later in the program, and it
5487 # IS permissible to override it here without a message.
5488 my $default_map = $default_map{$addr};
5489 if (defined $default_map
5490 && ! ref($default_map)
5491 && $default_map ne $map
5492 && main::Standardize($map) ne $default_map)
5494 my $property = $self->property;
5495 my $map_table = $property->table($map);
5496 my $default_table = $property->table($default_map);
5497 if (defined $map_table
5498 && defined $default_table
5499 && $map_table != $default_table)
5501 Carp::my_carp("Changing the default mapping for "
5503 . " from $default_map to $map'");
5507 $default_map{$addr} = $map;
5509 # Don't also create any missing table for this map at this point,
5510 # because if we did, it could get done before the main table add is
5511 # done for PropValueAliases.txt; instead the caller will have to make
5512 # sure it exists, if desired.
5517 # Returns boolean: should we write this map table?
5520 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5522 my $addr = do { no overloading; pack 'J', $self; };
5524 # If overridden, use that
5525 return $to_output_map{$addr} if defined $to_output_map{$addr};
5527 my $full_name = $self->full_name;
5529 # If table says to output, do so; if says to suppress it, do do.
5530 return 1 if grep { $_ eq $full_name } @output_mapped_properties;
5531 return 0 if $self->status eq $SUPPRESSED;
5533 my $type = $self->property->type;
5535 # Don't want to output binary map tables even for debugging.
5536 return 0 if $type == $BINARY;
5538 # But do want to output string ones.
5539 return 1 if $type == $STRING;
5541 # Otherwise is an $ENUM, don't output it
5546 # Returns a Range_List that is gaps of the current table. That is,
5550 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5552 my $current = Range_List->new(Initialize => $self->_range_list,
5553 Owner => $self->property);
5557 sub set_final_comment {
5558 # Just before output, create the comment that heads the file
5559 # containing this table.
5562 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5564 # No sense generating a comment if aren't going to write it out.
5565 return if ! $self->to_output_map;
5567 my $addr = do { no overloading; pack 'J', $self; };
5569 my $property = $self->property;
5571 # Get all the possible names for this property. Don't use any that
5572 # aren't ok for use in a file name, etc. This is perhaps causing that
5573 # flag to do double duty, and may have to be changed in the future to
5574 # have our own flag for just this purpose; but it works now to exclude
5575 # Perl generated synonyms from the lists for properties, where the
5576 # name is always the proper Unicode one.
5577 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5579 my $count = $self->count;
5580 my $default_map = $default_map{$addr};
5582 # The ranges that map to the default aren't output, so subtract that
5583 # to get those actually output. A property with matching tables
5584 # already has the information calculated.
5585 if ($property->type != $STRING) {
5586 $count -= $property->table($default_map)->count;
5588 elsif (defined $default_map) {
5590 # But for $STRING properties, must calculate now. Subtract the
5591 # count from each range that maps to the default.
5592 foreach my $range ($self->_range_list->ranges) {
5593 if ($range->value eq $default_map) {
5594 $count -= $range->end +1 - $range->start;
5600 # Get a string version of $count with underscores in large numbers,
5602 my $string_count = main::clarify_number($count);
5604 my $code_points = ($count == 1)
5605 ? 'single code point'
5606 : "$string_count code points";
5611 if (@property_aliases <= 1) {
5612 $mapping = 'mapping';
5613 $these_mappings = 'this mapping';
5617 $mapping = 'synonymous mappings';
5618 $these_mappings = 'these mappings';
5622 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5623 $cp = "any code point in Unicode Version $string_version";
5627 if ($default_map eq "") {
5628 $map_to = 'the null string';
5630 elsif ($default_map eq $CODE_POINT) {
5634 $map_to = "'$default_map'";
5637 $cp = "the single code point";
5640 $cp = "one of the $code_points";
5642 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5647 my $status = $self->status;
5649 my $warn = uc $status_past_participles{$status};
5652 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5653 All property or property=value combinations contained in this file are $warn.
5654 See $unicode_reference_url for what this means.
5658 $comment .= "This file returns the $mapping:\n";
5660 for my $i (0 .. @property_aliases - 1) {
5661 $comment .= sprintf("%-8s%s\n",
5663 $property_aliases[$i]->name . '(cp)'
5667 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5669 my $access = $core_access{$addr};
5671 $comment .= "accessible through the Perl core via $access.";
5674 $comment .= "not accessible through the Perl core directly.";
5677 # And append any commentary already set from the actual property.
5678 $comment .= "\n\n" . $self->comment if $self->comment;
5679 if ($self->description) {
5680 $comment .= "\n\n" . join " ", $self->description;
5683 $comment .= "\n\n" . join " ", $self->note;
5687 if (! $self->perl_extension) {
5690 For information about what this property really means, see:
5691 $unicode_reference_url
5695 if ($count) { # Format differs for empty table
5696 $comment.= "\nThe format of the ";
5697 if ($self->range_size_1) {
5699 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5700 is in hex; MAPPING is what CODE_POINT maps to.
5705 # There are tables which end up only having one element per
5706 # range, but it is not worth keeping track of for making just
5707 # this comment a little better.
5709 non-comment portions of the main body of lines of this file is:
5710 START\\tSTOP\\tMAPPING where START is the starting code point of the
5711 range, in hex; STOP is the ending point, or if omitted, the range has just one
5712 code point; MAPPING is what each code point between START and STOP maps to.
5714 if ($self->output_range_counts) {
5716 Numbers in comments in [brackets] indicate how many code points are in the
5717 range (omitted when the range is a single code point or if the mapping is to
5723 $self->set_comment(main::join_lines($comment));
5727 my %swash_keys; # Makes sure don't duplicate swash names.
5729 # The remaining variables are temporaries used while writing each table,
5730 # to output special ranges.
5731 my $has_hangul_syllables;
5732 my @multi_code_point_maps; # Map is to more than one code point.
5734 # The key is the base name of the code point, and the value is an
5735 # array giving all the ranges that use this base name. Each range
5736 # is actually a hash giving the 'low' and 'high' values of it.
5737 my %names_ending_in_code_point;
5739 # Inverse mapping. The list of ranges that have these kinds of
5740 # names. Each element contains the low, high, and base names in a
5742 my @code_points_ending_in_code_point;
5744 sub handle_special_range {
5745 # Called in the middle of write when it finds a range it doesn't know
5750 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5752 my $addr = do { no overloading; pack 'J', $self; };
5754 my $type = $range->type;
5756 my $low = $range->start;
5757 my $high = $range->end;
5758 my $map = $range->value;
5760 # No need to output the range if it maps to the default.
5761 return if $map eq $default_map{$addr};
5763 # Switch based on the map type...
5764 if ($type == $HANGUL_SYLLABLE) {
5766 # These are entirely algorithmically determinable based on
5767 # some constants furnished by Unicode; for now, just set a
5768 # flag to indicate that have them. After everything is figured
5769 # out, we will output the code that does the algorithm.
5770 $has_hangul_syllables = 1;
5772 elsif ($type == $CP_IN_NAME) {
5774 # Code points whose the name ends in their code point are also
5775 # algorithmically determinable, but need information about the map
5776 # to do so. Both the map and its inverse are stored in data
5777 # structures output in the file.
5778 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5779 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5781 push @code_points_ending_in_code_point, { low => $low,
5786 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5788 # Multi-code point maps and null string maps have an entry
5789 # for each code point in the range. They use the same
5791 for my $code_point ($low .. $high) {
5793 # The pack() below can't cope with surrogates.
5794 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5795 Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
5799 # Generate the hash entries for these in the form that
5800 # utf8.c understands.
5804 foreach my $to (split " ", $map) {
5805 if ($to !~ /^$code_point_re$/) {
5806 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5809 $tostr .= sprintf "\\x{%s}", $to;
5810 $to = CORE::hex $to;
5811 if ($output_names) {
5812 $to_name .= " + " if $to_name;
5813 $to_chr .= chr($to);
5814 main::populate_char_info($to)
5815 if ! defined $viacode[$to];
5816 $to_name .= $viacode[$to];
5820 # I (khw) have never waded through this line to
5821 # understand it well enough to comment it.
5822 my $utf8 = sprintf(qq["%s" => "$tostr",],
5823 join("", map { sprintf "\\x%02X", $_ }
5824 unpack("U0C*", pack("U", $code_point))));
5826 # Add a comment so that a human reader can more easily
5827 # see what's going on.
5828 push @multi_code_point_maps,
5829 sprintf("%-45s # U+%04X", $utf8, $code_point);
5830 if (! $output_names) {
5831 $multi_code_point_maps[-1] .= " => $map";
5834 main::populate_char_info($code_point)
5835 if ! defined $viacode[$code_point];
5836 $multi_code_point_maps[-1] .= " '"
5838 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
5843 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
5850 # Returns the string that should be output in the file before the main
5851 # body of this table. It isn't called until the main body is
5852 # calculated, saving a pass. The string includes some hash entries
5853 # identifying the format of the body, and what the single value should
5854 # be for all ranges missing from it. It also includes any code points
5855 # which have map_types that don't go in the main table.
5858 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5860 my $addr = do { no overloading; pack 'J', $self; };
5862 my $name = $self->property->swash_name;
5864 if (defined $swash_keys{$name}) {
5865 Carp::my_carp(join_lines(<<END
5866 Already created a swash name '$name' for $swash_keys{$name}. This means that
5867 the same name desired for $self shouldn't be used. Bad News. This must be
5868 fixed before production use, but proceeding anyway
5872 $swash_keys{$name} = "$self";
5876 # Here we assume we were called after have gone through the whole
5877 # file. If we actually generated anything for each map type, add its
5878 # respective header and trailer
5879 if (@multi_code_point_maps) {
5882 # Some code points require special handling because their mappings are each to
5883 # multiple code points. These do not appear in the main body, but are defined
5884 # in the hash below.
5886 # Each key is the string of N bytes that together make up the UTF-8 encoding
5887 # for the code point. (i.e. the same as looking at the code point's UTF-8
5888 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
5889 %utf8::ToSpec$name = (
5891 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5894 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5896 # Convert these structures to output format.
5897 my $code_points_ending_in_code_point =
5898 main::simple_dumper(\@code_points_ending_in_code_point,
5900 my $names = main::simple_dumper(\%names_ending_in_code_point,
5903 # Do the same with the Hangul names,
5909 if ($has_hangul_syllables) {
5911 # Construct a regular expression of all the possible
5912 # combinations of the Hangul syllables.
5913 my @L_re; # Leading consonants
5914 for my $i ($LBase .. $LBase + $LCount - 1) {
5915 push @L_re, $Jamo{$i}
5917 my @V_re; # Middle vowels
5918 for my $i ($VBase .. $VBase + $VCount - 1) {
5919 push @V_re, $Jamo{$i}
5921 my @T_re; # Trailing consonants
5922 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5923 push @T_re, $Jamo{$i}
5926 # The whole re is made up of the L V T combination.
5928 . join ('|', sort @L_re)
5930 . join ('|', sort @V_re)
5932 . join ('|', sort @T_re)
5935 # These hashes needed by the algorithm were generated
5936 # during reading of the Jamo.txt file
5937 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5938 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5939 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5940 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5945 # To achieve significant memory savings when this file is read in,
5946 # algorithmically derivable code points are omitted from the main body below.
5947 # Instead, the following routines can be used to translate between name and
5948 # code point and vice versa
5952 # Matches legal code point. 4-6 hex numbers, If there are 6, the
5953 # first two must be '10'; if there are 5, the first must not be a '0'.
5954 my \$code_point_re = qr/$code_point_re/;
5956 # In the following hash, the keys are the bases of names which includes
5957 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
5958 # of each key is another hash which is used to get the low and high ends
5959 # for each range of code points that apply to the name
5960 my %names_ending_in_code_point = (
5964 # And the following array gives the inverse mapping from code points to
5965 # names. Lowest code points are first
5966 my \@code_points_ending_in_code_point = (
5967 $code_points_ending_in_code_point
5970 # Earlier releases didn't have Jamos. No sense outputting
5971 # them unless will be used.
5972 if ($has_hangul_syllables) {
5975 # Convert from code point to Jamo short name for use in composing Hangul
5981 # Leading consonant (can be null)
5991 # Optional trailing consonant
5996 # Computed re that splits up a Hangul name into LVT or LV syllables
5997 my \$syllable_re = qr/$jamo_re/;
5999 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
6000 my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
6002 # These constants names and values were taken from the Unicode standard,
6003 # version 5.1, section 3.12. They are used in conjunction with Hangul
6005 my \$SBase = $SBase_string;
6006 my \$LBase = $LBase_string;
6007 my \$VBase = $VBase_string;
6008 my \$TBase = $TBase_string;
6009 my \$SCount = $SCount;
6010 my \$LCount = $LCount;
6011 my \$VCount = $VCount;
6012 my \$TCount = $TCount;
6013 my \$NCount = \$VCount * \$TCount;
6015 } # End of has Jamos
6017 $pre_body .= << 'END';
6019 sub name_to_code_point_special {
6022 # Returns undef if not one of the specially handled names; otherwise
6023 # returns the code point equivalent to the input name
6025 if ($has_hangul_syllables) {
6026 $pre_body .= << 'END';
6028 if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
6029 $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
6030 return if $name !~ qr/^$syllable_re$/;
6031 my $L = $Jamo_L{$1};
6032 my $V = $Jamo_V{$2};
6033 my $T = (defined $3) ? $Jamo_T{$3} : 0;
6034 return ($L * $VCount + $V) * $TCount + $T + $SBase;
6038 $pre_body .= << 'END';
6040 # Name must end in '-code_point' for this to handle.
6041 if ($name !~ /^ (.*) - ($code_point_re) $/x) {
6046 my $code_point = CORE::hex $2;
6048 # Name must be one of the ones which has the code point in it.
6049 return if ! $names_ending_in_code_point{$base};
6051 # Look through the list of ranges that apply to this name to see if
6052 # the code point is in one of them.
6053 for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
6054 return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
6055 next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
6057 # Here, the code point is in the range.
6061 # Here, looked like the name had a code point number in it, but
6062 # did not match one of the valid ones.
6066 sub code_point_to_name_special {
6067 my $code_point = shift;
6069 # Returns the name of a code point if algorithmically determinable;
6072 if ($has_hangul_syllables) {
6073 $pre_body .= << 'END';
6075 # If in the Hangul range, calculate the name based on Unicode's
6077 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
6079 my $SIndex = $code_point - $SBase;
6080 my $L = $LBase + $SIndex / $NCount;
6081 my $V = $VBase + ($SIndex % $NCount) / $TCount;
6082 my $T = $TBase + $SIndex % $TCount;
6083 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
6084 $name .= $Jamo{$T} if $T != $TBase;
6089 $pre_body .= << 'END';
6091 # Look through list of these code points for one in range.
6092 foreach my $hash (@code_points_ending_in_code_point) {
6093 return if $code_point < $hash->{'low'};
6094 if ($code_point <= $hash->{'high'}) {
6095 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
6098 return; # None found
6103 } # End of has hangul or code point in name maps.
6105 my $format = $self->format;
6108 # The name this swash is to be known by, with the format of the mappings in
6109 # the main body of the table, and what all code points missing from this file
6111 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6113 my $default_map = $default_map{$addr};
6114 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6116 if ($default_map eq $CODE_POINT) {
6117 $return .= ' # code point maps to itself';
6119 elsif ($default_map eq "") {
6120 $return .= ' # code point maps to the null string';
6124 $return .= $pre_body;
6130 # Write the table to the file.
6133 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6135 my $addr = do { no overloading; pack 'J', $self; };
6137 # Clear the temporaries
6138 $has_hangul_syllables = 0;
6139 undef @multi_code_point_maps;
6140 undef %names_ending_in_code_point;
6141 undef @code_points_ending_in_code_point;
6143 # Calculate the format of the table if not already done.
6144 my $format = $self->format;
6145 my $type = $self->property->type;
6146 my $default_map = $self->default_map;
6147 if (! defined $format) {
6148 if ($type == $BINARY) {
6150 # Don't bother checking the values, because we elsewhere
6151 # verify that a binary table has only 2 values.
6152 $format = $BINARY_FORMAT;
6155 my @ranges = $self->_range_list->ranges;
6157 # default an empty table based on its type and default map
6160 # But it turns out that the only one we can say is a
6161 # non-string (besides binary, handled above) is when the
6162 # table is a string and the default map is to a code point
6163 if ($type == $STRING && $default_map eq $CODE_POINT) {
6164 $format = $HEX_FORMAT;
6167 $format = $STRING_FORMAT;
6172 # Start with the most restrictive format, and as we find
6173 # something that doesn't fit with that, change to the next
6174 # most restrictive, and so on.
6175 $format = $DECIMAL_FORMAT;
6176 foreach my $range (@ranges) {
6177 next if $range->type != 0; # Non-normal ranges don't
6178 # affect the main body
6179 my $map = $range->value;
6180 if ($map ne $default_map) {
6181 last if $format eq $STRING_FORMAT; # already at
6184 $format = $INTEGER_FORMAT
6185 if $format eq $DECIMAL_FORMAT
6186 && $map !~ / ^ [0-9] $ /x;
6187 $format = $FLOAT_FORMAT
6188 if $format eq $INTEGER_FORMAT
6189 && $map !~ / ^ -? [0-9]+ $ /x;
6190 $format = $RATIONAL_FORMAT
6191 if $format eq $FLOAT_FORMAT
6192 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6193 $format = $HEX_FORMAT
6194 if $format eq $RATIONAL_FORMAT
6195 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6196 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6197 && $map =~ /[^0-9A-F]/;
6202 } # end of calculating format
6204 if ($default_map eq $CODE_POINT
6205 && $format ne $HEX_FORMAT
6206 && ! defined $self->format) # manual settings are always
6209 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6212 $self->_set_format($format);
6214 return $self->SUPER::write(
6215 ($self->property == $block)
6216 ? 7 # block file needs more tab stops
6218 $default_map); # don't write defaulteds
6221 # Accessors for the underlying list that should fail if locked.
6231 return if $self->carp_if_locked;
6232 return $self->_range_list->$sub(@_);
6235 } # End closure for Map_Table
6237 package Match_Table;
6238 use base '_Base_Table';
6240 # A Match table is one which is a list of all the code points that have
6241 # the same property and property value, for use in \p{property=value}
6242 # constructs in regular expressions. It adds very little data to the base
6243 # structure, but many methods, as these lists can be combined in many ways to
6245 # There are only a few concepts added:
6246 # 1) Equivalents and Relatedness.
6247 # Two tables can match the identical code points, but have different names.
6248 # This always happens when there is a perl single form extension
6249 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6250 # tables are set to be related, with the Perl extension being a child, and
6251 # the Unicode property being the parent.
6253 # It may be that two tables match the identical code points and we don't
6254 # know if they are related or not. This happens most frequently when the
6255 # Block and Script properties have the exact range. But note that a
6256 # revision to Unicode could add new code points to the script, which would
6257 # now have to be in a different block (as the block was filled, or there
6258 # would have been 'Unknown' script code points in it and they wouldn't have
6259 # been identical). So we can't rely on any two properties from Unicode
6260 # always matching the same code points from release to release, and thus
6261 # these tables are considered coincidentally equivalent--not related. When
6262 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
6263 # 'leader', and the others are 'equivalents'. This concept is useful
6264 # to minimize the number of tables written out. Only one file is used for
6265 # any identical set of code points, with entries in Heavy.pl mapping all
6266 # the involved tables to it.
6268 # Related tables will always be identical; we set them up to be so. Thus
6269 # if the Unicode one is deprecated, the Perl one will be too. Not so for
6270 # unrelated tables. Relatedness makes generating the documentation easier.
6272 # 2) Conflicting. It may be that there will eventually be name clashes, with
6273 # the same name meaning different things. For a while, there actually were
6274 # conflicts, but they have so far been resolved by changing Perl's or
6275 # Unicode's definitions to match the other, but when this code was written,
6276 # it wasn't clear that that was what was going to happen. (Unicode changed
6277 # because of protests during their beta period.) Name clashes are warned
6278 # about during compilation, and the documentation. The generated tables
6279 # are sane, free of name clashes, because the code suppresses the Perl
6280 # version. But manual intervention to decide what the actual behavior
6281 # should be may be required should this happen. The introductory comments
6282 # have more to say about this.
6284 sub standardize { return main::standardize($_[0]); }
6285 sub trace { return main::trace(@_); }
6290 main::setup_package();
6293 # The leader table of this one; initially $self.
6294 main::set_access('leader', \%leader, 'r');
6297 # An array of any tables that have this one as their leader
6298 main::set_access('equivalents', \%equivalents, 'readable_array');
6301 # The parent table to this one, initially $self. This allows us to
6302 # distinguish between equivalent tables that are related, and those which
6303 # may not be, but share the same output file because they match the exact
6304 # same set of code points in the current Unicode release.
6305 main::set_access('parent', \%parent, 'r');
6308 # An array of any tables that have this one as their parent
6309 main::set_access('children', \%children, 'readable_array');
6312 # Array of any tables that would have the same name as this one with
6313 # a different meaning. This is used for the generated documentation.
6314 main::set_access('conflicting', \%conflicting, 'readable_array');
6317 # Set in the constructor for tables that are expected to match all code
6319 main::set_access('matches_all', \%matches_all, 'r');
6326 # The property for which this table is a listing of property values.
6327 my $property = delete $args{'_Property'};
6329 my $name = delete $args{'Name'};
6330 my $full_name = delete $args{'Full_Name'};
6331 $full_name = $name if ! defined $full_name;
6334 my $initialize = delete $args{'Initialize'};
6335 my $matches_all = delete $args{'Matches_All'} || 0;
6336 my $format = delete $args{'Format'};
6337 # Rest of parameters passed on.
6339 my $range_list = Range_List->new(Initialize => $initialize,
6340 Owner => $property);
6342 my $complete = $full_name;
6343 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6344 # but this helps debug if it
6346 # The complete name for a match table includes it's property in a
6347 # compound form 'property=table', except if the property is the
6348 # pseudo-property, perl, in which case it is just the single form,
6349 # 'table' (If you change the '=' must also change the ':' in lots of
6350 # places in this program that assume an equal sign)
6351 $complete = $property->full_name . "=$complete" if $property != $perl;
6353 my $self = $class->SUPER::new(%args,
6355 Complete_Name => $complete,
6356 Full_Name => $full_name,
6357 _Property => $property,
6358 _Range_List => $range_list,
6359 Format => $EMPTY_FORMAT,
6361 my $addr = do { no overloading; pack 'J', $self; };
6363 $conflicting{$addr} = [ ];
6364 $equivalents{$addr} = [ ];
6365 $children{$addr} = [ ];
6366 $matches_all{$addr} = $matches_all;
6367 $leader{$addr} = $self;
6368 $parent{$addr} = $self;
6370 if (defined $format && $format ne $EMPTY_FORMAT) {
6371 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6377 # See this program's beginning comment block about overloading these.
6380 qw("") => "_operator_stringify",
6384 return if $self->carp_if_locked;
6392 return $self->_range_list + $other;
6398 return $self->_range_list & $other;
6404 return if $self->carp_if_locked;
6406 my $addr = do { no overloading; pack 'J', $self; };
6410 # Change the range list of this table to be the
6412 $self->_set_range_list($self->_range_list
6415 else { # $other is just a simple value
6416 $self->add_range($other, $other);
6420 '-' => sub { my $self = shift;
6422 my $reversed = shift;
6425 Carp::my_carp_bug("Can't cope with a "
6427 . " being the first parameter in a '-'. Subtraction ignored.");
6431 return $self->_range_list - $other;
6433 '~' => sub { my $self = shift;
6434 return ~ $self->_range_list;
6438 sub _operator_stringify {
6441 my $name = $self->complete_name;
6442 return "Table '$name'";
6446 # Add a synonym for this table. See the comments in the base class
6450 # Rest of parameters passed on.
6452 $self->SUPER::add_alias($name, $self, @_);
6456 sub add_conflicting {
6457 # Add the name of some other object to the list of ones that name
6458 # clash with this match table.
6461 my $conflicting_name = shift; # The name of the conflicting object
6462 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6463 my $conflicting_object = shift; # Optional, the conflicting object
6464 # itself. This is used to
6465 # disambiguate the text if the input
6466 # name is identical to any of the
6467 # aliases $self is known by.
6468 # Sometimes the conflicting object is
6469 # merely hypothetical, so this has to
6470 # be an optional parameter.
6471 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6473 my $addr = do { no overloading; pack 'J', $self; };
6475 # Check if the conflicting name is exactly the same as any existing
6476 # alias in this table (as long as there is a real object there to
6477 # disambiguate with).
6478 if (defined $conflicting_object) {
6479 foreach my $alias ($self->aliases) {
6480 if ($alias->name eq $conflicting_name) {
6482 # Here, there is an exact match. This results in
6483 # ambiguous comments, so disambiguate by changing the
6484 # conflicting name to its object's complete equivalent.
6485 $conflicting_name = $conflicting_object->complete_name;
6491 # Convert to the \p{...} final name
6492 $conflicting_name = "\\$p" . "{$conflicting_name}";
6495 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6497 push @{$conflicting{$addr}}, $conflicting_name;
6502 sub is_set_equivalent_to {
6503 # Return boolean of whether or not the other object is a table of this
6504 # type and has been marked equivalent to this one.
6508 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6510 return 0 if ! defined $other; # Can happen for incomplete early
6512 unless ($other->isa(__PACKAGE__)) {
6513 my $ref_other = ref $other;
6514 my $ref_self = ref $self;
6515 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6519 # Two tables are equivalent if they have the same leader.
6521 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6525 sub set_equivalent_to {
6526 # Set $self equivalent to the parameter table.
6527 # The required Related => 'x' parameter is a boolean indicating
6528 # whether these tables are related or not. If related, $other becomes
6529 # the 'parent' of $self; if unrelated it becomes the 'leader'
6531 # Related tables share all characteristics except names; equivalents
6532 # not quite so many.
6533 # If they are related, one must be a perl extension. This is because
6534 # we can't guarantee that Unicode won't change one or the other in a
6535 # later release even if they are idential now.
6541 my $related = delete $args{'Related'};
6543 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6545 return if ! defined $other; # Keep on going; happens in some early
6548 if (! defined $related) {
6549 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6553 # If already are equivalent, no need to re-do it; if subroutine
6554 # returns null, it found an error, also do nothing
6555 my $are_equivalent = $self->is_set_equivalent_to($other);
6556 return if ! defined $are_equivalent || $are_equivalent;
6558 my $addr = do { no overloading; pack 'J', $self; };
6559 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6562 ! $other->perl_extension
6563 && ! $current_leader->perl_extension)
6565 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6569 my $leader = do { no overloading; pack 'J', $current_leader; };
6570 my $other_addr = do { no overloading; pack 'J', $other; };
6572 # Any tables that are equivalent to or children of this table must now
6573 # instead be equivalent to or (children) to the new leader (parent),
6574 # still equivalent. The equivalency includes their matches_all info,
6575 # and for related tables, their status
6576 # All related tables are of necessity equivalent, but the converse
6577 # isn't necessarily true
6578 my $status = $other->status;
6579 my $status_info = $other->status_info;
6580 my $matches_all = $matches_all{other_addr};
6581 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6582 next if $table == $other;
6583 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6585 my $table_addr = do { no overloading; pack 'J', $table; };
6586 $leader{$table_addr} = $other;
6587 $matches_all{$table_addr} = $matches_all;
6588 $self->_set_range_list($other->_range_list);
6589 push @{$equivalents{$other_addr}}, $table;
6591 $parent{$table_addr} = $other;
6592 push @{$children{$other_addr}}, $table;
6593 $table->set_status($status, $status_info);
6597 # Now that we've declared these to be equivalent, any changes to one
6598 # of the tables would invalidate that equivalency.
6604 sub add_range { # Add a range to the list for this table.
6606 # Rest of parameters passed on
6608 return if $self->carp_if_locked;
6609 return $self->_range_list->add_range(@_);
6612 sub pre_body { # Does nothing for match tables.
6616 sub append_to_body { # Does nothing for match tables.
6622 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6624 return $self->SUPER::write(2); # 2 tab stops
6627 sub set_final_comment {
6628 # This creates a comment for the file that is to hold the match table
6629 # $self. It is somewhat convoluted to make the English read nicely,
6630 # but, heh, it's just a comment.
6631 # This should be called only with the leader match table of all the
6632 # ones that share the same file. It lists all such tables, ordered so
6633 # that related ones are together.
6635 my $leader = shift; # Should only be called on the leader table of
6636 # an equivalent group
6637 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6639 my $addr = do { no overloading; pack 'J', $leader; };
6641 if ($leader{$addr} != $leader) {
6642 Carp::my_carp_bug(<<END
6643 set_final_comment() must be called on a leader table, which $leader is not.
6644 It is equivalent to $leader{$addr}. No comment created
6650 # Get the number of code points matched by each of the tables in this
6651 # file, and add underscores for clarity.
6652 my $count = $leader->count;
6653 my $string_count = main::clarify_number($count);
6655 my $loose_count = 0; # how many aliases loosely matched
6656 my $compound_name = ""; # ? Are any names compound?, and if so, an
6658 my $properties_with_compound_names = 0; # count of these
6661 my %flags; # The status flags used in the file
6662 my $total_entries = 0; # number of entries written in the comment
6663 my $matches_comment = ""; # The portion of the comment about the
6665 my @global_comments; # List of all the tables' comments that are
6666 # there before this routine was called.
6668 # Get list of all the parent tables that are equivalent to this one
6669 # (including itself).
6670 my @parents = grep { $parent{main::objaddr $_} == $_ }
6671 main::uniques($leader, @{$equivalents{$addr}});
6672 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6675 for my $parent (@parents) {
6677 my $property = $parent->property;
6679 # Special case 'N' tables in properties with two match tables when
6680 # the other is a 'Y' one. These are likely to be binary tables,
6681 # but not necessarily. In either case, \P{} will match the
6682 # complement of \p{}, and so if something is a synonym of \p, the
6683 # complement of that something will be the synonym of \P. This
6684 # would be true of any property with just two match tables, not
6685 # just those whose values are Y and N; but that would require a
6686 # little extra work, and there are none such so far in Unicode.
6687 my $perl_p = 'p'; # which is it? \p{} or \P{}
6688 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6690 if (scalar $property->tables == 2
6691 && $parent == $property->table('N')
6692 && defined (my $yes = $property->table('Y')))
6694 my $yes_addr = do { no overloading; pack 'J', $yes; };
6696 = grep { $_->property == $perl }
6699 $parent{$yes_addr}->children);
6701 # But these synonyms are \P{} ,not \p{}
6705 my @description; # Will hold the table description
6706 my @note; # Will hold the table notes.
6707 my @conflicting; # Will hold the table conflicts.
6709 # Look at the parent, any yes synonyms, and all the children
6710 my $parent_addr = do { no overloading; pack 'J', $parent; };
6711 for my $table ($parent,
6713 @{$children{$parent_addr}})
6715 my $table_addr = do { no overloading; pack 'J', $table; };
6716 my $table_property = $table->property;
6718 # Tables are separated by a blank line to create a grouping.
6719 $matches_comment .= "\n" if $matches_comment;
6721 # The table is named based on the property and value
6722 # combination it is for, like script=greek. But there may be
6723 # a number of synonyms for each side, like 'sc' for 'script',
6724 # and 'grek' for 'greek'. Any combination of these is a valid
6725 # name for this table. In this case, there are three more,
6726 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6727 # listing all possible combinations in the comment, we make
6728 # sure that each synonym occurs at least once, and add
6729 # commentary that the other combinations are possible.
6730 my @property_aliases = $table_property->aliases;
6731 my @table_aliases = $table->aliases;
6733 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6735 # The alias lists above are already ordered in the order we
6736 # want to output them. To ensure that each synonym is listed,
6737 # we must use the max of the two numbers.
6738 my $listed_combos = main::max(scalar @table_aliases,
6739 scalar @property_aliases);
6740 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6742 my $property_had_compound_name = 0;
6744 for my $i (0 .. $listed_combos - 1) {
6747 # The current alias for the property is the next one on
6748 # the list, or if beyond the end, start over. Similarly
6749 # for the table (\p{prop=table})
6750 my $property_alias = $property_aliases
6751 [$i % @property_aliases]->name;
6752 my $table_alias_object = $table_aliases
6753 [$i % @table_aliases];
6754 my $table_alias = $table_alias_object->name;
6755 my $loose_match = $table_alias_object->loose_match;
6757 if ($table_alias !~ /\D/) { # Clarify large numbers.
6758 $table_alias = main::clarify_number($table_alias)
6761 # Add a comment for this alias combination
6762 my $current_match_comment;
6763 if ($table_property == $perl) {
6764 $current_match_comment = "\\$perl_p"
6768 $current_match_comment
6769 = "\\p{$property_alias=$table_alias}";
6770 $property_had_compound_name = 1;
6773 # Flag any abnormal status for this table.
6774 my $flag = $property->status
6776 || $table_alias_object->status;
6778 if ($flag ne $PLACEHOLDER) {
6779 $flags{$flag} = $status_past_participles{$flag};
6781 $flags{$flag} = <<END;
6782 a placeholder because it is not in Version $string_version of Unicode, but is
6783 needed by the Perl core to work gracefully. Because it is not in this version
6784 of Unicode, it will not be listed in $pod_file.pod
6791 # Pretty up the comment. Note the \b; it says don't make
6792 # this line a continuation.
6793 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6796 $current_match_comment);
6797 } # End of generating the entries for this table.
6799 # Save these for output after this group of related tables.
6800 push @description, $table->description;
6801 push @note, $table->note;
6802 push @conflicting, $table->conflicting;
6804 # And this for output after all the tables.
6805 push @global_comments, $table->comment;
6807 # Compute an alternate compound name using the final property
6808 # synonym and the first table synonym with a colon instead of
6809 # the equal sign used elsewhere.
6810 if ($property_had_compound_name) {
6811 $properties_with_compound_names ++;
6812 if (! $compound_name || @property_aliases > 1) {
6813 $compound_name = $property_aliases[-1]->name
6815 . $table_aliases[0]->name;
6818 } # End of looping through all children of this table
6820 # Here have assembled in $matches_comment all the related tables
6821 # to the current parent (preceded by the same info for all the
6822 # previous parents). Put out information that applies to all of
6823 # the current family.
6826 # But output the conflicting information now, as it applies to
6828 my $conflicting = join ", ", @conflicting;
6830 $matches_comment .= <<END;
6832 Note that contrary to what you might expect, the above is NOT the same as
6834 $matches_comment .= "any of: " if @conflicting > 1;
6835 $matches_comment .= "$conflicting\n";
6839 $matches_comment .= "\n Meaning: "
6840 . join('; ', @description)
6844 $matches_comment .= "\n Note: "
6845 . join("\n ", @note)
6848 } # End of looping through all tables
6856 $code_points = 'single code point';
6860 $code_points = "$string_count code points";
6865 if ($total_entries <= 1) {
6868 $any_of_these = 'this'
6871 $synonyms = " any of the following regular expression constructs";
6872 $entries = 'entries';
6873 $any_of_these = 'any of these'
6877 if ($has_unrelated) {
6879 This file is for tables that are not necessarily related: To conserve
6880 resources, every table that matches the identical set of code points in this
6881 version of Unicode uses this file. Each one is listed in a separate group
6882 below. It could be that the tables will match the same set of code points in
6883 other Unicode releases, or it could be purely coincidence that they happen to
6884 be the same in Unicode $string_version, and hence may not in other versions.
6890 foreach my $flag (sort keys %flags) {
6892 '$flag' below means that this form is $flags{$flag}.
6894 next if $flag eq $PLACEHOLDER;
6895 $comment .= "Consult $pod_file.pod\n";
6901 This file returns the $code_points in Unicode Version $string_version that
6905 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
6906 including if adding or subtracting white space, underscore, and hyphen
6907 characters matters or doesn't matter, and other permissible syntactic
6908 variants. Upper/lower case distinctions never matter.
6911 if ($compound_name) {
6914 A colon can be substituted for the equals sign, and
6916 if ($properties_with_compound_names > 1) {
6918 within each group above,
6921 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6923 # Note the \b below, it says don't make that line a continuation.
6925 anything to the left of the equals (or colon) can be combined with anything to
6926 the right. Thus, for example,
6932 # And append any comment(s) from the actual tables. They are all
6933 # gathered here, so may not read all that well.
6934 if (@global_comments) {
6935 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6938 if ($count) { # The format differs if no code points, and needs no
6939 # explanation in that case
6942 The format of the lines of this file is:
6945 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6946 STOP is the ending point, or if omitted, the range has just one code point.
6948 if ($leader->output_range_counts) {
6950 Numbers in comments in [brackets] indicate how many code points are in the
6956 $leader->set_comment(main::join_lines($comment));
6960 # Accessors for the underlying list
6962 get_valid_code_point
6963 get_invalid_code_point
6971 return $self->_range_list->$sub(@_);
6974 } # End closure for Match_Table
6978 # The Property class represents a Unicode property, or the $perl
6979 # pseudo-property. It contains a map table initialized empty at construction
6980 # time, and for properties accessible through regular expressions, various
6981 # match tables, created through the add_match_table() method, and referenced
6982 # by the table('NAME') or tables() methods, the latter returning a list of all
6983 # of the match tables. Otherwise table operations implicitly are for the map
6986 # Most of the data in the property is actually about its map table, so it
6987 # mostly just uses that table's accessors for most methods. The two could
6988 # have been combined into one object, but for clarity because of their
6989 # differing semantics, they have been kept separate. It could be argued that
6990 # the 'file' and 'directory' fields should be kept with the map table.
6992 # Each property has a type. This can be set in the constructor, or in the
6993 # set_type accessor, but mostly it is figured out by the data. Every property
6994 # starts with unknown type, overridden by a parameter to the constructor, or
6995 # as match tables are added, or ranges added to the map table, the data is
6996 # inspected, and the type changed. After the table is mostly or entirely
6997 # filled, compute_type() should be called to finalize they analysis.
6999 # There are very few operations defined. One can safely remove a range from
7000 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7001 # table to this one, replacing any in the intersection of the two.
7003 sub standardize { return main::standardize($_[0]); }
7004 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7008 # This hash will contain as keys, all the aliases of all properties, and
7009 # as values, pointers to their respective property objects. This allows
7010 # quick look-up of a property from any of its names.
7011 my %alias_to_property_of;
7013 sub dump_alias_to_property_of {
7016 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7021 # This is a package subroutine, not called as a method.
7022 # If the single parameter is a literal '*' it returns a list of all
7023 # defined properties.
7024 # Otherwise, the single parameter is a name, and it returns a pointer
7025 # to the corresponding property object, or undef if none.
7027 # Properties can have several different names. The 'standard' form of
7028 # each of them is stored in %alias_to_property_of as they are defined.
7029 # But it's possible that this subroutine will be called with some
7030 # variant, so if the initial lookup fails, it is repeated with the
7031 # standarized form of the input name. If found, besides returning the
7032 # result, the input name is added to the list so future calls won't
7033 # have to do the conversion again.
7037 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7039 if (! defined $name) {
7040 Carp::my_carp_bug("Undefined input property. No action taken.");
7044 return main::uniques(values %alias_to_property_of) if $name eq '*';
7046 # Return cached result if have it.
7047 my $result = $alias_to_property_of{$name};
7048 return $result if defined $result;
7050 # Convert the input to standard form.
7051 my $standard_name = standardize($name);
7053 $result = $alias_to_property_of{$standard_name};
7054 return unless defined $result; # Don't cache undefs
7056 # Cache the result before returning it.
7057 $alias_to_property_of{$name} = $result;
7062 main::setup_package();
7065 # A pointer to the map table object for this property
7066 main::set_access('map', \%map);
7069 # The property's full name. This is a duplicate of the copy kept in the
7070 # map table, but is needed because stringify needs it during
7071 # construction of the map table, and then would have a chicken before egg
7073 main::set_access('full_name', \%full_name, 'r');
7076 # This hash will contain as keys, all the aliases of any match tables
7077 # attached to this property, and as values, the pointers to their
7078 # respective tables. This allows quick look-up of a table from any of its
7080 main::set_access('table_ref', \%table_ref);
7083 # The type of the property, $ENUM, $BINARY, etc
7084 main::set_access('type', \%type, 'r');
7087 # The filename where the map table will go (if actually written).
7088 # Normally defaulted, but can be overridden.
7089 main::set_access('file', \%file, 'r', 's');
7092 # The directory where the map table will go (if actually written).
7093 # Normally defaulted, but can be overridden.
7094 main::set_access('directory', \%directory, 's');
7096 my %pseudo_map_type;
7097 # This is used to affect the calculation of the map types for all the
7098 # ranges in the table. It should be set to one of the values that signify
7099 # to alter the calculation.
7100 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7102 my %has_only_code_point_maps;
7103 # A boolean used to help in computing the type of data in the map table.
7104 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7107 # A list of the first few distinct mappings this property has. This is
7108 # used to disambiguate between binary and enum property types, so don't
7109 # have to keep more than three.
7110 main::set_access('unique_maps', \%unique_maps);
7113 # The only required parameter is the positionally first, name. All
7114 # other parameters are key => value pairs. See the documentation just
7115 # above for the meanings of the ones not passed directly on to the map
7116 # table constructor.
7119 my $name = shift || "";
7121 my $self = property_ref($name);
7122 if (defined $self) {
7123 my $options_string = join ", ", @_;
7124 $options_string = ". Ignoring options $options_string" if $options_string;
7125 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7131 $self = bless \do { my $anonymous_scalar }, $class;
7132 my $addr = do { no overloading; pack 'J', $self; };
7134 $directory{$addr} = delete $args{'Directory'};
7135 $file{$addr} = delete $args{'File'};
7136 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7137 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7138 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7139 # Rest of parameters passed on.
7141 $has_only_code_point_maps{$addr} = 1;
7142 $table_ref{$addr} = { };
7143 $unique_maps{$addr} = { };
7145 $map{$addr} = Map_Table->new($name,
7146 Full_Name => $full_name{$addr},
7147 _Alias_Hash => \%alias_to_property_of,
7153 # See this program's beginning comment block about overloading the copy
7154 # constructor. Few operations are defined on properties, but a couple are
7155 # useful. It is safe to take the inverse of a property, and to remove a
7156 # single code point from it.
7159 qw("") => "_operator_stringify",
7160 "." => \&main::_operator_dot,
7161 '==' => \&main::_operator_equal,
7162 '!=' => \&main::_operator_not_equal,
7163 '=' => sub { return shift },
7164 '-=' => "_minus_and_equal",
7167 sub _operator_stringify {
7168 return "Property '" . shift->full_name . "'";
7171 sub _minus_and_equal {
7172 # Remove a single code point from the map table of a property.
7176 my $reversed = shift;
7177 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7180 Carp::my_carp_bug("Can't cope with a "
7182 . " argument to '-='. Subtraction ignored.");
7185 elsif ($reversed) { # Shouldnt happen in a -=, but just in case
7186 Carp::my_carp_bug("Can't cope with a "
7188 . " being the first parameter in a '-='. Subtraction ignored.");
7193 $map{pack 'J', $self}->delete_range($other, $other);
7198 sub add_match_table {
7199 # Add a new match table for this property, with name given by the
7200 # parameter. It returns a pointer to the table.
7206 my $addr = do { no overloading; pack 'J', $self; };
7208 my $table = $table_ref{$addr}{$name};
7209 my $standard_name = main::standardize($name);
7211 || (defined ($table = $table_ref{$addr}{$standard_name})))
7213 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7214 $table_ref{$addr}{$name} = $table;
7219 # See if this is a perl extension, if not passed in.
7220 my $perl_extension = delete $args{'Perl_Extension'};
7222 = $self->perl_extension if ! defined $perl_extension;
7224 $table = Match_Table->new(
7226 Perl_Extension => $perl_extension,
7227 _Alias_Hash => $table_ref{$addr},
7230 # gets property's status by default
7231 Status => $self->status,
7232 _Status_Info => $self->status_info,
7234 Internal_Only_Warning => 1); # Override any
7236 return unless defined $table;
7239 # Save the names for quick look up
7240 $table_ref{$addr}{$standard_name} = $table;
7241 $table_ref{$addr}{$name} = $table;
7243 # Perhaps we can figure out the type of this property based on the
7244 # fact of adding this match table. First, string properties don't
7245 # have match tables; second, a binary property can't have 3 match
7247 if ($type{$addr} == $UNKNOWN) {
7248 $type{$addr} = $NON_STRING;
7250 elsif ($type{$addr} == $STRING) {
7251 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7252 $type{$addr} = $NON_STRING;
7254 elsif ($type{$addr} != $ENUM) {
7255 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7256 && $type{$addr} == $BINARY)
7258 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7259 $type{$addr} = $ENUM;
7267 # Return a pointer to the match table (with name given by the
7268 # parameter) associated with this property; undef if none.
7272 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7274 my $addr = do { no overloading; pack 'J', $self; };
7276 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7278 # If quick look-up failed, try again using the standard form of the
7279 # input name. If that succeeds, cache the result before returning so
7280 # won't have to standardize this input name again.
7281 my $standard_name = main::standardize($name);
7282 return unless defined $table_ref{$addr}{$standard_name};
7284 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7285 return $table_ref{$addr}{$name};
7289 # Return a list of pointers to all the match tables attached to this
7293 return main::uniques(values %{$table_ref{pack 'J', shift}});
7297 # Returns the directory the map table for this property should be
7298 # output in. If a specific directory has been specified, that has
7299 # priority; 'undef' is returned if the type isn't defined;
7300 # or $map_directory for everything else.
7302 my $addr = do { no overloading; pack 'J', shift; };
7304 return $directory{$addr} if defined $directory{$addr};
7305 return undef if $type{$addr} == $UNKNOWN;
7306 return $map_directory;
7310 # Return the name that is used to both:
7311 # 1) Name the file that the map table is written to.
7312 # 2) The name of swash related stuff inside that file.
7313 # The reason for this is that the Perl core historically has used
7314 # certain names that aren't the same as the Unicode property names.
7315 # To continue using these, $file is hard-coded in this file for those,
7316 # but otherwise the standard name is used. This is different from the
7317 # external_name, so that the rest of the files, like in lib can use
7318 # the standard name always, without regard to historical precedent.
7321 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7323 my $addr = do { no overloading; pack 'J', $self; };
7325 return $file{$addr} if defined $file{$addr};
7326 return $map{$addr}->external_name;
7329 sub to_create_match_tables {
7330 # Returns a boolean as to whether or not match tables should be
7331 # created for this property.
7334 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7336 # The whole point of this pseudo property is match tables.
7337 return 1 if $self == $perl;
7339 my $addr = do { no overloading; pack 'J', $self; };
7341 # Don't generate tables of code points that match the property values
7342 # of a string property. Such a list would most likely have many
7343 # property values, each with just one or very few code points mapping
7345 return 0 if $type{$addr} == $STRING;
7347 # Don't generate anything for unimplemented properties.
7348 return 0 if grep { $self->complete_name eq $_ }
7349 @unimplemented_properties;
7354 sub property_add_or_replace_non_nulls {
7355 # This adds the mappings in the property $other to $self. Non-null
7356 # mappings from $other override those in $self. It essentially merges
7357 # the two properties, with the second having priority except for null
7362 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7364 if (! $other->isa(__PACKAGE__)) {
7365 Carp::my_carp_bug("$other should be a "
7374 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7378 # Set the type of the property. Mostly this is figured out by the
7379 # data in the table. But this is used to set it explicitly. The
7380 # reason it is not a standard accessor is that when setting a binary
7381 # property, we need to make sure that all the true/false aliases are
7382 # present, as they were omitted in early Unicode releases.
7386 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7388 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7389 Carp::my_carp("Unrecognized type '$type'. Type not set");
7393 { no overloading; $type{pack 'J', $self} = $type; }
7394 return if $type != $BINARY;
7396 my $yes = $self->table('Y');
7397 $yes = $self->table('Yes') if ! defined $yes;
7398 $yes = $self->add_match_table('Y') if ! defined $yes;
7399 $yes->add_alias('Yes');
7400 $yes->add_alias('T');
7401 $yes->add_alias('True');
7403 my $no = $self->table('N');
7404 $no = $self->table('No') if ! defined $no;
7405 $no = $self->add_match_table('N') if ! defined $no;
7406 $no->add_alias('No');
7407 $no->add_alias('F');
7408 $no->add_alias('False');
7413 # Add a map to the property's map table. This also keeps
7414 # track of the maps so that the property type can be determined from
7418 my $start = shift; # First code point in range
7419 my $end = shift; # Final code point in range
7420 my $map = shift; # What the range maps to.
7421 # Rest of parameters passed on.
7423 my $addr = do { no overloading; pack 'J', $self; };
7425 # If haven't the type of the property, gather information to figure it
7427 if ($type{$addr} == $UNKNOWN) {
7429 # If the map contains an interior blank or dash, or most other
7430 # nonword characters, it will be a string property. This
7431 # heuristic may actually miss some string properties. If so, they
7432 # may need to have explicit set_types called for them. This
7433 # happens in the Unihan properties.
7434 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7435 || $map =~ / [^\w.\/\ -] /x)
7437 $self->set_type($STRING);
7439 # $unique_maps is used for disambiguating between ENUM and
7440 # BINARY later; since we know the property is not going to be
7441 # one of those, no point in keeping the data around
7442 undef $unique_maps{$addr};
7446 # Not necessarily a string. The final decision has to be
7447 # deferred until all the data are in. We keep track of if all
7448 # the values are code points for that eventual decision.
7449 $has_only_code_point_maps{$addr} &=
7450 $map =~ / ^ $code_point_re $/x;
7452 # For the purposes of disambiguating between binary and other
7453 # enumerations at the end, we keep track of the first three
7454 # distinct property values. Once we get to three, we know
7455 # it's not going to be binary, so no need to track more.
7456 if (scalar keys %{$unique_maps{$addr}} < 3) {
7457 $unique_maps{$addr}{main::standardize($map)} = 1;
7462 # Add the mapping by calling our map table's method
7463 return $map{$addr}->add_map($start, $end, $map, @_);
7467 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7468 # should be called after the property is mostly filled with its maps.
7469 # We have been keeping track of what the property values have been,
7470 # and now have the necessary information to figure out the type.
7473 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7475 my $addr = do { no overloading; pack 'J', $self; };
7477 my $type = $type{$addr};
7479 # If already have figured these out, no need to do so again, but we do
7480 # a double check on ENUMS to make sure that a string property hasn't
7481 # improperly been classified as an ENUM, so continue on with those.
7482 return if $type == $STRING || $type == $BINARY;
7484 # If every map is to a code point, is a string property.
7485 if ($type == $UNKNOWN
7486 && ($has_only_code_point_maps{$addr}
7487 || (defined $map{$addr}->default_map
7488 && $map{$addr}->default_map eq "")))
7490 $self->set_type($STRING);
7494 # Otherwise, it is to some sort of enumeration. (The case where
7495 # it is a Unicode miscellaneous property, and treated like a
7496 # string in this program is handled in add_map()). Distinguish
7497 # between binary and some other enumeration type. Of course, if
7498 # there are more than two values, it's not binary. But more
7499 # subtle is the test that the default mapping is defined means it
7500 # isn't binary. This in fact may change in the future if Unicode
7501 # changes the way its data is structured. But so far, no binary
7502 # properties ever have @missing lines for them, so the default map
7503 # isn't defined for them. The few properties that are two-valued
7504 # and aren't considered binary have the default map defined
7505 # starting in Unicode 5.0, when the @missing lines appeared; and
7506 # this program has special code to put in a default map for them
7507 # for earlier than 5.0 releases.
7509 || scalar keys %{$unique_maps{$addr}} > 2
7510 || defined $self->default_map)
7512 my $tables = $self->tables;
7513 my $count = $self->count;
7514 if ($verbosity && $count > 500 && $tables/$count > .1) {
7515 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7517 $self->set_type($ENUM);
7520 $self->set_type($BINARY);
7523 undef $unique_maps{$addr}; # Garbage collect
7527 # Most of the accessors for a property actually apply to its map table.
7528 # Setup up accessor functions for those, referring to %map
7577 # 'property' above is for symmetry, so that one can take
7578 # the property of a property and get itself, and so don't
7579 # have to distinguish between properties and tables in
7587 return $map{pack 'J', $self}->$sub(@_);
7597 # Returns lines of the input joined together, so that they can be folded
7599 # This causes continuation lines to be joined together into one long line
7600 # for folding. A continuation line is any line that doesn't begin with a
7601 # space or "\b" (the latter is stripped from the output). This is so
7602 # lines can be be in a HERE document so as to fit nicely in the terminal
7603 # width, but be joined together in one long line, and then folded with
7604 # indents, '#' prefixes, etc, properly handled.
7605 # A blank separates the joined lines except if there is a break; an extra
7606 # blank is inserted after a period ending a line.
7608 # Intialize the return with the first line.
7609 my ($return, @lines) = split "\n", shift;
7611 # If the first line is null, it was an empty line, add the \n back in
7612 $return = "\n" if $return eq "";
7614 # Now join the remainder of the physical lines.
7615 for my $line (@lines) {
7617 # An empty line means wanted a blank line, so add two \n's to get that
7618 # effect, and go to the next line.
7619 if (length $line == 0) {
7624 # Look at the last character of what we have so far.
7625 my $previous_char = substr($return, -1, 1);
7627 # And at the next char to be output.
7628 my $next_char = substr($line, 0, 1);
7630 if ($previous_char ne "\n") {
7632 # Here didn't end wth a nl. If the next char a blank or \b, it
7633 # means that here there is a break anyway. So add a nl to the
7635 if ($next_char eq " " || $next_char eq "\b") {
7636 $previous_char = "\n";
7637 $return .= $previous_char;
7640 # Add an extra space after periods.
7641 $return .= " " if $previous_char eq '.';
7644 # Here $previous_char is still the latest character to be output. If
7645 # it isn't a nl, it means that the next line is to be a continuation
7646 # line, with a blank inserted between them.
7647 $return .= " " if $previous_char ne "\n";
7650 substr($line, 0, 1) = "" if $next_char eq "\b";
7652 # And append this next line.
7659 sub simple_fold($;$$$) {
7660 # Returns a string of the input (string or an array of strings) folded
7661 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7663 # This is tailored for the kind of text written by this program,
7664 # especially the pod file, which can have very long names with
7665 # underscores in the middle, or words like AbcDefgHij.... We allow
7666 # breaking in the middle of such constructs if the line won't fit
7667 # otherwise. The break in such cases will come either just after an
7668 # underscore, or just before one of the Capital letters.
7670 local $to_trace = 0 if main::DEBUG;
7673 my $prefix = shift; # Optional string to prepend to each output
7675 $prefix = "" unless defined $prefix;
7677 my $hanging_indent = shift; # Optional number of spaces to indent
7678 # continuation lines
7679 $hanging_indent = 0 unless $hanging_indent;
7681 my $right_margin = shift; # Optional number of spaces to narrow the
7683 $right_margin = 0 unless defined $right_margin;
7685 # Call carp with the 'nofold' option to avoid it from trying to call us
7687 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7689 # The space available doesn't include what's automatically prepended
7690 # to each line, or what's reserved on the right.
7691 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7692 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7694 if (DEBUG && $hanging_indent >= $max) {
7695 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7696 $hanging_indent = 0;
7699 # First, split into the current physical lines.
7701 if (ref $line) { # Better be an array, because not bothering to
7703 foreach my $line (@{$line}) {
7704 push @line, split /\n/, $line;
7708 @line = split /\n/, $line;
7711 #local $to_trace = 1 if main::DEBUG;
7712 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7714 # Look at each current physical line.
7715 for (my $i = 0; $i < @line; $i++) {
7716 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7717 #local $to_trace = 1 if main::DEBUG;
7718 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7720 # Remove prefix, because will be added back anyway, don't want
7722 $line[$i] =~ s/^$prefix//;
7724 # Remove trailing space
7725 $line[$i] =~ s/\s+\Z//;
7727 # If the line is too long, fold it.
7728 if (length $line[$i] > $max) {
7731 # Here needs to fold. Save the leading space in the line for
7733 $line[$i] =~ /^ ( \s* )/x;
7734 my $leading_space = $1;
7735 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7737 # If character at final permissible position is white space,
7738 # fold there, which will delete that white space
7739 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7740 $remainder = substr($line[$i], $max);
7741 $line[$i] = substr($line[$i], 0, $max - 1);
7745 # Otherwise fold at an acceptable break char closest to
7746 # the max length. Look at just the maximal initial
7747 # segment of the line
7748 my $segment = substr($line[$i], 0, $max - 1);
7750 /^ ( .{$hanging_indent} # Don't look before the
7752 \ * # Don't look in leading
7753 # blanks past the indent
7754 [^ ] .* # Find the right-most
7755 (?: # acceptable break:
7756 [ \s = ] # space or equal
7757 | - (?! [.0-9] ) # or non-unary minus.
7758 ) # $1 includes the character
7761 # Split into the initial part that fits, and remaining
7763 $remainder = substr($line[$i], length $1);
7765 trace $line[$i] if DEBUG && $to_trace;
7766 trace $remainder if DEBUG && $to_trace;
7769 # If didn't find a good breaking spot, see if there is a
7770 # not-so-good breaking spot. These are just after
7771 # underscores or where the case changes from lower to
7772 # upper. Use \a as a soft hyphen, but give up
7773 # and don't break the line if there is actually a \a
7774 # already in the input. We use an ascii character for the
7775 # soft-hyphen to avoid any attempt by miniperl to try to
7776 # access the files that this program is creating.
7777 elsif ($segment !~ /\a/
7778 && ($segment =~ s/_/_\a/g
7779 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7781 # Here were able to find at least one place to insert
7782 # our substitute soft hyphen. Find the right-most one
7783 # and replace it by a real hyphen.
7784 trace $segment if DEBUG && $to_trace;
7786 rindex($segment, "\a"),
7789 # Then remove the soft hyphen substitutes.
7790 $segment =~ s/\a//g;
7791 trace $segment if DEBUG && $to_trace;
7793 # And split into the initial part that fits, and
7794 # remainder of the line
7795 my $pos = rindex($segment, '-');
7796 $remainder = substr($line[$i], $pos);
7797 trace $remainder if DEBUG && $to_trace;
7798 $line[$i] = substr($segment, 0, $pos + 1);
7802 # Here we know if we can fold or not. If we can, $remainder
7803 # is what remains to be processed in the next iteration.
7804 if (defined $remainder) {
7805 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7807 # Insert the folded remainder of the line as a new element
7808 # of the array. (It may still be too long, but we will
7809 # deal with that next time through the loop.) Omit any
7810 # leading space in the remainder.
7811 $remainder =~ s/^\s+//;
7812 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7814 # But then indent by whichever is larger of:
7815 # 1) the leading space on the input line;
7816 # 2) the hanging indent.
7817 # This preserves indentation in the original line.
7818 my $lead = ($leading_space)
7819 ? length $leading_space
7821 $lead = max($lead, $hanging_indent);
7822 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7826 # Ready to output the line. Get rid of any trailing space
7827 # And prefix by the required $prefix passed in.
7828 $line[$i] =~ s/\s+$//;
7829 $line[$i] = "$prefix$line[$i]\n";
7830 } # End of looping through all the lines.
7832 return join "", @line;
7835 sub property_ref { # Returns a reference to a property object.
7836 return Property::property_ref(@_);
7839 sub force_unlink ($) {
7840 my $filename = shift;
7841 return unless file_exists($filename);
7842 return if CORE::unlink($filename);
7844 # We might need write permission
7845 chmod 0777, $filename;
7846 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
7851 # Given a filename and references to arrays of lines, write the lines of
7852 # each array to the file
7853 # Filename can be given as an arrayref of directory names
7855 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
7858 my $use_utf8 = shift;
7860 # Get into a single string if an array, and get rid of, in Unix terms, any
7862 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7863 $file = File::Spec->canonpath($file);
7865 # If has directories, make sure that they all exist
7866 (undef, my $directories, undef) = File::Spec->splitpath($file);
7867 File::Path::mkpath($directories) if $directories && ! -d $directories;
7869 push @files_actually_output, $file;
7871 force_unlink ($file);
7874 if (not open $OUT, ">", $file) {
7875 Carp::my_carp("can't open $file for output. Skipping this file: $!");
7879 # $output_names outputs the utf8 of each character as well
7880 binmode $OUT, ":utf8" if $use_utf8;
7882 while (defined (my $lines_ref = shift)) {
7883 unless (@$lines_ref) {
7884 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
7887 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
7889 close $OUT or die Carp::my_carp("close '$file' failed: $!");
7891 print "$file written.\n" if $verbosity >= $VERBOSE;
7897 sub Standardize($) {
7898 # This converts the input name string into a standardized equivalent to
7902 unless (defined $name) {
7903 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
7907 # Remove any leading or trailing white space
7911 # Convert interior white space and hypens into underscores.
7912 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7914 # Capitalize the letter following an underscore, and convert a sequence of
7915 # multiple underscores to a single one
7916 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7918 # And capitalize the first letter, but not for the special cjk ones.
7919 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7923 sub standardize ($) {
7924 # Returns a lower-cased standardized name, without underscores. This form
7925 # is chosen so that it can distinguish between any real versus superficial
7926 # Unicode name differences. It relies on the fact that Unicode doesn't
7927 # have interior underscores, white space, nor dashes in any
7928 # stricter-matched name. It should not be used on Unicode code point
7929 # names (the Name property), as they mostly, but not always follow these
7932 my $name = Standardize(shift);
7933 return if !defined $name;
7935 $name =~ s/ (?<= .) _ (?= . ) //xg;
7941 my $indent_increment = " " x 2;
7944 $main::simple_dumper_nesting = 0;
7947 # Like Simple Data::Dumper. Good enough for our needs. We can't use
7948 # the real thing as we have to run under miniperl.
7950 # It is designed so that on input it is at the beginning of a line,
7951 # and the final thing output in any call is a trailing ",\n".
7955 $indent = "" if ! defined $indent;
7957 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7959 # nesting level is localized, so that as the call stack pops, it goes
7960 # back to the prior value.
7961 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7962 undef %already_output if $main::simple_dumper_nesting == 0;
7963 $main::simple_dumper_nesting++;
7964 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7966 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7968 # Determine the indent for recursive calls.
7969 my $next_indent = $indent . $indent_increment;
7974 # Dump of scalar: just output it in quotes if not a number. To do
7975 # so we must escape certain characters, and therefore need to
7976 # operate on a copy to avoid changing the original
7978 $copy = $UNDEF unless defined $copy;
7980 # Quote non-numbers (numbers also have optional leading '-' and
7982 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7984 # Escape apostrophe and backslash
7985 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7988 $output = "$indent$copy,\n";
7992 # Keep track of cycles in the input, and refuse to infinitely loop
7993 my $addr = do { no overloading; pack 'J', $item; };
7994 if (defined $already_output{$addr}) {
7995 return "${indent}ALREADY OUTPUT: $item\n";
7997 $already_output{$addr} = $item;
7999 if (ref $item eq 'ARRAY') {
8002 if ($main::simple_dumper_nesting > 1) {
8004 $using_brackets = 1;
8007 $using_brackets = 0;
8010 # If the array is empty, put the closing bracket on the same
8011 # line. Otherwise, recursively add each array element
8017 for (my $i = 0; $i < @$item; $i++) {
8019 # Indent array elements one level
8020 $output .= &simple_dumper($item->[$i], $next_indent);
8021 $output =~ s/\n$//; # Remove trailing nl so as to
8022 $output .= " # [$i]\n"; # add a comment giving the
8025 $output .= $indent; # Indent closing ']' to orig level
8027 $output .= ']' if $using_brackets;
8030 elsif (ref $item eq 'HASH') {
8035 # No surrounding braces at top level
8037 if ($main::simple_dumper_nesting > 1) {
8040 $body_indent = $next_indent;
8041 $next_indent .= $indent_increment;
8046 $body_indent = $indent;
8050 # Output hashes sorted alphabetically instead of apparently
8051 # random. Use caseless alphabetic sort
8052 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8054 if ($is_first_line) {
8058 $output .= "$body_indent";
8061 # The key must be a scalar, but this recursive call quotes
8063 $output .= &simple_dumper($key);
8065 # And change the trailing comma and nl to the hash fat
8066 # comma for clarity, and so the value can be on the same
8068 $output =~ s/,\n$/ => /;
8070 # Recursively call to get the value's dump.
8071 my $next = &simple_dumper($item->{$key}, $next_indent);
8073 # If the value is all on one line, remove its indent, so
8074 # will follow the => immediately. If it takes more than
8075 # one line, start it on a new line.
8076 if ($next !~ /\n.*\n/) {
8085 $output .= "$indent},\n" if $using_braces;
8087 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8088 $output = $indent . ref($item) . "\n";
8089 # XXX see if blessed
8091 elsif ($item->can('dump')) {
8093 # By convention in this program, objects furnish a 'dump'
8094 # method. Since not doing any output at this level, just pass
8095 # on the input indent
8096 $output = $item->dump($indent);
8099 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8106 sub dump_inside_out {
8107 # Dump inside-out hashes in an object's state by converting them to a
8108 # regular hash and then calling simple_dumper on that.
8111 my $fields_ref = shift;
8112 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8114 my $addr = do { no overloading; pack 'J', $object; };
8117 foreach my $key (keys %$fields_ref) {
8118 $hash{$key} = $fields_ref->{$key}{$addr};
8121 return simple_dumper(\%hash, @_);
8125 # Overloaded '.' method that is common to all packages. It uses the
8126 # package's stringify method.
8130 my $reversed = shift;
8131 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8133 $other = "" unless defined $other;
8135 foreach my $which (\$self, \$other) {
8136 next unless ref $$which;
8137 if ($$which->can('_operator_stringify')) {
8138 $$which = $$which->_operator_stringify;
8141 my $ref = ref $$which;
8142 my $addr = do { no overloading; pack 'J', $$which; };
8143 $$which = "$ref ($addr)";
8151 sub _operator_equal {
8152 # Generic overloaded '==' routine. To be equal, they must be the exact
8158 return 0 unless defined $other;
8159 return 0 unless ref $other;
8161 return $self == $other;
8164 sub _operator_not_equal {
8168 return ! _operator_equal($self, $other);
8171 sub process_PropertyAliases($) {
8172 # This reads in the PropertyAliases.txt file, which contains almost all
8173 # the character properties in Unicode and their equivalent aliases:
8174 # scf ; Simple_Case_Folding ; sfc
8176 # Field 0 is the preferred short name for the property.
8177 # Field 1 is the full name.
8178 # Any succeeding ones are other accepted names.
8181 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8183 # This whole file was non-existent in early releases, so use our own
8185 $file->insert_lines(get_old_property_aliases())
8186 if ! -e 'PropertyAliases.txt';
8188 # Add any cjk properties that may have been defined.
8189 $file->insert_lines(@cjk_properties);
8191 while ($file->next_line) {
8193 my @data = split /\s*;\s*/;
8195 my $full = $data[1];
8197 my $this = Property->new($data[0], Full_Name => $full);
8199 # Start looking for more aliases after these two.
8200 for my $i (2 .. @data - 1) {
8201 $this->add_alias($data[$i]);
8208 sub finish_property_setup {
8209 # Finishes setting up after PropertyAliases.
8212 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8214 # This entry was missing from this file in earlier Unicode versions
8215 if (-e 'Jamo.txt') {
8216 my $jsn = property_ref('JSN');
8217 if (! defined $jsn) {
8218 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8222 # This entry is still missing as of 5.2, perhaps because no short name for
8224 if (-e 'NameAliases.txt') {
8225 my $aliases = property_ref('Name_Alias');
8226 if (! defined $aliases) {
8227 $aliases = Property->new('Name_Alias');
8231 # These are used so much, that we set globals for them.
8232 $gc = property_ref('General_Category');
8233 $block = property_ref('Block');
8235 # Perl adds this alias.
8236 $gc->add_alias('Category');
8238 # For backwards compatibility, these property files have particular names.
8239 my $upper = property_ref('Uppercase_Mapping');
8240 $upper->set_core_access('uc()');
8241 $upper->set_file('Upper'); # This is what utf8.c calls it
8243 my $lower = property_ref('Lowercase_Mapping');
8244 $lower->set_core_access('lc()');
8245 $lower->set_file('Lower');
8247 my $title = property_ref('Titlecase_Mapping');
8248 $title->set_core_access('ucfirst()');
8249 $title->set_file('Title');
8251 my $fold = property_ref('Case_Folding');
8252 $fold->set_file('Fold') if defined $fold;
8254 # utf8.c can't currently cope with non range-size-1 for these, and even if
8255 # it were changed to do so, someone else may be using them, expecting the
8257 foreach my $property (qw {
8264 property_ref($property)->set_range_size_1(1);
8267 # These two properties aren't actually used in the core, but unfortunately
8268 # the names just above that are in the core interfere with these, so
8269 # choose different names. These aren't a problem unless the map tables
8270 # for these files get written out.
8271 my $lowercase = property_ref('Lowercase');
8272 $lowercase->set_file('IsLower') if defined $lowercase;
8273 my $uppercase = property_ref('Uppercase');
8274 $uppercase->set_file('IsUpper') if defined $uppercase;
8276 # Set up the hard-coded default mappings, but only on properties defined
8278 foreach my $property (keys %default_mapping) {
8279 my $property_object = property_ref($property);
8280 next if ! defined $property_object;
8281 my $default_map = $default_mapping{$property};
8282 $property_object->set_default_map($default_map);
8284 # A map of <code point> implies the property is string.
8285 if ($property_object->type == $UNKNOWN
8286 && $default_map eq $CODE_POINT)
8288 $property_object->set_type($STRING);
8292 # The following use the Multi_Default class to create objects for
8295 # Bidi class has a complicated default, but the derived file takes care of
8296 # the complications, leaving just 'L'.
8297 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8298 property_ref('Bidi_Class')->set_default_map('L');
8303 # The derived file was introduced in 3.1.1. The values below are
8304 # taken from table 3-8, TUS 3.0
8306 'my $default = Range_List->new;
8307 $default->add_range(0x0590, 0x05FF);
8308 $default->add_range(0xFB1D, 0xFB4F);'
8311 # The defaults apply only to unassigned characters
8312 $default_R .= '$gc->table("Unassigned") & $default;';
8314 if ($v_version lt v3.0.0) {
8315 $default = Multi_Default->new(R => $default_R, 'L');
8319 # AL apparently not introduced until 3.0: TUS 2.x references are
8320 # not on-line to check it out
8322 'my $default = Range_List->new;
8323 $default->add_range(0x0600, 0x07BF);
8324 $default->add_range(0xFB50, 0xFDFF);
8325 $default->add_range(0xFE70, 0xFEFF);'
8328 # Non-character code points introduced in this release; aren't AL
8329 if ($v_version ge 3.1.0) {
8330 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8332 $default_AL .= '$gc->table("Unassigned") & $default';
8333 $default = Multi_Default->new(AL => $default_AL,
8337 property_ref('Bidi_Class')->set_default_map($default);
8340 # Joining type has a complicated default, but the derived file takes care
8341 # of the complications, leaving just 'U' (or Non_Joining), except the file
8343 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8344 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8345 property_ref('Joining_Type')->set_default_map('Non_Joining');
8349 # Otherwise, there are not one, but two possibilities for the
8350 # missing defaults: T and U.
8351 # The missing defaults that evaluate to T are given by:
8352 # T = Mn + Cf - ZWNJ - ZWJ
8353 # where Mn and Cf are the general category values. In other words,
8354 # any non-spacing mark or any format control character, except
8355 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8356 # WIDTH JOINER (joining type C).
8357 my $default = Multi_Default->new(
8358 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8360 property_ref('Joining_Type')->set_default_map($default);
8364 # Line break has a complicated default in early releases. It is 'Unknown'
8365 # for non-assigned code points; 'AL' for assigned.
8366 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8367 my $lb = property_ref('Line_Break');
8368 if ($v_version gt 3.2.0) {
8369 $lb->set_default_map('Unknown');
8372 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8374 $lb->set_default_map($default);
8377 # If has the URS property, make sure that the standard aliases are in
8378 # it, since not in the input tables in some versions.
8379 my $urs = property_ref('Unicode_Radical_Stroke');
8381 $urs->add_alias('cjkRSUnicode');
8382 $urs->add_alias('kRSUnicode');
8388 sub get_old_property_aliases() {
8389 # Returns what would be in PropertyAliases.txt if it existed in very old
8390 # versions of Unicode. It was derived from the one in 3.2, and pared
8391 # down based on the data that was actually in the older releases.
8392 # An attempt was made to use the existence of files to mean inclusion or
8393 # not of various aliases, but if this was not sufficient, using version
8394 # numbers was resorted to.
8398 # These are to be used in all versions (though some are constructed by
8399 # this program if missing)
8400 push @return, split /\n/, <<'END';
8402 Bidi_M ; Bidi_Mirrored
8404 ccc ; Canonical_Combining_Class
8405 dm ; Decomposition_Mapping
8406 dt ; Decomposition_Type
8407 gc ; General_Category
8409 lc ; Lowercase_Mapping
8411 na1 ; Unicode_1_Name
8414 sfc ; Simple_Case_Folding
8415 slc ; Simple_Lowercase_Mapping
8416 stc ; Simple_Titlecase_Mapping
8417 suc ; Simple_Uppercase_Mapping
8418 tc ; Titlecase_Mapping
8419 uc ; Uppercase_Mapping
8422 if (-e 'Blocks.txt') {
8423 push @return, "blk ; Block\n";
8425 if (-e 'ArabicShaping.txt') {
8426 push @return, split /\n/, <<'END';
8431 if (-e 'PropList.txt') {
8433 # This first set is in the original old-style proplist.
8434 push @return, split /\n/, <<'END';
8436 Bidi_C ; Bidi_Control
8444 Join_C ; Join_Control
8446 QMark ; Quotation_Mark
8447 Term ; Terminal_Punctuation
8448 WSpace ; White_Space
8450 # The next sets were added later
8451 if ($v_version ge v3.0.0) {
8452 push @return, split /\n/, <<'END';
8457 if ($v_version ge v3.0.1) {
8458 push @return, split /\n/, <<'END';
8459 NChar ; Noncharacter_Code_Point
8462 # The next sets were added in the new-style
8463 if ($v_version ge v3.1.0) {
8464 push @return, split /\n/, <<'END';
8465 OAlpha ; Other_Alphabetic
8466 OLower ; Other_Lowercase
8468 OUpper ; Other_Uppercase
8471 if ($v_version ge v3.1.1) {
8472 push @return, "AHex ; ASCII_Hex_Digit\n";
8475 if (-e 'EastAsianWidth.txt') {
8476 push @return, "ea ; East_Asian_Width\n";
8478 if (-e 'CompositionExclusions.txt') {
8479 push @return, "CE ; Composition_Exclusion\n";
8481 if (-e 'LineBreak.txt') {
8482 push @return, "lb ; Line_Break\n";
8484 if (-e 'BidiMirroring.txt') {
8485 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8487 if (-e 'Scripts.txt') {
8488 push @return, "sc ; Script\n";
8490 if (-e 'DNormalizationProps.txt') {
8491 push @return, split /\n/, <<'END';
8492 Comp_Ex ; Full_Composition_Exclusion
8493 FC_NFKC ; FC_NFKC_Closure
8494 NFC_QC ; NFC_Quick_Check
8495 NFD_QC ; NFD_Quick_Check
8496 NFKC_QC ; NFKC_Quick_Check
8497 NFKD_QC ; NFKD_Quick_Check
8498 XO_NFC ; Expands_On_NFC
8499 XO_NFD ; Expands_On_NFD
8500 XO_NFKC ; Expands_On_NFKC
8501 XO_NFKD ; Expands_On_NFKD
8504 if (-e 'DCoreProperties.txt') {
8505 push @return, split /\n/, <<'END';
8510 # These can also appear in some versions of PropList.txt
8511 push @return, "Lower ; Lowercase\n"
8512 unless grep { $_ =~ /^Lower\b/} @return;
8513 push @return, "Upper ; Uppercase\n"
8514 unless grep { $_ =~ /^Upper\b/} @return;
8517 # This flag requires the DAge.txt file to be copied into the directory.
8518 if (DEBUG && $compare_versions) {
8519 push @return, 'age ; Age';
8525 sub process_PropValueAliases {
8526 # This file contains values that properties look like:
8527 # bc ; AL ; Arabic_Letter
8528 # blk; n/a ; Greek_And_Coptic ; Greek
8530 # Field 0 is the property.
8531 # Field 1 is the short name of a property value or 'n/a' if no
8532 # short name exists;
8533 # Field 2 is the full property value name;
8534 # Any other fields are more synonyms for the property value.
8535 # Purely numeric property values are omitted from the file; as are some
8536 # others, fewer and fewer in later releases
8538 # Entries for the ccc property have an extra field before the
8540 # ccc; 0; NR ; Not_Reordered
8541 # It is the numeric value that the names are synonyms for.
8543 # There are comment entries for values missing from this file:
8544 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8545 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8548 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8550 # This whole file was non-existent in early releases, so use our own
8551 # internal one if necessary.
8552 if (! -e 'PropValueAliases.txt') {
8553 $file->insert_lines(get_old_property_value_aliases());
8556 # Add any explicit cjk values
8557 $file->insert_lines(@cjk_property_values);
8559 # This line is used only for testing the code that checks for name
8560 # conflicts. There is a script Inherited, and when this line is executed
8561 # it causes there to be a name conflict with the 'Inherited' that this
8562 # program generates for this block property value
8563 #$file->insert_lines('blk; n/a; Herited');
8566 # Process each line of the file ...
8567 while ($file->next_line) {
8569 my ($property, @data) = split /\s*;\s*/;
8571 # The full name for the ccc property value is in field 2 of the
8572 # remaining ones; field 1 for all other properties. Swap ccc fields 1
8573 # and 2. (Rightmost splice removes field 2, returning it; left splice
8574 # inserts that into field 1, thus shifting former field 1 to field 2.)
8575 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8577 # If there is no short name, use the full one in element 1
8578 $data[0] = $data[1] if $data[0] eq "n/a";
8580 # Earlier releases had the pseudo property 'qc' that should expand to
8581 # the ones that replace it below.
8582 if ($property eq 'qc') {
8583 if (lc $data[0] eq 'y') {
8584 $file->insert_lines('NFC_QC; Y ; Yes',
8590 elsif (lc $data[0] eq 'n') {
8591 $file->insert_lines('NFC_QC; N ; No',
8597 elsif (lc $data[0] eq 'm') {
8598 $file->insert_lines('NFC_QC; M ; Maybe',
8599 'NFKC_QC; M ; Maybe',
8603 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8608 # The first field is the short name, 2nd is the full one.
8609 my $property_object = property_ref($property);
8610 my $table = $property_object->add_match_table($data[0],
8611 Full_Name => $data[1]);
8613 # Start looking for more aliases after these two.
8614 for my $i (2 .. @data - 1) {
8615 $table->add_alias($data[$i]);
8617 } # End of looping through the file
8619 # As noted in the comments early in the program, it generates tables for
8620 # the default values for all releases, even those for which the concept
8621 # didn't exist at the time. Here we add those if missing.
8622 my $age = property_ref('age');
8623 if (defined $age && ! defined $age->table('Unassigned')) {
8624 $age->add_match_table('Unassigned');
8626 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8627 && ! defined $block->table('No_Block');
8630 # Now set the default mappings of the properties from the file. This is
8631 # done after the loop because a number of properties have only @missings
8632 # entries in the file, and may not show up until the end.
8633 my @defaults = $file->get_missings;
8634 foreach my $default_ref (@defaults) {
8635 my $default = $default_ref->[0];
8636 my $property = property_ref($default_ref->[1]);
8637 $property->set_default_map($default);
8642 sub get_old_property_value_aliases () {
8643 # Returns what would be in PropValueAliases.txt if it existed in very old
8644 # versions of Unicode. It was derived from the one in 3.2, and pared
8645 # down. An attempt was made to use the existence of files to mean
8646 # inclusion or not of various aliases, but if this was not sufficient,
8647 # using version numbers was resorted to.
8649 my @return = split /\n/, <<'END';
8650 bc ; AN ; Arabic_Number
8651 bc ; B ; Paragraph_Separator
8652 bc ; CS ; Common_Separator
8653 bc ; EN ; European_Number
8654 bc ; ES ; European_Separator
8655 bc ; ET ; European_Terminator
8656 bc ; L ; Left_To_Right
8657 bc ; ON ; Other_Neutral
8658 bc ; R ; Right_To_Left
8659 bc ; WS ; White_Space
8661 # The standard combining classes are very much different in v1, so only use
8662 # ones that look right (not checked thoroughly)
8663 ccc; 0; NR ; Not_Reordered
8664 ccc; 1; OV ; Overlay
8666 ccc; 8; KV ; Kana_Voicing
8668 ccc; 202; ATBL ; Attached_Below_Left
8669 ccc; 216; ATAR ; Attached_Above_Right
8670 ccc; 218; BL ; Below_Left
8672 ccc; 222; BR ; Below_Right
8674 ccc; 228; AL ; Above_Left
8676 ccc; 232; AR ; Above_Right
8677 ccc; 234; DA ; Double_Above
8679 dt ; can ; canonical
8693 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8695 gc ; Cn ; Unassigned
8696 gc ; Co ; Private_Use
8697 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8698 gc ; LC ; Cased_Letter # Ll | Lt | Lu
8699 gc ; Ll ; Lowercase_Letter
8700 gc ; Lm ; Modifier_Letter
8701 gc ; Lo ; Other_Letter
8702 gc ; Lu ; Uppercase_Letter
8703 gc ; M ; Mark # Mc | Me | Mn
8704 gc ; Mc ; Spacing_Mark
8705 gc ; Mn ; Nonspacing_Mark
8706 gc ; N ; Number # Nd | Nl | No
8707 gc ; Nd ; Decimal_Number
8708 gc ; No ; Other_Number
8709 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8710 gc ; Pd ; Dash_Punctuation
8711 gc ; Pe ; Close_Punctuation
8712 gc ; Po ; Other_Punctuation
8713 gc ; Ps ; Open_Punctuation
8714 gc ; S ; Symbol # Sc | Sk | Sm | So
8715 gc ; Sc ; Currency_Symbol
8716 gc ; Sm ; Math_Symbol
8717 gc ; So ; Other_Symbol
8718 gc ; Z ; Separator # Zl | Zp | Zs
8719 gc ; Zl ; Line_Separator
8720 gc ; Zp ; Paragraph_Separator
8721 gc ; Zs ; Space_Separator
8729 if (-e 'ArabicShaping.txt') {
8730 push @return, split /\n/, <<'END';
8737 jg ; n/a ; NO_JOINING_GROUP
8745 jt ; C ; Join_Causing
8746 jt ; D ; Dual_Joining
8747 jt ; L ; Left_Joining
8748 jt ; R ; Right_Joining
8749 jt ; U ; Non_Joining
8750 jt ; T ; Transparent
8752 if ($v_version ge v3.0.0) {
8753 push @return, split /\n/, <<'END';
8757 jg ; n/a ; DALATH_RISH
8760 jg ; n/a ; FINAL_SEMKATH
8763 jg ; n/a ; HAMZA_ON_HEH_GOAL
8770 jg ; n/a ; KNOTTED_HEH
8777 jg ; n/a ; REVERSED_PE
8781 jg ; n/a ; SWASH_KAF
8783 jg ; n/a ; TEH_MARBUTA
8786 jg ; n/a ; YEH_BARREE
8787 jg ; n/a ; YEH_WITH_TAIL
8796 if (-e 'EastAsianWidth.txt') {
8797 push @return, split /\n/, <<'END';
8807 if (-e 'LineBreak.txt') {
8808 push @return, split /\n/, <<'END';
8810 lb ; AL ; Alphabetic
8811 lb ; B2 ; Break_Both
8812 lb ; BA ; Break_After
8813 lb ; BB ; Break_Before
8814 lb ; BK ; Mandatory_Break
8815 lb ; CB ; Contingent_Break
8816 lb ; CL ; Close_Punctuation
8817 lb ; CM ; Combining_Mark
8818 lb ; CR ; Carriage_Return
8819 lb ; EX ; Exclamation
8822 lb ; ID ; Ideographic
8823 lb ; IN ; Inseperable
8824 lb ; IS ; Infix_Numeric
8826 lb ; NS ; Nonstarter
8828 lb ; OP ; Open_Punctuation
8829 lb ; PO ; Postfix_Numeric
8830 lb ; PR ; Prefix_Numeric
8832 lb ; SA ; Complex_Context
8835 lb ; SY ; Break_Symbols
8841 if (-e 'DNormalizationProps.txt') {
8842 push @return, split /\n/, <<'END';
8849 if (-e 'Scripts.txt') {
8850 push @return, split /\n/, <<'END';
8852 sc ; Armn ; Armenian
8854 sc ; Bopo ; Bopomofo
8855 sc ; Cans ; Canadian_Aboriginal
8856 sc ; Cher ; Cherokee
8857 sc ; Cyrl ; Cyrillic
8858 sc ; Deva ; Devanagari
8860 sc ; Ethi ; Ethiopic
8861 sc ; Geor ; Georgian
8864 sc ; Gujr ; Gujarati
8865 sc ; Guru ; Gurmukhi
8869 sc ; Hira ; Hiragana
8870 sc ; Ital ; Old_Italic
8871 sc ; Kana ; Katakana
8876 sc ; Mlym ; Malayalam
8877 sc ; Mong ; Mongolian
8881 sc ; Qaai ; Inherited
8895 if ($v_version ge v2.0.0) {
8896 push @return, split /\n/, <<'END';
8900 dt ; vert ; vertical
8905 gc ; Lt ; Titlecase_Letter
8906 gc ; Me ; Enclosing_Mark
8907 gc ; Nl ; Letter_Number
8908 gc ; Pc ; Connector_Punctuation
8909 gc ; Sk ; Modifier_Symbol
8912 if ($v_version ge v2.1.2) {
8913 push @return, "bc ; S ; Segment_Separator\n";
8915 if ($v_version ge v2.1.5) {
8916 push @return, split /\n/, <<'END';
8917 gc ; Pf ; Final_Punctuation
8918 gc ; Pi ; Initial_Punctuation
8921 if ($v_version ge v2.1.8) {
8922 push @return, "ccc; 240; IS ; Iota_Subscript\n";
8925 if ($v_version ge v3.0.0) {
8926 push @return, split /\n/, <<'END';
8927 bc ; AL ; Arabic_Letter
8928 bc ; BN ; Boundary_Neutral
8929 bc ; LRE ; Left_To_Right_Embedding
8930 bc ; LRO ; Left_To_Right_Override
8931 bc ; NSM ; Nonspacing_Mark
8932 bc ; PDF ; Pop_Directional_Format
8933 bc ; RLE ; Right_To_Left_Embedding
8934 bc ; RLO ; Right_To_Left_Override
8936 ccc; 233; DB ; Double_Below
8940 if ($v_version ge v3.1.0) {
8941 push @return, "ccc; 226; R ; Right\n";
8947 sub output_perl_charnames_line ($$) {
8949 # Output the entries in Perl_charnames specially, using 5 digits instead
8950 # of four. This makes the entries a constant length, and simplifies
8951 # charnames.pm which this table is for. Unicode can have 6 digit
8952 # ordinals, but they are all private use or noncharacters which do not
8953 # have names, so won't be in this table.
8955 return sprintf "%05X\t%s\n", $_[0], $_[1];
8959 # This is used to store the range list of all the code points usable when
8960 # the little used $compare_versions feature is enabled.
8961 my $compare_versions_range_list;
8963 sub process_generic_property_file {
8964 # This processes a file containing property mappings and puts them
8965 # into internal map tables. It should be used to handle any property
8966 # files that have mappings from a code point or range thereof to
8967 # something else. This means almost all the UCD .txt files.
8968 # each_line_handlers() should be set to adjust the lines of these
8969 # files, if necessary, to what this routine understands:
8974 # the fields are: "codepoint range ; property; map"
8976 # meaning the codepoints in the range all have the value 'map' under
8978 # Beginning and trailing white space in each field are not signficant.
8979 # Note there is not a trailing semi-colon in the above. A trailing
8980 # semi-colon means the map is a null-string. An omitted map, as
8981 # opposed to a null-string, is assumed to be 'Y', based on Unicode
8982 # table syntax. (This could have been hidden from this routine by
8983 # doing it in the $file object, but that would require parsing of the
8984 # line there, so would have to parse it twice, or change the interface
8985 # to pass this an array. So not done.)
8987 # The map field may begin with a sequence of commands that apply to
8988 # this range. Each such command begins and ends with $CMD_DELIM.
8989 # These are used to indicate, for example, that the mapping for a
8990 # range has a non-default type.
8992 # This loops through the file, calling it's next_line() method, and
8993 # then taking the map and adding it to the property's table.
8994 # Complications arise because any number of properties can be in the
8995 # file, in any order, interspersed in any way. The first time a
8996 # property is seen, it gets information about that property and
8997 # caches it for quick retrieval later. It also normalizes the maps
8998 # so that only one of many synonym is stored. The Unicode input files
8999 # do use some multiple synonyms.
9002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9004 my %property_info; # To keep track of what properties
9005 # have already had entries in the
9006 # current file, and info about each,
9007 # so don't have to recompute.
9008 my $property_name; # property currently being worked on
9009 my $property_type; # and its type
9010 my $previous_property_name = ""; # name from last time through loop
9011 my $property_object; # pointer to the current property's
9013 my $property_addr; # the address of that object
9014 my $default_map; # the string that code points missing
9015 # from the file map to
9016 my $default_table; # For non-string properties, a
9017 # reference to the match table that
9018 # will contain the list of code
9019 # points that map to $default_map.
9021 # Get the next real non-comment line
9023 while ($file->next_line) {
9025 # Default replacement type; means that if parts of the range have
9026 # already been stored in our tables, the new map overrides them if
9027 # they differ more than cosmetically
9028 my $replace = $IF_NOT_EQUIVALENT;
9029 my $map_type; # Default type for the map of this range
9031 #local $to_trace = 1 if main::DEBUG;
9032 trace $_ if main::DEBUG && $to_trace;
9034 # Split the line into components
9035 my ($range, $property_name, $map, @remainder)
9036 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9038 # If more or less on the line than we are expecting, warn and skip
9041 $file->carp_bad_line('Extra fields');
9044 elsif ( ! defined $property_name) {
9045 $file->carp_bad_line('Missing property');
9049 # Examine the range.
9050 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9052 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9056 my $high = (defined $2) ? hex $2 : $low;
9058 # For the very specialized case of comparing two Unicode
9060 if (DEBUG && $compare_versions) {
9061 if ($property_name eq 'Age') {
9063 # Only allow code points at least as old as the version
9065 my $age = pack "C*", split(/\./, $map); # v string
9066 next LINE if $age gt $compare_versions;
9070 # Again, we throw out code points younger than those of
9071 # the specified version. By now, the Age property is
9072 # populated. We use the intersection of each input range
9073 # with this property to find what code points in it are
9074 # valid. To do the intersection, we have to convert the
9075 # Age property map to a Range_list. We only have to do
9077 if (! defined $compare_versions_range_list) {
9078 my $age = property_ref('Age');
9079 if (! -e 'DAge.txt') {
9080 croak "Need to have 'DAge.txt' file to do version comparison";
9082 elsif ($age->count == 0) {
9083 croak "The 'Age' table is empty, but its file exists";
9085 $compare_versions_range_list
9086 = Range_List->new(Initialize => $age);
9089 # An undefined map is always 'Y'
9090 $map = 'Y' if ! defined $map;
9092 # Calculate the intersection of the input range with the
9093 # code points that are known in the specified version
9094 my @ranges = ($compare_versions_range_list
9095 & Range->new($low, $high))->ranges;
9097 # If the intersection is empty, throw away this range
9098 next LINE unless @ranges;
9100 # Only examine the first range this time through the loop.
9101 my $this_range = shift @ranges;
9103 # Put any remaining ranges in the queue to be processed
9104 # later. Note that there is unnecessary work here, as we
9105 # will do the intersection again for each of these ranges
9106 # during some future iteration of the LINE loop, but this
9107 # code is not used in production. The later intersections
9108 # are guaranteed to not splinter, so this will not become
9110 my $line = join ';', $property_name, $map;
9111 foreach my $range (@ranges) {
9112 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9118 # And process the first range, like any other.
9119 $low = $this_range->start;
9120 $high = $this_range->end;
9122 } # End of $compare_versions
9124 # If changing to a new property, get the things constant per
9126 if ($previous_property_name ne $property_name) {
9128 $property_object = property_ref($property_name);
9129 if (! defined $property_object) {
9130 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9133 { no overloading; $property_addr = pack 'J', $property_object; }
9135 # Defer changing names until have a line that is acceptable
9136 # (the 'next' statement above means is unacceptable)
9137 $previous_property_name = $property_name;
9139 # If not the first time for this property, retrieve info about
9141 if (defined ($property_info{$property_addr}{'type'})) {
9142 $property_type = $property_info{$property_addr}{'type'};
9143 $default_map = $property_info{$property_addr}{'default'};
9145 = $property_info{$property_addr}{'pseudo_map_type'};
9147 = $property_info{$property_addr}{'default_table'};
9151 # Here, is the first time for this property. Set up the
9153 $property_type = $property_info{$property_addr}{'type'}
9154 = $property_object->type;
9156 = $property_info{$property_addr}{'pseudo_map_type'}
9157 = $property_object->pseudo_map_type;
9159 # The Unicode files are set up so that if the map is not
9160 # defined, it is a binary property
9161 if (! defined $map && $property_type != $BINARY) {
9162 if ($property_type != $UNKNOWN
9163 && $property_type != $NON_STRING)
9165 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9168 $property_object->set_type($BINARY);
9170 = $property_info{$property_addr}{'type'}
9175 # Get any @missings default for this property. This
9176 # should precede the first entry for the property in the
9177 # input file, and is located in a comment that has been
9178 # stored by the Input_file class until we access it here.
9179 # It's possible that there is more than one such line
9180 # waiting for us; collect them all, and parse
9181 my @missings_list = $file->get_missings
9182 if $file->has_missings_defaults;
9183 foreach my $default_ref (@missings_list) {
9184 my $default = $default_ref->[0];
9185 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9187 # For string properties, the default is just what the
9188 # file says, but non-string properties should already
9189 # have set up a table for the default property value;
9190 # use the table for these, so can resolve synonyms
9191 # later to a single standard one.
9192 if ($property_type == $STRING
9193 || $property_type == $UNKNOWN)
9195 $property_info{$addr}{'missings'} = $default;
9198 $property_info{$addr}{'missings'}
9199 = $property_object->table($default);
9203 # Finished storing all the @missings defaults in the input
9204 # file so far. Get the one for the current property.
9205 my $missings = $property_info{$property_addr}{'missings'};
9207 # But we likely have separately stored what the default
9208 # should be. (This is to accommodate versions of the
9209 # standard where the @missings lines are absent or
9210 # incomplete.) Hopefully the two will match. But check
9212 $default_map = $property_object->default_map;
9214 # If the map is a ref, it means that the default won't be
9215 # processed until later, so undef it, so next few lines
9216 # will redefine it to something that nothing will match
9217 undef $default_map if ref $default_map;
9219 # Create a $default_map if don't have one; maybe a dummy
9220 # that won't match anything.
9221 if (! defined $default_map) {
9223 # Use any @missings line in the file.
9224 if (defined $missings) {
9225 if (ref $missings) {
9226 $default_map = $missings->full_name;
9227 $default_table = $missings;
9230 $default_map = $missings;
9233 # And store it with the property for outside use.
9234 $property_object->set_default_map($default_map);
9238 # Neither an @missings nor a default map. Create
9239 # a dummy one, so won't have to test definedness
9241 $default_map = '_Perl This will never be in a file
9246 # Here, we have $default_map defined, possibly in terms of
9247 # $missings, but maybe not, and possibly is a dummy one.
9248 if (defined $missings) {
9250 # Make sure there is no conflict between the two.
9251 # $missings has priority.
9252 if (ref $missings) {
9254 = $property_object->table($default_map);
9255 if (! defined $default_table
9256 || $default_table != $missings)
9258 if (! defined $default_table) {
9259 $default_table = $UNDEF;
9261 $file->carp_bad_line(<<END
9262 The \@missings line for $property_name in $file says that missings default to
9263 $missings, but we expect it to be $default_table. $missings used.
9266 $default_table = $missings;
9267 $default_map = $missings->full_name;
9269 $property_info{$property_addr}{'default_table'}
9272 elsif ($default_map ne $missings) {
9273 $file->carp_bad_line(<<END
9274 The \@missings line for $property_name in $file says that missings default to
9275 $missings, but we expect it to be $default_map. $missings used.
9278 $default_map = $missings;
9282 $property_info{$property_addr}{'default'}
9285 # If haven't done so already, find the table corresponding
9286 # to this map for non-string properties.
9287 if (! defined $default_table
9288 && $property_type != $STRING
9289 && $property_type != $UNKNOWN)
9291 $default_table = $property_info{$property_addr}
9293 = $property_object->table($default_map);
9295 } # End of is first time for this property
9296 } # End of switching properties.
9298 # Ready to process the line.
9299 # The Unicode files are set up so that if the map is not defined,
9300 # it is a binary property with value 'Y'
9301 if (! defined $map) {
9306 # If the map begins with a special command to us (enclosed in
9307 # delimiters), extract the command(s).
9308 if (substr($map, 0, 1) eq $CMD_DELIM) {
9309 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9311 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9314 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9318 $file->carp_bad_line("Unknown command line: '$1'");
9325 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9328 # Here, we have a map to a particular code point, and the
9329 # default map is to a code point itself. If the range
9330 # includes the particular code point, change that portion of
9331 # the range to the default. This makes sure that in the final
9332 # table only the non-defaults are listed.
9333 my $decimal_map = hex $map;
9334 if ($low <= $decimal_map && $decimal_map <= $high) {
9336 # If the range includes stuff before or after the map
9337 # we're changing, split it and process the split-off parts
9339 if ($low < $decimal_map) {
9340 $file->insert_adjusted_lines(
9341 sprintf("%04X..%04X; %s; %s",
9347 if ($high > $decimal_map) {
9348 $file->insert_adjusted_lines(
9349 sprintf("%04X..%04X; %s; %s",
9355 $low = $high = $decimal_map;
9360 # If we can tell that this is a synonym for the default map, use
9361 # the default one instead.
9362 if ($property_type != $STRING
9363 && $property_type != $UNKNOWN)
9365 my $table = $property_object->table($map);
9366 if (defined $table && $table == $default_table) {
9367 $map = $default_map;
9371 # And figure out the map type if not known.
9372 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9373 if ($map eq "") { # Nulls are always $NULL map type
9375 } # Otherwise, non-strings, and those that don't allow
9376 # $MULTI_CP, and those that aren't multiple code points are
9379 (($property_type != $STRING && $property_type != $UNKNOWN)
9380 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9381 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9386 $map_type = $MULTI_CP;
9390 $property_object->add_map($low, $high,
9393 Replace => $replace);
9394 } # End of loop through file's lines
9400 { # Closure for UnicodeData.txt handling
9402 # This file was the first one in the UCD; its design leads to some
9403 # awkwardness in processing. Here is a sample line:
9404 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9405 # The fields in order are:
9406 my $i = 0; # The code point is in field 0, and is shifted off.
9407 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
9408 my $CATEGORY = $i++; # category (e.g. "Lu")
9409 my $CCC = $i++; # Canonical combining class (e.g. "230")
9410 my $BIDI = $i++; # directional class (e.g. "L")
9411 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9412 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9413 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9414 # Dual-use in this program; see below
9415 my $NUMERIC = $i++; # numeric value
9416 my $MIRRORED = $i++; # ? mirrored
9417 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9418 my $COMMENT = $i++; # iso comment
9419 my $UPPER = $i++; # simple uppercase mapping
9420 my $LOWER = $i++; # simple lowercase mapping
9421 my $TITLE = $i++; # simple titlecase mapping
9422 my $input_field_count = $i;
9424 # This routine in addition outputs these extra fields:
9425 my $DECOMP_TYPE = $i++; # Decomposition type
9427 # These fields are modifications of ones above, and are usually
9428 # suppressed; they must come last, as for speed, the loop upper bound is
9429 # normally set to ignore them
9430 my $NAME = $i++; # This is the strict name field, not the one that
9432 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9433 # by Unicode::Normalize
9434 my $last_field = $i - 1;
9436 # All these are read into an array for each line, with the indices defined
9437 # above. The empty fields in the example line above indicate that the
9438 # value is defaulted. The handler called for each line of the input
9439 # changes these to their defaults.
9441 # Here are the official names of the properties, in a parallel array:
9443 $field_names[$BIDI] = 'Bidi_Class';
9444 $field_names[$CATEGORY] = 'General_Category';
9445 $field_names[$CCC] = 'Canonical_Combining_Class';
9446 $field_names[$CHARNAME] = 'Perl_Charnames';
9447 $field_names[$COMMENT] = 'ISO_Comment';
9448 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9449 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9450 $field_names[$LOWER] = 'Lowercase_Mapping';
9451 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9452 $field_names[$NAME] = 'Name';
9453 $field_names[$NUMERIC] = 'Numeric_Value';
9454 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9455 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9456 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9457 $field_names[$TITLE] = 'Titlecase_Mapping';
9458 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9459 $field_names[$UPPER] = 'Uppercase_Mapping';
9461 # Some of these need a little more explanation:
9462 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9463 # property, but is used in calculating the Numeric_Type. Perl however,
9464 # creates a file from this field, so a Perl property is created from it.
9465 # Similarly, the Other_Digit field is used only for calculating the
9466 # Numeric_Type, and so it can be safely re-used as the place to store
9467 # the value for Numeric_Type; hence it is referred to as
9468 # $NUMERIC_TYPE_OTHER_DIGIT.
9469 # The input field named $PERL_DECOMPOSITION is a combination of both the
9470 # decomposition mapping and its type. Perl creates a file containing
9471 # exactly this field, so it is used for that. The two properties are
9472 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9473 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9474 # output it), as Perl doesn't use it directly.
9475 # The input field named here $CHARNAME is used to construct the
9476 # Perl_Charnames property, which is a combination of the Name property
9477 # (which the input field contains), and the Unicode_1_Name property, and
9478 # others from other files. Since, the strict Name property is not used
9479 # by Perl, this field is used for the table that Perl does use. The
9480 # strict Name property table is usually suppressed (unless the lists are
9481 # changed to output it), so it is accumulated in a separate field,
9482 # $NAME, which to save time is discarded unless the table is actually to
9485 # This file is processed like most in this program. Control is passed to
9486 # process_generic_property_file() which calls filter_UnicodeData_line()
9487 # for each input line. This filter converts the input into line(s) that
9488 # process_generic_property_file() understands. There is also a setup
9489 # routine called before any of the file is processed, and a handler for
9490 # EOF processing, all in this closure.
9492 # A huge speed-up occurred at the cost of some added complexity when these
9493 # routines were altered to buffer the outputs into ranges. Almost all the
9494 # lines of the input file apply to just one code point, and for most
9495 # properties, the map for the next code point up is the same as the
9496 # current one. So instead of creating a line for each property for each
9497 # input line, filter_UnicodeData_line() remembers what the previous map
9498 # of a property was, and doesn't generate a line to pass on until it has
9499 # to, as when the map changes; and that passed-on line encompasses the
9500 # whole contiguous range of code points that have the same map for that
9501 # property. This means a slight amount of extra setup, and having to
9502 # flush these buffers on EOF, testing if the maps have changed, plus
9503 # remembering state information in the closure. But it means a lot less
9504 # real time in not having to change the data base for each property on
9507 # Another complication is that there are already a few ranges designated
9508 # in the input. There are two lines for each, with the same maps except
9509 # the code point and name on each line. This was actually the hardest
9510 # thing to design around. The code points in those ranges may actually
9511 # have real maps not given by these two lines. These maps will either
9512 # be algorthimically determinable, or in the extracted files furnished
9513 # with the UCD. In the event of conflicts between these extracted files,
9514 # and this one, Unicode says that this one prevails. But it shouldn't
9515 # prevail for conflicts that occur in these ranges. The data from the
9516 # extracted files prevails in those cases. So, this program is structured
9517 # so that those files are processed first, storing maps. Then the other
9518 # files are processed, generally overwriting what the extracted files
9519 # stored. But just the range lines in this input file are processed
9520 # without overwriting. This is accomplished by adding a special string to
9521 # the lines output to tell process_generic_property_file() to turn off the
9522 # overwriting for just this one line.
9523 # A similar mechanism is used to tell it that the map is of a non-default
9526 sub setup_UnicodeData { # Called before any lines of the input are read
9528 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9530 # Create a new property specially located that is a combination of the
9531 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9532 # Name_Alias properties. (The final duplicates elements of the
9533 # first.) A comment for it will later be constructed based on the
9534 # actual properties present and used
9535 $perl_charname = Property->new('Perl_Charnames',
9536 Core_Access => '\N{...} and "use charnames"',
9538 Directory => File::Spec->curdir(),
9540 Internal_Only_Warning => 1,
9541 Perl_Extension => 1,
9542 Range_Size_1 => \&output_perl_charnames_line,
9546 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9547 Directory => File::Spec->curdir(),
9548 File => 'Decomposition',
9549 Format => $DECOMP_STRING_FORMAT,
9550 Internal_Only_Warning => 1,
9551 Perl_Extension => 1,
9552 Default_Map => $CODE_POINT,
9554 # normalize.pm can't cope with these
9555 Output_Range_Counts => 0,
9557 # This is a specially formatted table
9558 # explicitly for normalize.pm, which
9559 # is expecting a particular format,
9560 # which means that mappings containing
9561 # multiple code points are in the main
9563 Map_Type => $COMPUTE_NO_MULTI_CP,
9566 $Perl_decomp->add_comment(join_lines(<<END
9567 This mapping is a combination of the Unicode 'Decomposition_Type' and
9568 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9569 identical to the official Unicode 'Decomposition_Mapping' property except for
9571 1) It omits the algorithmically determinable Hangul syllable decompositions,
9572 which normalize.pm handles algorithmically.
9573 2) It contains the decomposition type as well. Non-canonical decompositions
9574 begin with a word in angle brackets, like <super>, which denotes the
9575 compatible decomposition type. If the map does not begin with the <angle
9576 brackets>, the decomposition is canonical.
9580 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9582 Perl_Extension => 1,
9583 File => 'Digit', # Trad. location
9584 Directory => $map_directory,
9588 $Decimal_Digit->add_comment(join_lines(<<END
9589 This file gives the mapping of all code points which represent a single
9590 decimal digit [0-9] to their respective digits. For example, the code point
9591 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9592 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9597 # These properties are not used for generating anything else, and are
9598 # usually not output. By making them last in the list, we can just
9599 # change the high end of the loop downwards to avoid the work of
9600 # generating a table(s) that is/are just going to get thrown away.
9601 if (! property_ref('Decomposition_Mapping')->to_output_map
9602 && ! property_ref('Name')->to_output_map)
9604 $last_field = min($NAME, $DECOMP_MAP) - 1;
9605 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9606 $last_field = $DECOMP_MAP;
9607 } elsif (property_ref('Name')->to_output_map) {
9608 $last_field = $NAME;
9613 my $first_time = 1; # ? Is this the first line of the file
9614 my $in_range = 0; # ? Are we in one of the file's ranges
9615 my $previous_cp; # hex code point of previous line
9616 my $decimal_previous_cp = -1; # And its decimal equivalent
9617 my @start; # For each field, the current starting
9618 # code point in hex for the range
9619 # being accumulated.
9620 my @fields; # The input fields;
9621 my @previous_fields; # And those from the previous call
9623 sub filter_UnicodeData_line {
9624 # Handle a single input line from UnicodeData.txt; see comments above
9625 # Conceptually this takes a single line from the file containing N
9626 # properties, and converts it into N lines with one property per line,
9627 # which is what the final handler expects. But there are
9628 # complications due to the quirkiness of the input file, and to save
9629 # time, it accumulates ranges where the property values don't change
9630 # and only emits lines when necessary. This is about an order of
9631 # magnitude fewer lines emitted.
9634 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9636 # $_ contains the input line.
9637 # -1 in split means retain trailing null fields
9638 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9640 #local $to_trace = 1 if main::DEBUG;
9641 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9642 if (@fields > $input_field_count) {
9643 $file->carp_bad_line('Extra fields');
9648 my $decimal_cp = hex $cp;
9650 # We have to output all the buffered ranges when the next code point
9651 # is not exactly one after the previous one, which means there is a
9652 # gap in the ranges.
9653 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9655 # The decomposition mapping field requires special handling. It looks
9658 # <compat> 0032 0020
9661 # The decomposition type is enclosed in <brackets>; if missing, it
9662 # means the type is canonical. There are two decomposition mapping
9663 # tables: the one for use by Perl's normalize.pm has a special format
9664 # which is this field intact; the other, for general use is of
9665 # standard format. In either case we have to find the decomposition
9666 # type. Empty fields have None as their type, and map to the code
9668 if ($fields[$PERL_DECOMPOSITION] eq "") {
9669 $fields[$DECOMP_TYPE] = 'None';
9670 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9673 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9674 =~ / < ( .+? ) > \s* ( .+ ) /x;
9675 if (! defined $fields[$DECOMP_TYPE]) {
9676 $fields[$DECOMP_TYPE] = 'Canonical';
9677 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9680 $fields[$DECOMP_MAP] = $map;
9684 # The 3 numeric fields also require special handling. The 2 digit
9685 # fields must be either empty or match the number field. This means
9686 # that if it is empty, they must be as well, and the numeric type is
9687 # None, and the numeric value is 'Nan'.
9688 # The decimal digit field must be empty or match the other digit
9689 # field. If the decimal digit field is non-empty, the code point is
9690 # a decimal digit, and the other two fields will have the same value.
9691 # If it is empty, but the other digit field is non-empty, the code
9692 # point is an 'other digit', and the number field will have the same
9693 # value as the other digit field. If the other digit field is empty,
9694 # but the number field is non-empty, the code point is a generic
9696 if ($fields[$NUMERIC] eq "") {
9697 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9698 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9700 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9702 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9703 $fields[$NUMERIC] = 'NaN';
9706 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9707 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9708 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9709 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9711 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9712 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9713 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9716 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9718 # Rationals require extra effort.
9719 register_fraction($fields[$NUMERIC])
9720 if $fields[$NUMERIC] =~ qr{/};
9724 # For the properties that have empty fields in the file, and which
9725 # mean something different from empty, change them to that default.
9726 # Certain fields just haven't been empty so far in any Unicode
9727 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9728 # $CATEGORY. This leaves just the two fields, and so we hard-code in
9729 # the defaults; which are very unlikely to ever change.
9730 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9731 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9733 # UAX44 says that if title is empty, it is the same as whatever upper
9735 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9737 # There are a few pairs of lines like:
9738 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9739 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9740 # that define ranges. These should be processed after the fields are
9741 # adjusted above, as they may override some of them; but mostly what
9742 # is left is to possibly adjust the $CHARNAME field. The names of all the
9743 # paired lines start with a '<', but this is also true of '<control>,
9744 # which isn't one of these special ones.
9745 if ($fields[$CHARNAME] eq '<control>') {
9747 # Some code points in this file have the pseudo-name
9748 # '<control>', but the official name for such ones is the null
9749 # string. For charnames.pm, we use the Unicode version 1 name
9750 $fields[$NAME] = "";
9751 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
9753 # We had better not be in between range lines.
9755 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
9759 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
9761 # Here is a non-range line. We had better not be in between range
9764 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
9767 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
9769 # These are code points whose names end in their code points,
9770 # which means the names are algorithmically derivable from the
9771 # code points. To shorten the output Name file, the algorithm
9772 # for deriving these is placed in the file instead of each
9773 # code point, so they have map type $CP_IN_NAME
9774 $fields[$CHARNAME] = $CMD_DELIM
9779 . $fields[$CHARNAME];
9781 $fields[$NAME] = $fields[$CHARNAME];
9783 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
9784 $fields[$CHARNAME] = $fields[$NAME] = $1;
9786 # Here we are at the beginning of a range pair.
9788 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
9792 # Because the properties in the range do not overwrite any already
9793 # in the db, we must flush the buffers of what's already there, so
9794 # they get handled in the normal scheme.
9798 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
9799 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
9803 else { # Here, we are at the last line of a range pair.
9806 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
9812 $fields[$NAME] = $fields[$CHARNAME];
9814 # Check that the input is valid: that the closing of the range is
9815 # the same as the beginning.
9816 foreach my $i (0 .. $last_field) {
9817 next if $fields[$i] eq $previous_fields[$i];
9818 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
9821 # The processing differs depending on the type of range,
9822 # determined by its $CHARNAME
9823 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
9825 # Check that the data looks right.
9826 if ($decimal_previous_cp != $SBase) {
9827 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
9829 if ($decimal_cp != $SBase + $SCount - 1) {
9830 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
9833 # The Hangul syllable range has a somewhat complicated name
9834 # generation algorithm. Each code point in it has a canonical
9835 # decomposition also computable by an algorithm. The
9836 # perl decomposition map table built from these is used only
9837 # by normalize.pm, which has the algorithm built in it, so the
9838 # decomposition maps are not needed, and are large, so are
9839 # omitted from it. If the full decomposition map table is to
9840 # be output, the decompositions are generated for it, in the
9841 # EOF handling code for this input file.
9843 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9845 # This range is stored in our internal structure with its
9846 # own map type, different from all others.
9847 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9853 . $fields[$CHARNAME];
9855 elsif ($fields[$CHARNAME] =~ /^CJK/) {
9857 # The name for these contains the code point itself, and all
9858 # are defined to have the same base name, regardless of what
9859 # is in the file. They are stored in our internal structure
9860 # with a map type of $CP_IN_NAME
9861 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9867 . 'CJK UNIFIED IDEOGRAPH';
9870 elsif ($fields[$CATEGORY] eq 'Co'
9871 || $fields[$CATEGORY] eq 'Cs')
9873 # The names of all the code points in these ranges are set to
9874 # null, as there are no names for the private use and
9875 # surrogate code points.
9877 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
9880 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
9883 # The first line of the range caused everything else to be output,
9884 # and then its values were stored as the beginning values for the
9885 # next set of ranges, which this one ends. Now, for each value,
9886 # add a command to tell the handler that these values should not
9887 # replace any existing ones in our database.
9888 foreach my $i (0 .. $last_field) {
9889 $previous_fields[$i] = $CMD_DELIM
9894 . $previous_fields[$i];
9897 # And change things so it looks like the entire range has been
9898 # gone through with this being the final part of it. Adding the
9899 # command above to each field will cause this range to be flushed
9900 # during the next iteration, as it guaranteed that the stored
9901 # field won't match whatever value the next one has.
9903 $decimal_previous_cp = $decimal_cp;
9905 # We are now set up for the next iteration; so skip the remaining
9906 # code in this subroutine that does the same thing, but doesn't
9907 # know about these ranges.
9913 # On the very first line, we fake it so the code below thinks there is
9914 # nothing to output, and initialize so that when it does get output it
9915 # uses the first line's values for the lowest part of the range.
9916 # (One could avoid this by using peek(), but then one would need to
9917 # know the adjustments done above and do the same ones in the setup
9918 # routine; not worth it)
9921 @previous_fields = @fields;
9922 @start = ($cp) x scalar @fields;
9923 $decimal_previous_cp = $decimal_cp - 1;
9926 # For each field, output the stored up ranges that this code point
9927 # doesn't fit in. Earlier we figured out if all ranges should be
9928 # terminated because of changing the replace or map type styles, or if
9929 # there is a gap between this new code point and the previous one, and
9930 # that is stored in $force_output. But even if those aren't true, we
9931 # need to output the range if this new code point's value for the
9932 # given property doesn't match the stored range's.
9933 #local $to_trace = 1 if main::DEBUG;
9934 foreach my $i (0 .. $last_field) {
9935 my $field = $fields[$i];
9936 if ($force_output || $field ne $previous_fields[$i]) {
9938 # Flush the buffer of stored values.
9939 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9941 # Start a new range with this code point and its value
9943 $previous_fields[$i] = $field;
9947 # Set the values for the next time.
9949 $decimal_previous_cp = $decimal_cp;
9951 # The input line has generated whatever adjusted lines are needed, and
9952 # should not be looked at further.
9957 sub EOF_UnicodeData {
9958 # Called upon EOF to flush the buffers, and create the Hangul
9959 # decomposition mappings if needed.
9962 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9964 # Flush the buffers.
9965 foreach my $i (1 .. $last_field) {
9966 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9969 if (-e 'Jamo.txt') {
9971 # The algorithm is published by Unicode, based on values in
9972 # Jamo.txt, (which should have been processed before this
9973 # subroutine), and the results left in %Jamo
9975 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
9979 # If the full decomposition map table is being output, insert
9980 # into it the Hangul syllable mappings. This is to avoid having
9981 # to publish a subroutine in it to compute them. (which would
9982 # essentially be this code.) This uses the algorithm published by
9984 if (property_ref('Decomposition_Mapping')->to_output_map) {
9985 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9987 my $SIndex = $S - $SBase;
9988 my $L = $LBase + $SIndex / $NCount;
9989 my $V = $VBase + ($SIndex % $NCount) / $TCount;
9990 my $T = $TBase + $SIndex % $TCount;
9992 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9993 my $decomposition = sprintf("%04X %04X", $L, $V);
9994 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9995 $file->insert_adjusted_lines(
9996 sprintf("%04X; Decomposition_Mapping; %s",
10006 sub filter_v1_ucd {
10007 # Fix UCD lines in version 1. This is probably overkill, but this
10008 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10009 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10010 # removed. This program retains them
10011 # 2) didn't include ranges, which it should have, and which are now
10012 # added in @corrected_lines below. It was hand populated by
10013 # taking the data from Version 2, verified by analyzing
10015 # 3) There is a syntax error in the entry for U+09F8 which could
10016 # cause problems for utf8_heavy, and so is changed. It's
10017 # numeric value was simply a minus sign, without any number.
10018 # (Eventually Unicode changed the code point to non-numeric.)
10019 # 4) The decomposition types often don't match later versions
10020 # exactly, and the whole syntax of that field is different; so
10021 # the syntax is changed as well as the types to their later
10022 # terminology. Otherwise normalize.pm would be very unhappy
10023 # 5) Many ccc classes are different. These are left intact.
10024 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10025 # fields. These are unchanged because it doesn't really cause
10026 # problems for Perl.
10027 # 7) A number of code points, such as controls, don't have their
10028 # Unicode Version 1 Names in this file. These are unchanged.
10030 my @corrected_lines = split /\n/, <<'END';
10031 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10032 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10033 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10034 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10035 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10036 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10040 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10042 #local $to_trace = 1 if main::DEBUG;
10043 trace $_ if main::DEBUG && $to_trace;
10045 # -1 => retain trailing null fields
10046 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10048 # At the first place that is wrong in the input, insert all the
10049 # corrections, replacing the wrong line.
10050 if ($code_point eq '4E00') {
10051 my @copy = @corrected_lines;
10053 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10055 $file->insert_lines(@copy);
10059 if ($fields[$NUMERIC] eq '-') {
10060 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10063 if ($fields[$PERL_DECOMPOSITION] ne "") {
10065 # Several entries have this change to superscript 2 or 3 in the
10066 # middle. Convert these to the modern version, which is to use
10067 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10068 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10069 # 'HHHH HHHH 00B3 HHHH'.
10070 # It turns out that all of these that don't have another
10071 # decomposition defined at the beginning of the line have the
10072 # <square> decomposition in later releases.
10073 if ($code_point ne '00B2' && $code_point ne '00B3') {
10074 if ($fields[$PERL_DECOMPOSITION]
10075 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10077 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10078 $fields[$PERL_DECOMPOSITION] = '<square> '
10079 . $fields[$PERL_DECOMPOSITION];
10084 # If is like '<+circled> 0052 <-circled>', convert to
10086 $fields[$PERL_DECOMPOSITION] =~
10087 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10089 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10090 $fields[$PERL_DECOMPOSITION] =~
10091 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10092 or $fields[$PERL_DECOMPOSITION] =~
10093 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10094 or $fields[$PERL_DECOMPOSITION] =~
10095 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10096 or $fields[$PERL_DECOMPOSITION] =~
10097 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10099 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10100 $fields[$PERL_DECOMPOSITION] =~
10101 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10103 # Change names to modern form.
10104 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10105 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10106 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10107 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10109 # One entry has weird braces
10110 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10113 $_ = join ';', $code_point, @fields;
10114 trace $_ if main::DEBUG && $to_trace;
10118 sub filter_v2_1_5_ucd {
10119 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10120 # columns swapped; These all had mirrored be 'N'. So if the numeric
10121 # column appears to be N, swap it back.
10123 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10124 if ($fields[$NUMERIC] eq 'N') {
10125 $fields[$NUMERIC] = $fields[$MIRRORED];
10126 $fields[$MIRRORED] = 'N';
10127 $_ = join ';', $code_point, @fields;
10131 } # End closure for UnicodeData
10133 sub process_GCB_test {
10136 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10138 while ($file->next_line) {
10139 push @backslash_X_tests, $_;
10145 sub process_NamedSequences {
10146 # NamedSequences.txt entries are just added to an array. Because these
10147 # don't look like the other tables, they have their own handler.
10149 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10151 # This just adds the sequence to an array for later handling
10154 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10156 while ($file->next_line) {
10157 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10159 $file->carp_bad_line(
10160 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10164 # Note single \t in keeping with special output format of
10165 # Perl_charnames. But it turns out that the code points don't have to
10166 # be 5 digits long, like the rest, based on the internal workings of
10167 # charnames.pm. This could be easily changed for consistency.
10168 push @named_sequences, "$sequence\t$name";
10177 sub filter_early_ea_lb {
10178 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10179 # third field be the name of the code point, which can be ignored in
10180 # most cases. But it can be meaningful if it marks a range:
10181 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10182 # 3400;W;<CJK Ideograph Extension A, First>
10184 # We need to see the First in the example above to know it's a range.
10185 # They did not use the later range syntaxes. This routine changes it
10186 # to use the modern syntax.
10187 # $1 is the Input_file object.
10189 my @fields = split /\s*;\s*/;
10190 if ($fields[2] =~ /^<.*, First>/) {
10191 $first_range = $fields[0];
10194 elsif ($fields[2] =~ /^<.*, Last>/) {
10195 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10198 undef $first_range;
10199 $_ = "$fields[0]; $fields[1]";
10206 sub filter_old_style_arabic_shaping {
10207 # Early versions used a different term for the later one.
10209 my @fields = split /\s*;\s*/;
10210 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10211 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10212 $_ = join ';', @fields;
10216 sub filter_arabic_shaping_line {
10217 # ArabicShaping.txt has entries that look like:
10218 # 062A; TEH; D; BEH
10219 # The field containing 'TEH' is not used. The next field is Joining_Type
10220 # and the last is Joining_Group
10221 # This generates two lines to pass on, one for each property on the input
10225 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10227 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10230 $file->carp_bad_line('Extra fields');
10235 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10236 $_ = "$fields[0]; Joining_Type; $fields[2]";
10241 sub setup_special_casing {
10242 # SpecialCasing.txt contains the non-simple case change mappings. The
10243 # simple ones are in UnicodeData.txt, which should already have been read
10244 # in to the full property data structures, so as to initialize these with
10245 # the simple ones. Then the SpecialCasing.txt entries overwrite the ones
10246 # which have different full mappings.
10248 # This routine sees if the simple mappings are to be output, and if so,
10249 # copies what has already been put into the full mapping tables, while
10250 # they still contain only the simple mappings.
10252 # The reason it is done this way is that the simple mappings are probably
10253 # not going to be output, so it saves work to initialize the full tables
10254 # with the simple mappings, and then overwrite those relatively few
10255 # entries in them that have different full mappings, and thus skip the
10256 # simple mapping tables altogether.
10259 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10261 # For each of the case change mappings...
10262 foreach my $case ('lc', 'tc', 'uc') {
10263 my $full = property_ref($case);
10264 unless (defined $full && ! $full->is_empty) {
10265 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10268 # The simple version's name in each mapping merely has an 's' in front
10269 # of the full one's
10270 my $simple = property_ref('s' . $case);
10271 $simple->initialize($full) if $simple->to_output_map();
10277 sub filter_special_casing_line {
10278 # Change the format of $_ from SpecialCasing.txt into something that the
10279 # generic handler understands. Each input line contains three case
10280 # mappings. This will generate three lines to pass to the generic handler
10281 # for each of those.
10283 # The input syntax (after stripping comments and trailing white space is
10284 # like one of the following (with the final two being entries that we
10286 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10287 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10288 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10289 # Note the trailing semi-colon, unlike many of the input files. That
10290 # means that there will be an extra null field generated by the split
10293 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10295 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10297 # field #4 is when this mapping is conditional. If any of these get
10298 # implemented, it would be by hard-coding in the casing functions in the
10299 # Perl core, not through tables. But if there is a new condition we don't
10300 # know about, output a warning. We know about all the conditions through
10302 if ($fields[4] ne "") {
10303 my @conditions = split ' ', $fields[4];
10304 if ($conditions[0] ne 'tr' # We know that these languages have
10305 # conditions, and some are multiple
10306 && $conditions[0] ne 'az'
10307 && $conditions[0] ne 'lt'
10309 # And, we know about a single condition Final_Sigma, but
10311 && ($v_version gt v5.2.0
10312 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10314 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10316 elsif ($conditions[0] ne 'Final_Sigma') {
10318 # Don't print out a message for Final_Sigma, because we have
10319 # hard-coded handling for it. (But the standard could change
10320 # what the rule should be, but it wouldn't show up here
10323 print "# SKIPPING Special Casing: $_\n"
10324 if $verbosity >= $VERBOSE;
10329 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10330 $file->carp_bad_line('Extra fields');
10335 $_ = "$fields[0]; lc; $fields[1]";
10336 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10337 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
10342 sub filter_old_style_case_folding {
10343 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
10344 # and later style. Different letters were used in the earlier.
10347 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10349 my @fields = split /\s*;\s*/;
10350 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10353 elsif ($fields[1] eq 'L') {
10354 $fields[1] = 'C'; # L => C always
10356 elsif ($fields[1] eq 'E') {
10357 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10365 $file->carp_bad_line("Expecting L or E in second field");
10369 $_ = join("; ", @fields) . ';';
10373 { # Closure for case folding
10375 # Create the map for simple only if are going to output it, for otherwise
10376 # it takes no part in anything we do.
10377 my $to_output_simple;
10380 # These are experimental, perhaps will need these to pass to regcomp.c to
10381 # handle the cases where for example the Kelvin sign character folds to k,
10382 # and in regcomp, we need to know which of the characters can have a
10383 # non-latin1 char fold to it, so it doesn't do the optimizations it might
10385 my @latin1_singly_folded;
10388 sub setup_case_folding($) {
10389 # Read in the case foldings in CaseFolding.txt. This handles both
10390 # simple and full case folding.
10393 = property_ref('Simple_Case_Folding')->to_output_map;
10398 sub filter_case_folding_line {
10399 # Called for each line in CaseFolding.txt
10400 # Input lines look like:
10401 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10402 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10403 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10405 # 'C' means that folding is the same for both simple and full
10406 # 'F' that it is only for full folding
10407 # 'S' that it is only for simple folding
10408 # 'T' is locale-dependent, and ignored
10409 # 'I' is a type of 'F' used in some early releases.
10410 # Note the trailing semi-colon, unlike many of the input files. That
10411 # means that there will be an extra null field generated by the split
10412 # below, which we ignore and hence is not an error.
10415 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10417 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10418 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10419 $file->carp_bad_line('Extra fields');
10424 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10429 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10430 # I are all full foldings
10431 if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
10432 $_ = "$range; Case_Folding; $map";
10436 if ($type ne 'S') {
10437 $file->carp_bad_line('Expecting C F I S or T in second field');
10442 # C and S are simple foldings, but simple case folding is not needed
10443 # unless we explicitly want its map table output.
10444 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10445 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10448 # XXX Experimental, see comment above
10449 if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
10450 my @folded = split ' ', $map;
10451 if (hex $folded[0] < 256 && @folded == 1) {
10452 push @latin1_singly_folded, hex $folded[0];
10454 foreach my $folded (@folded) {
10455 push @latin1_folded, hex $folded if hex $folded < 256;
10463 # XXX Experimental, see comment above
10466 #local $to_trace = 1 if main::DEBUG;
10467 @latin1_singly_folded = uniques(@latin1_singly_folded);
10468 @latin1_folded = uniques(@latin1_folded);
10469 trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
10470 trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
10473 } # End case fold closure
10475 sub filter_jamo_line {
10476 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10477 # from this file that is used in generating the Name property for Jamo
10478 # code points. But, it also is used to convert early versions' syntax
10479 # into the modern form. Here are two examples:
10480 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10481 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10483 # The input is $_, the output is $_ filtered.
10485 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10487 # Let the caller handle unexpected input. In earlier versions, there was
10488 # a third field which is supposed to be a comment, but did not have a '#'
10490 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10492 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10495 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10496 $fields[1] = 'R' if $fields[0] eq '1105';
10498 # Add to structure so can generate Names from it.
10499 my $cp = hex $fields[0];
10500 my $short_name = $fields[1];
10501 $Jamo{$cp} = $short_name;
10502 if ($cp <= $LBase + $LCount) {
10503 $Jamo_L{$short_name} = $cp - $LBase;
10505 elsif ($cp <= $VBase + $VCount) {
10506 $Jamo_V{$short_name} = $cp - $VBase;
10508 elsif ($cp <= $TBase + $TCount) {
10509 $Jamo_T{$short_name} = $cp - $TBase;
10512 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10516 # Reassemble using just the first two fields to look like a typical
10517 # property file line
10518 $_ = "$fields[0]; $fields[1]";
10523 sub register_fraction($) {
10524 # This registers the input rational number so that it can be passed on to
10525 # utf8_heavy.pl, both in rational and floating forms.
10527 my $rational = shift;
10529 my $float = eval $rational;
10530 $nv_floating_to_rational{$float} = $rational;
10534 sub filter_numeric_value_line {
10535 # DNumValues contains lines of a different syntax than the typical
10537 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10539 # This routine transforms $_ containing the anomalous syntax to the
10540 # typical, by filtering out the extra columns, and convert early version
10541 # decimal numbers to strings that look like rational numbers.
10544 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10546 # Starting in 5.1, there is a rational field. Just use that, omitting the
10547 # extra columns. Otherwise convert the decimal number in the second field
10548 # to a rational, and omit extraneous columns.
10549 my @fields = split /\s*;\s*/, $_, -1;
10552 if ($v_version ge v5.1.0) {
10553 if (@fields != 4) {
10554 $file->carp_bad_line('Not 4 semi-colon separated fields');
10558 $rational = $fields[3];
10559 $_ = join '; ', @fields[ 0, 3 ];
10563 # Here, is an older Unicode file, which has decimal numbers instead of
10564 # rationals in it. Use the fraction to calculate the denominator and
10565 # convert to rational.
10567 if (@fields != 2 && @fields != 3) {
10568 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10573 my $codepoints = $fields[0];
10574 my $decimal = $fields[1];
10575 if ($decimal =~ s/\.0+$//) {
10577 # Anything ending with a decimal followed by nothing but 0's is an
10579 $_ = "$codepoints; $decimal";
10580 $rational = $decimal;
10585 if ($decimal =~ /\.50*$/) {
10589 # Here have the hardcoded repeating decimals in the fraction, and
10590 # the denominator they imply. There were only a few denominators
10591 # in the older Unicode versions of this file which this code
10592 # handles, so it is easy to convert them.
10594 # The 4 is because of a round-off error in the Unicode 3.2 files
10595 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10598 elsif ($decimal =~ /\.[27]50*$/) {
10601 elsif ($decimal =~ /\.[2468]0*$/) {
10604 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10607 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10610 if ($denominator) {
10611 my $sign = ($decimal < 0) ? "-" : "";
10612 my $numerator = int((abs($decimal) * $denominator) + .5);
10613 $rational = "$sign$numerator/$denominator";
10614 $_ = "$codepoints; $rational";
10617 $file->carp_bad_line("Can't cope with number '$decimal'.");
10624 register_fraction($rational) if $rational =~ qr{/};
10629 my %unihan_properties;
10634 # Do any special setup for Unihan properties.
10636 # This property gives the wrong computed type, so override.
10637 my $usource = property_ref('kIRG_USource');
10638 $usource->set_type($STRING) if defined $usource;
10640 # This property is to be considered binary, so change all the values
10642 $iicore = property_ref('kIICore');
10643 if (defined $iicore) {
10644 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10646 # We have to change the default map, because the @missing line is
10647 # misleading, given that we are treating it as binary.
10648 $iicore->set_default_map('N');
10649 $iicore->set_type($BINARY);
10655 sub filter_unihan_line {
10656 # Change unihan db lines to look like the others in the db. Here is
10658 # U+341C kCangjie IEKN
10660 # Tabs are used instead of semi-colons to separate fields; therefore
10661 # they may have semi-colons embedded in them. Change these to periods
10662 # so won't screw up the rest of the code.
10665 # Remove lines that don't look like ones we accept.
10666 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10671 # Extract the property, and save a reference to its object.
10673 if (! exists $unihan_properties{$property}) {
10674 $unihan_properties{$property} = property_ref($property);
10677 # Don't do anything unless the property is one we're handling, which
10678 # we determine by seeing if there is an object defined for it or not
10679 if (! defined $unihan_properties{$property}) {
10684 # The iicore property is supposed to be a boolean, so convert to our
10685 # standard boolean form.
10686 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10687 $_ =~ s/$property.*/$property\tY/
10690 # Convert the tab separators to our standard semi-colons, and convert
10691 # the U+HHHH notation to the rest of the standard's HHHH
10693 s/\b U \+ (?= $code_point_re )//xg;
10695 #local $to_trace = 1 if main::DEBUG;
10696 trace $_ if main::DEBUG && $to_trace;
10702 sub filter_blocks_lines {
10703 # In the Blocks.txt file, the names of the blocks don't quite match the
10704 # names given in PropertyValueAliases.txt, so this changes them so they
10705 # do match: Blanks and hyphens are changed into underscores. Also makes
10706 # early release versions look like later ones
10708 # $_ is transformed to the correct value.
10711 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10713 if ($v_version lt v3.2.0) {
10714 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10719 # Old versions used a different syntax to mark the range.
10720 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10723 my @fields = split /\s*;\s*/, $_, -1;
10724 if (@fields != 2) {
10725 $file->carp_bad_line("Expecting exactly two fields");
10730 # Change hyphens and blanks in the block name field only
10731 $fields[1] =~ s/[ -]/_/g;
10732 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10734 $_ = join("; ", @fields);
10739 my $current_property;
10741 sub filter_old_style_proplist {
10742 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
10743 # was in a completely different syntax. Ken Whistler of Unicode says
10744 # that it was something he used as an aid for his own purposes, but
10745 # was never an official part of the standard. However, comments in
10746 # DAge.txt indicate that non-character code points were available in
10747 # the UCD as of 3.1. It is unclear to me (khw) how they could be
10748 # there except through this file (but on the other hand, they first
10749 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10750 # not. But the claim is that it was published as an aid to others who
10751 # might want some more information than was given in the official UCD
10752 # of the time. Many of the properties in it were incorporated into
10753 # the later PropList.txt, but some were not. This program uses this
10754 # early file to generate property tables that are otherwise not
10755 # accessible in the early UCD's, and most were probably not really
10756 # official at that time, so one could argue that it should be ignored,
10757 # and you can easily modify things to skip this. And there are bugs
10758 # in this file in various versions. (For example, the 2.1.9 version
10759 # removes from Alphabetic the CJK range starting at 4E00, and they
10760 # weren't added back in until 3.1.0.) Many of this file's properties
10761 # were later sanctioned, so this code generates tables for those
10762 # properties that aren't otherwise in the UCD of the time but
10763 # eventually did become official, and throws away the rest. Here is a
10764 # list of all the ones that are thrown away:
10765 # Bidi=* duplicates UnicodeData.txt
10766 # Combining never made into official property;
10768 # Composite never made into official property.
10769 # Currency Symbol duplicates UnicodeData.txt: gc=sc
10770 # Decimal Digit duplicates UnicodeData.txt: gc=nd
10771 # Delimiter never made into official property;
10773 # Format Control never made into official property;
10775 # High Surrogate duplicates Blocks.txt
10776 # Ignorable Control never made into official property;
10778 # ISO Control duplicates UnicodeData.txt: gc=cc
10779 # Left of Pair never made into official property;
10780 # Line Separator duplicates UnicodeData.txt: gc=zl
10781 # Low Surrogate duplicates Blocks.txt
10782 # Non-break was actually listed as a property
10783 # in 3.2, but without any code
10784 # points. Unicode denies that this
10785 # was ever an official property
10786 # Non-spacing duplicate UnicodeData.txt: gc=mn
10787 # Numeric duplicates UnicodeData.txt: gc=cc
10788 # Paired Punctuation never made into official property;
10789 # appears to be gc=ps + gc=pe
10790 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
10791 # Private Use duplicates UnicodeData.txt: gc=co
10792 # Private Use High Surrogate duplicates Blocks.txt
10793 # Punctuation duplicates UnicodeData.txt: gc=p
10794 # Space different definition than eventual
10796 # Titlecase duplicates UnicodeData.txt: gc=lt
10797 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
10798 # Zero-width never made into offical property;
10800 # Most of the properties have the same names in this file as in later
10801 # versions, but a couple do not.
10803 # This subroutine filters $_, converting it from the old style into
10804 # the new style. Here's a sample of the old-style
10806 # *******************************************
10808 # Property dump for: 0x100000A0 (Join Control)
10810 # 200C..200D (2 chars)
10812 # In the example, the property is "Join Control". It is kept in this
10813 # closure between calls to the subroutine. The numbers beginning with
10814 # 0x were internal to Ken's program that generated this file.
10816 # If this line contains the property name, extract it.
10817 if (/^Property dump for: [^(]*\((.*)\)/) {
10820 # Convert white space to underscores.
10823 # Convert the few properties that don't have the same name as
10824 # their modern counterparts
10825 s/Identifier_Part/ID_Continue/
10826 or s/Not_a_Character/NChar/;
10828 # If the name matches an existing property, use it.
10829 if (defined property_ref($_)) {
10830 trace "new property=", $_ if main::DEBUG && $to_trace;
10831 $current_property = $_;
10833 else { # Otherwise discard it
10834 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10835 undef $current_property;
10837 $_ = ""; # The property is saved for the next lines of the
10838 # file, but this defining line is of no further use,
10839 # so clear it so that the caller won't process it
10842 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10844 # Here, the input line isn't a header defining a property for the
10845 # following section, and either we aren't in such a section, or
10846 # the line doesn't look like one that defines the code points in
10847 # such a section. Ignore this line.
10852 # Here, we have a line defining the code points for the current
10853 # stashed property. Anything starting with the first blank is
10854 # extraneous. Otherwise, it should look like a normal range to
10855 # the caller. Append the property name so that it looks just like
10856 # a modern PropList entry.
10859 $_ .= "; $current_property";
10861 trace $_ if main::DEBUG && $to_trace;
10864 } # End closure for old style proplist
10866 sub filter_old_style_normalization_lines {
10867 # For early releases of Unicode, the lines were like:
10868 # 74..2A76 ; NFKD_NO
10869 # For later releases this became:
10870 # 74..2A76 ; NFKD_QC; N
10871 # Filter $_ to look like those in later releases.
10872 # Similarly for MAYBEs
10874 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10876 # Also, the property FC_NFKC was abbreviated to FNC
10881 sub finish_Unicode() {
10882 # This routine should be called after all the Unicode files have been read
10884 # 1) Adds the mappings for code points missing from the files which have
10885 # defaults specified for them.
10886 # 2) At this this point all mappings are known, so it computes the type of
10887 # each property whose type hasn't been determined yet.
10888 # 3) Calculates all the regular expression match tables based on the
10890 # 3) Calculates and adds the tables which are defined by Unicode, but
10891 # which aren't derived by them
10893 # For each property, fill in any missing mappings, and calculate the re
10894 # match tables. If a property has more than one missing mapping, the
10895 # default is a reference to a data structure, and requires data from other
10896 # properties to resolve. The sort is used to cause these to be processed
10897 # last, after all the other properties have been calculated.
10898 # (Fortunately, the missing properties so far don't depend on each other.)
10899 foreach my $property
10900 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10903 # $perl has been defined, but isn't one of the Unicode properties that
10904 # need to be finished up.
10905 next if $property == $perl;
10907 # Handle the properties that have more than one possible default
10908 if (ref $property->default_map) {
10909 my $default_map = $property->default_map;
10911 # These properties have stored in the default_map:
10913 # 1) A default map which applies to all code points in a
10915 # 2) an expression which will evaluate to the list of code
10916 # points in that class
10918 # 3) the default map which applies to every other missing code
10921 # Go through each list.
10922 while (my ($default, $eval) = $default_map->get_next_defaults) {
10924 # Get the class list, and intersect it with all the so-far
10925 # unspecified code points yielding all the code points
10926 # in the class that haven't been specified.
10927 my $list = eval $eval;
10929 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10933 # Narrow down the list to just those code points we don't have
10935 $list = $list & $property->inverse_list;
10937 # Add mappings to the property for each code point in the list
10938 foreach my $range ($list->ranges) {
10939 $property->add_map($range->start, $range->end, $default);
10943 # All remaining code points have the other mapping. Set that up
10944 # so the normal single-default mapping code will work on them
10945 $property->set_default_map($default_map->other_default);
10947 # And fall through to do that
10950 # We should have enough data now to compute the type of the property.
10951 $property->compute_type;
10952 my $property_type = $property->type;
10954 next if ! $property->to_create_match_tables;
10956 # Here want to create match tables for this property
10958 # The Unicode db always (so far, and they claim into the future) have
10959 # the default for missing entries in binary properties be 'N' (unless
10960 # there is a '@missing' line that specifies otherwise)
10961 if ($property_type == $BINARY && ! defined $property->default_map) {
10962 $property->set_default_map('N');
10965 # Add any remaining code points to the mapping, using the default for
10966 # missing code points
10967 if (defined (my $default_map = $property->default_map)) {
10968 foreach my $range ($property->inverse_list->ranges) {
10969 $property->add_map($range->start, $range->end, $default_map);
10972 # Make sure there is a match table for the default
10973 if (! defined $property->table($default_map)) {
10974 $property->add_match_table($default_map);
10978 # Have all we need to populate the match tables.
10979 my $property_name = $property->name;
10980 foreach my $range ($property->ranges) {
10981 my $map = $range->value;
10982 my $table = property_ref($property_name)->table($map);
10983 if (! defined $table) {
10985 # Integral and rational property values are not necessarily
10986 # defined in PropValueAliases, but all other ones should be,
10988 if ($v_version ge v5.1.0
10989 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10991 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
10993 $table = property_ref($property_name)->add_match_table($map);
10996 $table->add_range($range->start, $range->end);
10999 # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
11000 # all properties have this optional prefix. These do not get a
11001 # separate entry in the pod file, because are covered by a wild-card
11003 foreach my $alias ($property->aliases) {
11004 my $Is_name = 'Is_' . $alias->name;
11005 if (! defined (my $pre_existing = property_ref($Is_name))) {
11006 $property->add_alias($Is_name,
11008 Status => $alias->status,
11009 Externally_Ok => 0);
11013 # It seemed too much work to add in these warnings when it
11014 # appears that Unicode has made a decision never to begin a
11015 # property name with 'Is_', so this shouldn't happen, but just
11016 # in case, it is a warning.
11017 Carp::my_carp(<<END
11018 There is already an alias named $Is_name (from " . $pre_existing . "), so not
11019 creating this alias for $property. The generated table and pod files do not
11020 warn users of this conflict.
11023 $has_Is_conflicts++;
11025 } # End of loop through aliases for this property
11026 } # End of loop through all Unicode properties.
11028 # Fill in the mappings that Unicode doesn't completely furnish. First the
11029 # single letter major general categories. If Unicode were to start
11030 # delivering the values, this would be redundant, but better that than to
11031 # try to figure out if should skip and not get it right. Ths could happen
11032 # if a new major category were to be introduced, and the hard-coded test
11033 # wouldn't know about it.
11034 # This routine depends on the standard names for the general categories
11035 # being what it thinks they are, like 'Cn'. The major categories are the
11036 # union of all the general category tables which have the same first
11037 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11038 foreach my $minor_table ($gc->tables) {
11039 my $minor_name = $minor_table->name;
11040 next if length $minor_name == 1;
11041 if (length $minor_name != 2) {
11042 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11046 my $major_name = uc(substr($minor_name, 0, 1));
11047 my $major_table = $gc->table($major_name);
11048 $major_table += $minor_table;
11051 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11052 # defines it as LC)
11053 my $LC = $gc->table('LC');
11054 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11055 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11058 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11059 # deliver the correct values in it
11060 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11062 # Lt not in release 1.
11063 $LC += $gc->table('Lt') if defined $gc->table('Lt');
11065 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11067 my $Cs = $gc->table('Cs');
11069 $Cs->add_note('Mostly not usable in Perl.');
11070 $Cs->add_comment(join_lines(<<END
11071 Surrogates are used exclusively for I/O in UTF-16, and should not appear in
11072 Unicode text, and hence their use will generate (usually fatal) messages
11078 # Folding information was introduced later into Unicode data. To get
11079 # Perl's case ignore (/i) to work at all in releases that don't have
11080 # folding, use the best available alternative, which is lower casing.
11081 my $fold = property_ref('Simple_Case_Folding');
11082 if ($fold->is_empty) {
11083 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11084 $fold->add_note(join_lines(<<END
11085 WARNING: This table uses lower case as a substitute for missing fold
11091 # Multiple-character mapping was introduced later into Unicode data. If
11092 # missing, use the single-characters maps as best available alternative
11093 foreach my $map (qw { Uppercase_Mapping
11098 my $full = property_ref($map);
11099 if ($full->is_empty) {
11100 my $simple = property_ref('Simple_' . $map);
11101 $full->initialize($simple);
11102 $full->add_comment($simple->comment) if ($simple->comment);
11103 $full->add_note(join_lines(<<END
11104 WARNING: This table uses simple mapping (single-character only) as a
11105 substitute for missing multiple-character information
11113 sub compile_perl() {
11114 # Create perl-defined tables. Almost all are part of the pseudo-property
11115 # named 'perl' internally to this program. Many of these are recommended
11116 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11117 # on those found there.
11118 # Almost all of these are equivalent to some Unicode property.
11119 # A number of these properties have equivalents restricted to the ASCII
11120 # range, with their names prefaced by 'Posix', to signify that these match
11121 # what the Posix standard says they should match. A couple are
11122 # effectively this, but the name doesn't have 'Posix' in it because there
11123 # just isn't any Posix equivalent.
11125 # 'Any' is all code points. As an error check, instead of just setting it
11126 # to be that, construct it to be the union of all the major categories
11127 my $Any = $perl->add_match_table('Any',
11128 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
11131 foreach my $major_table ($gc->tables) {
11133 # Major categories are the ones with single letter names.
11134 next if length($major_table->name) != 1;
11136 $Any += $major_table;
11139 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
11140 Carp::my_carp_bug("Generated highest code point ("
11141 . sprintf("%X", $Any->max)
11142 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
11144 if ($Any->range_count != 1 || $Any->min != 0) {
11145 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11148 $Any->add_alias('All');
11150 # Assigned is the opposite of gc=unassigned
11151 my $Assigned = $perl->add_match_table('Assigned',
11152 Description => "All assigned code points",
11153 Initialize => ~ $gc->table('Unassigned'),
11156 # Our internal-only property should be treated as more than just a
11158 $perl->add_match_table('_CombAbove')
11159 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11162 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11163 if (defined $block) { # This is equivalent to the block if have it.
11164 my $Unicode_ASCII = $block->table('Basic_Latin');
11165 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11166 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11170 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11172 if ($ASCII->is_empty) {
11173 $ASCII->initialize([ 0..127 ]);
11176 # Get the best available case definitions. Early Unicode versions didn't
11177 # have Uppercase and Lowercase defined, so use the general category
11178 # instead for them.
11179 my $Lower = $perl->add_match_table('Lower');
11180 my $Unicode_Lower = property_ref('Lowercase');
11181 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11182 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
11185 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11188 $perl->add_match_table("PosixLower",
11189 Description => "[a-z]",
11190 Initialize => $Lower & $ASCII,
11193 my $Upper = $perl->add_match_table('Upper');
11194 my $Unicode_Upper = property_ref('Uppercase');
11195 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11196 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
11199 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11202 $perl->add_match_table("PosixUpper",
11203 Description => "[A-Z]",
11204 Initialize => $Upper & $ASCII,
11207 # Earliest releases didn't have title case. Initialize it to empty if not
11208 # otherwise present
11209 my $Title = $perl->add_match_table('Title');
11210 my $lt = $gc->table('Lt');
11212 $Title->set_equivalent_to($lt, Related => 1);
11215 # If this Unicode version doesn't have Cased, set up our own. From
11216 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11217 # and only if C has the Lowercase or Uppercase property or has a
11218 # General_Category value of Titlecase_Letter.
11219 unless (defined property_ref('Cased')) {
11220 my $cased = $perl->add_match_table('Cased',
11221 Initialize => $Lower + $Upper + $Title,
11222 Description => 'Uppercase or Lowercase or Titlecase',
11226 # Similarly, set up our own Case_Ignorable property if this Unicode
11227 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11228 # C is defined to be case-ignorable if C has the value MidLetter or the
11229 # value MidNumLet for the Word_Break property or its General_Category is
11230 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11231 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11233 # Perl has long had an internal-only alias for this property.
11234 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
11235 my $case_ignorable = property_ref('Case_Ignorable');
11236 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11237 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11242 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11244 # The following three properties are not in early releases
11245 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11246 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11247 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11249 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11250 # correspondingly the case-ignorable definition lacks that one. For
11251 # 4.0, it appears that it was meant to be the same definition, but was
11252 # inadvertently omitted from the standard's text, so add it if the
11253 # property actually is there
11254 my $wb = property_ref('Word_Break');
11256 my $midlet = $wb->table('MidLetter');
11257 $perl_case_ignorable += $midlet if defined $midlet;
11258 my $midnumlet = $wb->table('MidNumLet');
11259 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11263 # In earlier versions of the standard, instead of the above two
11264 # properties , just the following characters were used:
11265 $perl_case_ignorable += 0x0027 # APOSTROPHE
11266 + 0x00AD # SOFT HYPHEN (SHY)
11267 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11271 # The remaining perl defined tables are mostly based on Unicode TR 18,
11272 # "Annex C: Compatibility Properties". All of these have two versions,
11273 # one whose name generally begins with Posix that is posix-compliant, and
11274 # one that matches Unicode characters beyond the Posix, ASCII range
11276 my $Alpha = $perl->add_match_table('Alpha');
11278 # Alphabetic was not present in early releases
11279 my $Alphabetic = property_ref('Alphabetic');
11280 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11281 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11285 # For early releases, we don't get it exactly right. The below
11286 # includes more than it should, which in 5.2 terms is: L + Nl +
11287 # Other_Alphabetic. Other_Alphabetic contains many characters from
11288 # Mn and Mc. It's better to match more than we should, than less than
11290 $Alpha->initialize($gc->table('Letter')
11292 + $gc->table('Mc'));
11293 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
11294 $Alpha->add_description('Alphabetic');
11296 $perl->add_match_table("PosixAlpha",
11297 Description => "[A-Za-z]",
11298 Initialize => $Alpha & $ASCII,
11301 my $Alnum = $perl->add_match_table('Alnum',
11302 Description => 'Alphabetic and (Decimal) Numeric',
11303 Initialize => $Alpha + $gc->table('Decimal_Number'),
11305 $perl->add_match_table("PosixAlnum",
11306 Description => "[A-Za-z0-9]",
11307 Initialize => $Alnum & $ASCII,
11310 my $Word = $perl->add_match_table('Word',
11311 Description => '\w, including beyond ASCII',
11312 Initialize => $Alnum + $gc->table('Mark'),
11314 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11315 $Word += $Pc if defined $Pc;
11317 # This is a Perl extension, so the name doesn't begin with Posix.
11318 $perl->add_match_table('PerlWord',
11319 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11320 Initialize => $Word & $ASCII,
11323 my $Blank = $perl->add_match_table('Blank',
11324 Description => '\h, Horizontal white space',
11326 # 200B is Zero Width Space which is for line
11327 # break control, and was listed as
11328 # Space_Separator in early releases
11329 Initialize => $gc->table('Space_Separator')
11333 $Blank->add_alias('HorizSpace'); # Another name for it.
11334 $perl->add_match_table("PosixBlank",
11335 Description => "\\t and ' '",
11336 Initialize => $Blank & $ASCII,
11339 my $VertSpace = $perl->add_match_table('VertSpace',
11340 Description => '\v',
11341 Initialize => $gc->table('Line_Separator')
11342 + $gc->table('Paragraph_Separator')
11343 + 0x000A # LINE FEED
11344 + 0x000B # VERTICAL TAB
11345 + 0x000C # FORM FEED
11346 + 0x000D # CARRIAGE RETURN
11349 # No Posix equivalent for vertical space
11351 my $Space = $perl->add_match_table('Space',
11352 Description => '\s including beyond ASCII plus vertical tab',
11353 Initialize => $Blank + $VertSpace,
11355 $perl->add_match_table("PosixSpace",
11356 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
11357 Initialize => $Space & $ASCII,
11360 # Perl's traditional space doesn't include Vertical Tab
11361 my $SpacePerl = $perl->add_match_table('SpacePerl',
11362 Description => '\s, including beyond ASCII',
11363 Initialize => $Space - 0x000B,
11365 $perl->add_match_table('PerlSpace',
11366 Description => '\s, restricted to ASCII',
11367 Initialize => $SpacePerl & $ASCII,
11370 my $Cntrl = $perl->add_match_table('Cntrl',
11371 Description => 'Control characters');
11372 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
11373 $perl->add_match_table("PosixCntrl",
11374 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
11375 Initialize => $Cntrl & $ASCII,
11378 # $controls is a temporary used to construct Graph.
11379 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11380 + $gc->table('Control'));
11381 # Cs not in release 1
11382 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11384 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11385 my $Graph = $perl->add_match_table('Graph',
11386 Description => 'Characters that are graphical',
11387 Initialize => ~ ($Space + $controls),
11389 $perl->add_match_table("PosixGraph",
11391 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
11392 Initialize => $Graph & $ASCII,
11395 $print = $perl->add_match_table('Print',
11396 Description => 'Characters that are graphical plus space characters (but no controls)',
11397 Initialize => $Blank + $Graph - $gc->table('Control'),
11399 $perl->add_match_table("PosixPrint",
11401 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
11402 Initialize => $print & $ASCII,
11405 my $Punct = $perl->add_match_table('Punct');
11406 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11408 # \p{punct} doesn't include the symbols, which posix does
11409 $perl->add_match_table('PosixPunct',
11410 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
11411 Initialize => $ASCII & ($gc->table('Punctuation')
11412 + $gc->table('Symbol')),
11415 my $Digit = $perl->add_match_table('Digit',
11416 Description => '\d, extended beyond just [0-9]');
11417 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
11418 my $PosixDigit = $perl->add_match_table("PosixDigit",
11419 Description => '[0-9]',
11420 Initialize => $Digit & $ASCII,
11423 # Hex_Digit was not present in first release
11424 my $Xdigit = $perl->add_match_table('XDigit');
11425 my $Hex = property_ref('Hex_Digit');
11426 if (defined $Hex && ! $Hex->is_empty) {
11427 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
11430 # (Have to use hex instead of e.g. '0', because could be running on an
11431 # non-ASCII machine, and we want the Unicode (ASCII) values)
11432 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11433 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11434 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
11437 my $dt = property_ref('Decomposition_Type');
11438 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11439 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11440 Perl_Extension => 1,
11441 Note => 'Union of all non-canonical decompositions',
11444 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11445 # than SD appeared, construct it ourselves, based on the first release SD
11447 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11448 my $soft_dotted = property_ref('Soft_Dotted');
11449 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11450 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11454 # This list came from 3.2 Soft_Dotted.
11455 $CanonDCIJ->initialize([ 0x0069,
11464 $CanonDCIJ = $CanonDCIJ & $Assigned;
11467 # These are used in Unicode's definition of \X
11468 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11469 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11471 my $gcb = property_ref('Grapheme_Cluster_Break');
11473 # The 'extended' grapheme cluster came in 5.1. The non-extended
11474 # definition differs too much from the traditional Perl one to use.
11475 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11477 # Note that assumes HST is defined; it came in an earlier release than
11478 # GCB. In the line below, two negatives means: yes hangul
11479 $begin += ~ property_ref('Hangul_Syllable_Type')
11480 ->table('Not_Applicable')
11481 + ~ ($gcb->table('Control')
11482 + $gcb->table('CR')
11483 + $gcb->table('LF'));
11484 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11486 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11487 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
11489 else { # Old definition, used on early releases.
11490 $extend += $gc->table('Mark')
11493 $begin += ~ $extend;
11495 # Here we may have a release that has the regular grapheme cluster
11496 # defined, or a release that doesn't have anything defined.
11497 # We set things up so the Perl core degrades gracefully, possibly with
11498 # placeholders that match nothing.
11500 if (! defined $gcb) {
11501 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11503 my $hst = property_ref('HST');
11504 if (!defined $hst) {
11505 $hst = Property->new('HST', Status => $PLACEHOLDER);
11506 $hst->add_match_table('Not_Applicable',
11507 Initialize => $Any,
11511 # On some releases, here we may not have the needed tables for the
11512 # perl core, in some releases we may.
11513 foreach my $name (qw{ L LV LVT T V prepend }) {
11514 my $table = $gcb->table($name);
11515 if (! defined $table) {
11516 $table = $gcb->add_match_table($name);
11517 push @tables_that_may_be_empty, $table->complete_name;
11520 # The HST property predates the GCB one, and has identical tables
11521 # for some of them, so use it if we can.
11522 if ($table->is_empty
11524 && defined $hst->table($name))
11526 $table += $hst->table($name);
11531 # More GCB. If we found some hangul syllables, populate a combined
11533 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11534 my $LV = $gcb->table('LV');
11535 if ($LV->is_empty) {
11536 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11538 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11539 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
11542 # Was previously constructed to contain both Name and Unicode_1_Name
11543 my @composition = ('Name', 'Unicode_1_Name');
11545 if (@named_sequences) {
11546 push @composition, 'Named_Sequence';
11547 foreach my $sequence (@named_sequences) {
11548 $perl_charname->add_anomalous_entry($sequence);
11552 my $alias_sentence = "";
11553 my $alias = property_ref('Name_Alias');
11554 if (defined $alias) {
11555 push @composition, 'Name_Alias';
11556 $alias->reset_each_range;
11557 while (my ($range) = $alias->each_range) {
11558 next if $range->value eq "";
11559 if ($range->start != $range->end) {
11560 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
11562 $perl_charname->add_duplicate($range->start, $range->value);
11564 $alias_sentence = <<END;
11565 The Name_Alias property adds duplicate code point entries with a corrected
11566 name. The original (less correct, but still valid) name will be physically
11571 if (@composition <= 2) { # Always at least 2
11572 $comment = join " and ", @composition;
11575 $comment = join ", ", @composition[0 .. scalar @composition - 2];
11576 $comment .= ", and $composition[-1]";
11579 $perl_charname->add_comment(join_lines( <<END
11580 This file is for charnames.pm. It is the union of the $comment properties.
11581 Unicode_1_Name entries are used only for otherwise nameless code
11587 # The combining class property used by Perl's normalize.pm is not located
11588 # in the normal mapping directory; create a copy for it.
11589 my $ccc = property_ref('Canonical_Combining_Class');
11590 my $perl_ccc = Property->new('Perl_ccc',
11591 Default_Map => $ccc->default_map,
11592 Full_Name => 'Perl_Canonical_Combining_Class',
11593 Internal_Only_Warning => 1,
11594 Perl_Extension => 1,
11597 Initialize => $ccc,
11598 File => 'CombiningClass',
11599 Directory => File::Spec->curdir(),
11601 $perl_ccc->set_to_output_map(1);
11602 $perl_ccc->add_comment(join_lines(<<END
11603 This mapping is for normalize.pm. It is currently identical to the Unicode
11604 Canonical_Combining_Class property.
11608 # This one match table for it is needed for calculations on output
11609 my $default = $perl_ccc->add_match_table($ccc->default_map,
11610 Initialize => $ccc->table($ccc->default_map),
11611 Status => $SUPPRESSED);
11613 # Construct the Present_In property from the Age property.
11614 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11615 my $default_map = $age->default_map;
11616 my $in = Property->new('In',
11617 Default_Map => $default_map,
11618 Full_Name => "Present_In",
11619 Internal_Only_Warning => 1,
11620 Perl_Extension => 1,
11622 Initialize => $age,
11624 $in->add_comment(join_lines(<<END
11625 This file should not be used for any purpose. The values in this file are the
11626 same as for $age, and not for what $in really means. This is because anything
11627 defined in a given release should have multiple values: that release and all
11628 higher ones. But only one value per code point can be represented in a table
11633 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
11634 # lowest numbered (earliest) come first, with the non-numeric one
11636 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11638 : ($b->name !~ /^[\d.]*$/)
11640 : $a->name <=> $b->name
11643 # The Present_In property is the cumulative age properties. The first
11644 # one hence is identical to the first age one.
11645 my $previous_in = $in->add_match_table($first_age->name);
11646 $previous_in->set_equivalent_to($first_age, Related => 1);
11648 my $description_start = "Code point's usage introduced in version ";
11649 $first_age->add_description($description_start . $first_age->name);
11651 # To construct the accumlated values, for each of the age tables
11652 # starting with the 2nd earliest, merge the earliest with it, to get
11653 # all those code points existing in the 2nd earliest. Repeat merging
11654 # the new 2nd earliest with the 3rd earliest to get all those existing
11655 # in the 3rd earliest, and so on.
11656 foreach my $current_age (@rest_ages) {
11657 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
11659 my $current_in = $in->add_match_table(
11660 $current_age->name,
11661 Initialize => $current_age + $previous_in,
11662 Description => $description_start
11663 . $current_age->name
11666 $previous_in = $current_in;
11668 # Add clarifying material for the corresponding age file. This is
11669 # in part because of the confusing and contradictory information
11670 # given in the Standard's documentation itself, as of 5.2.
11671 $current_age->add_description(
11672 "Code point's usage was introduced in version "
11673 . $current_age->name);
11674 $current_age->add_note("See also $in");
11678 # And finally the code points whose usages have yet to be decided are
11679 # the same in both properties. Note that permanently unassigned code
11680 # points actually have their usage assigned (as being permanently
11681 # unassigned), so that these tables are not the same as gc=cn.
11682 my $unassigned = $in->add_match_table($default_map);
11683 my $age_default = $age->table($default_map);
11684 $age_default->add_description(<<END
11685 Code point's usage has not been assigned in any Unicode release thus far.
11688 $unassigned->set_equivalent_to($age_default, Related => 1);
11692 # Finished creating all the perl properties. All non-internal non-string
11693 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
11694 # an underscore.) These do not get a separate entry in the pod file
11695 foreach my $table ($perl->tables) {
11696 foreach my $alias ($table->aliases) {
11697 next if $alias->name =~ /^_/;
11698 $table->add_alias('Is_' . $alias->name,
11700 Status => $alias->status,
11701 Externally_Ok => 0);
11705 # Here done with all the basic stuff. Ready to populate the information
11706 # about each character if annotating them.
11707 if ($output_names) {
11709 # See comments at its declaration
11710 $annotate_ranges = Range_Map->new;
11712 # This separates out the non-characters from the other unassigneds, so
11713 # can give different annotations for each.
11714 $unassigned_sans_noncharacters = Range_List->new(
11715 Initialize => $gc->table('Unassigned')
11716 & property_ref('Noncharacter_Code_Point')->table('N'));
11718 for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
11719 $i = populate_char_info($i); # Note sets $i so may cause skips
11726 sub add_perl_synonyms() {
11727 # A number of Unicode tables have Perl synonyms that are expressed in
11728 # the single-form, \p{name}. These are:
11729 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11730 # \p{Is_Name} as synonyms
11731 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11732 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11733 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11734 # conflict, \p{Value} and \p{Is_Value} as well
11736 # This routine generates these synonyms, warning of any unexpected
11739 # Construct the list of tables to get synonyms for. Start with all the
11740 # binary and the General_Category ones.
11741 my @tables = grep { $_->type == $BINARY } property_ref('*');
11742 push @tables, $gc->tables;
11744 # If the version of Unicode includes the Script property, add its tables
11745 if (defined property_ref('Script')) {
11746 push @tables, property_ref('Script')->tables;
11749 # The Block tables are kept separate because they are treated differently.
11750 # And the earliest versions of Unicode didn't include them, so add only if
11753 push @blocks, $block->tables if defined $block;
11755 # Here, have the lists of tables constructed. Process blocks last so that
11756 # if there are name collisions with them, blocks have lowest priority.
11757 # Should there ever be other collisions, manual intervention would be
11758 # required. See the comments at the beginning of the program for a
11759 # possible way to handle those semi-automatically.
11760 foreach my $table (@tables, @blocks) {
11762 # For non-binary properties, the synonym is just the name of the
11763 # table, like Greek, but for binary properties the synonym is the name
11764 # of the property, and means the code points in its 'Y' table.
11765 my $nominal = $table;
11766 my $nominal_property = $nominal->property;
11768 if (! $nominal->isa('Property')) {
11773 # Here is a binary property. Use the 'Y' table. Verify that is
11775 my $yes = $nominal->table('Y');
11776 unless (defined $yes) { # Must be defined, but is permissible to
11778 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
11784 foreach my $alias ($nominal->aliases) {
11786 # Attempt to create a table in the perl directory for the
11787 # candidate table, using whatever aliases in it that don't
11788 # conflict. Also add non-conflicting aliases for all these
11789 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11791 foreach my $prefix ("", 'Is_', 'In_') {
11793 # Only Block properties can have added 'In_' aliases.
11794 next if $prefix eq 'In_' and $nominal_property != $block;
11796 my $proposed_name = $prefix . $alias->name;
11798 # No Is_Is, In_In, nor combinations thereof
11799 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11800 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11802 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11804 # Get a reference to any existing table in the perl
11805 # directory with the desired name.
11806 my $pre_existing = $perl->table($proposed_name);
11808 if (! defined $pre_existing) {
11810 # No name collision, so ok to add the perl synonym.
11812 my $make_pod_entry;
11814 my $status = $actual->status;
11815 if ($nominal_property == $block) {
11817 # For block properties, the 'In' form is preferred for
11818 # external use; the pod file contains wild cards for
11819 # this and the 'Is' form so no entries for those; and
11820 # we don't want people using the name without the
11821 # 'In', so discourage that.
11822 if ($prefix eq "") {
11823 $make_pod_entry = 1;
11824 $status = $status || $DISCOURAGED;
11825 $externally_ok = 0;
11827 elsif ($prefix eq 'In_') {
11828 $make_pod_entry = 0;
11829 $status = $status || $NORMAL;
11830 $externally_ok = 1;
11833 $make_pod_entry = 0;
11834 $status = $status || $DISCOURAGED;
11835 $externally_ok = 0;
11838 elsif ($prefix ne "") {
11840 # The 'Is' prefix is handled in the pod by a wild
11841 # card, and we won't use it for an external name
11842 $make_pod_entry = 0;
11843 $status = $status || $NORMAL;
11844 $externally_ok = 0;
11848 # Here, is an empty prefix, non block. This gets its
11849 # own pod entry and can be used for an external name.
11850 $make_pod_entry = 1;
11851 $status = $status || $NORMAL;
11852 $externally_ok = 1;
11855 # Here, there isn't a perl pre-existing table with the
11856 # name. Look through the list of equivalents of this
11857 # table to see if one is a perl table.
11858 foreach my $equivalent ($actual->leader->equivalents) {
11859 next if $equivalent->property != $perl;
11861 # Here, have found a table for $perl. Add this alias
11862 # to it, and are done with this prefix.
11863 $equivalent->add_alias($proposed_name,
11864 Pod_Entry => $make_pod_entry,
11866 Externally_Ok => $externally_ok);
11867 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11871 # Here, $perl doesn't already have a table that is a
11872 # synonym for this property, add one.
11873 my $added_table = $perl->add_match_table($proposed_name,
11874 Pod_Entry => $make_pod_entry,
11876 Externally_Ok => $externally_ok);
11877 # And it will be related to the actual table, since it is
11879 $added_table->set_equivalent_to($actual, Related => 1);
11880 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11882 } # End of no pre-existing.
11884 # Here, there is a pre-existing table that has the proposed
11885 # name. We could be in trouble, but not if this is just a
11886 # synonym for another table that we have already made a child
11887 # of the pre-existing one.
11888 if ($pre_existing->is_set_equivalent_to($actual)) {
11889 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11890 $pre_existing->add_alias($proposed_name);
11894 # Here, there is a name collision, but it still could be ok if
11895 # the tables match the identical set of code points, in which
11896 # case, we can combine the names. Compare each table's code
11897 # point list to see if they are identical.
11898 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11899 if ($pre_existing->matches_identically_to($actual)) {
11901 # Here, they do match identically. Not a real conflict.
11902 # Make the perl version a child of the Unicode one, except
11903 # in the non-obvious case of where the perl name is
11904 # already a synonym of another Unicode property. (This is
11905 # excluded by the test for it being its own parent.) The
11906 # reason for this exclusion is that then the two Unicode
11907 # properties become related; and we don't really know if
11908 # they are or not. We generate documentation based on
11909 # relatedness, and this would be misleading. Code
11910 # later executed in the process will cause the tables to
11911 # be represented by a single file anyway, without making
11912 # it look in the pod like they are necessarily related.
11913 if ($pre_existing->parent == $pre_existing
11914 && ($pre_existing->property == $perl
11915 || $actual->property == $perl))
11917 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11918 $pre_existing->set_equivalent_to($actual, Related => 1);
11920 elsif (main::DEBUG && $to_trace) {
11921 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11922 trace $pre_existing->parent;
11927 # Here they didn't match identically, there is a real conflict
11928 # between our new name and a pre-existing property.
11929 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11930 $pre_existing->add_conflicting($nominal->full_name,
11934 # Don't output a warning for aliases for the block
11935 # properties (unless they start with 'In_') as it is
11936 # expected that there will be conflicts and the block
11938 if ($verbosity >= $NORMAL_VERBOSITY
11939 && ($actual->property != $block || $prefix eq 'In_'))
11941 print simple_fold(join_lines(<<END
11942 There is already an alias named $proposed_name (from " . $pre_existing . "),
11943 so not creating this alias for " . $actual
11948 # Keep track for documentation purposes.
11949 $has_In_conflicts++ if $prefix eq 'In_';
11950 $has_Is_conflicts++ if $prefix eq 'Is_';
11955 # There are some properties which have No and Yes (and N and Y) as
11956 # property values, but aren't binary, and could possibly be confused with
11957 # binary ones. So create caveats for them. There are tables that are
11958 # named 'No', and tables that are named 'N', but confusion is not likely
11959 # unless they are the same table. For example, N meaning Number or
11960 # Neutral is not likely to cause confusion, so don't add caveats to things
11962 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11963 my $yes = $property->table('Yes');
11964 if (defined $yes) {
11965 my $y = $property->table('Y');
11966 if (defined $y && $yes == $y) {
11967 foreach my $alias ($property->aliases) {
11968 $yes->add_conflicting($alias->name);
11972 my $no = $property->table('No');
11974 my $n = $property->table('N');
11975 if (defined $n && $no == $n) {
11976 foreach my $alias ($property->aliases) {
11977 $no->add_conflicting($alias->name, 'P');
11986 sub register_file_for_name($$$) {
11987 # Given info about a table and a datafile that it should be associated
11988 # with, register that assocation
11991 my $directory_ref = shift; # Array of the directory path for the file
11992 my $file = shift; # The file name in the final directory, [-1].
11993 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11995 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11997 if ($table->isa('Property')) {
11998 $table->set_file_path(@$directory_ref, $file);
11999 push @map_properties, $table
12000 if $directory_ref->[0] eq $map_directory;
12004 # Do all of the work for all equivalent tables when called with the leader
12005 # table, so skip if isn't the leader.
12006 return if $table->leader != $table;
12008 # Join all the file path components together, using slashes.
12009 my $full_filename = join('/', @$directory_ref, $file);
12011 # All go in the same subdirectory of unicore
12012 if ($directory_ref->[0] ne $matches_directory) {
12013 Carp::my_carp("Unexpected directory in "
12014 . join('/', @{$directory_ref}, $file));
12017 # For this table and all its equivalents ...
12018 foreach my $table ($table, $table->equivalents) {
12020 # Associate it with its file internally. Don't include the
12021 # $matches_directory first component
12022 $table->set_file_path(@$directory_ref, $file);
12023 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12025 my $property = $table->property;
12026 $property = ($property == $perl)
12027 ? "" # 'perl' is never explicitly stated
12028 : standardize($property->name) . '=';
12030 my $deprecated = ($table->status eq $DEPRECATED)
12031 ? $table->status_info
12034 # And for each of the table's aliases... This inner loop eventually
12035 # goes through all aliases in the UCD that we generate regex match
12037 foreach my $alias ($table->aliases) {
12038 my $name = $alias->name;
12040 # Generate an entry in either the loose or strict hashes, which
12041 # will translate the property and alias names combination into the
12042 # file where the table for them is stored.
12044 if ($alias->loose_match) {
12045 $standard = $property . standardize($alias->name);
12046 if (exists $loose_to_file_of{$standard}) {
12047 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12050 $loose_to_file_of{$standard} = $sub_filename;
12054 $standard = lc ($property . $name);
12055 if (exists $stricter_to_file_of{$standard}) {
12056 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12059 $stricter_to_file_of{$standard} = $sub_filename;
12061 # Tightly coupled with how utf8_heavy.pl works, for a
12062 # floating point number that is a whole number, get rid of
12063 # the trailing decimal point and 0's, so that utf8_heavy
12064 # will work. Also note that this assumes that such a
12065 # number is matched strictly; so if that were to change,
12066 # this would be wrong.
12067 if ((my $integer_name = $name)
12068 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12070 $stricter_to_file_of{$property . $integer_name}
12076 # Keep a list of the deprecated properties and their filenames
12078 $utf8::why_deprecated{$sub_filename} = $deprecated;
12087 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12089 my %full_dir_name_of; # Full length names of directories used.
12091 sub construct_filename($$$) {
12092 # Return a file name for a table, based on the table name, but perhaps
12093 # changed to get rid of non-portable characters in it, and to make
12094 # sure that it is unique on a file system that allows the names before
12095 # any period to be at most 8 characters (DOS). While we're at it
12096 # check and complain if there are any directory conflicts.
12098 my $name = shift; # The name to start with
12099 my $mutable = shift; # Boolean: can it be changed? If no, but
12100 # yet it must be to work properly, a warning
12102 my $directories_ref = shift; # A reference to an array containing the
12103 # path to the file, with each element one path
12104 # component. This is used because the same
12105 # name can be used in different directories.
12106 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12108 my $warn = ! defined wantarray; # If true, then if the name is
12109 # changed, a warning is issued as well.
12111 if (! defined $name) {
12112 Carp::my_carp("Undefined name in directory "
12113 . File::Spec->join(@$directories_ref)
12118 # Make sure that no directory names conflict with each other. Look at
12119 # each directory in the input file's path. If it is already in use,
12120 # assume it is correct, and is merely being re-used, but if we
12121 # truncate it to 8 characters, and find that there are two directories
12122 # that are the same for the first 8 characters, but differ after that,
12123 # then that is a problem.
12124 foreach my $directory (@$directories_ref) {
12125 my $short_dir = substr($directory, 0, 8);
12126 if (defined $full_dir_name_of{$short_dir}) {
12127 next if $full_dir_name_of{$short_dir} eq $directory;
12128 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12131 $full_dir_name_of{$short_dir} = $directory;
12135 my $path = join '/', @$directories_ref;
12136 $path .= '/' if $path;
12138 # Remove interior underscores.
12139 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12141 # Change any non-word character into an underscore, and truncate to 8.
12142 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12143 substr($filename, 8) = "" if length($filename) > 8;
12145 # Make sure the basename doesn't conflict with something we
12146 # might have already written. If we have, say,
12153 while (my $num = $base_names{$path}{lc $filename}++) {
12154 $num++; # so basenames with numbers start with '2', which
12155 # just looks more natural.
12157 # Want to append $num, but if it'll make the basename longer
12158 # than 8 characters, pre-truncate $filename so that the result
12160 my $delta = length($filename) + length($num) - 8;
12162 substr($filename, -$delta) = $num;
12167 if ($warn && ! $warned) {
12169 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12173 return $filename if $mutable;
12175 # If not changeable, must return the input name, but warn if needed to
12176 # change it beyond shortening it.
12177 if ($name ne $filename
12178 && substr($name, 0, length($filename)) ne $filename) {
12179 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12185 # The pod file contains a very large table. Many of the lines in that table
12186 # would exceed a typical output window's size, and so need to be wrapped with
12187 # a hanging indent to make them look good. The pod language is really
12188 # insufficient here. There is no general construct to do that in pod, so it
12189 # is done here by beginning each such line with a space to cause the result to
12190 # be output without formatting, and doing all the formatting here. This leads
12191 # to the result that if the eventual display window is too narrow it won't
12192 # look good, and if the window is too wide, no advantage is taken of that
12193 # extra width. A further complication is that the output may be indented by
12194 # the formatter so that there is less space than expected. What I (khw) have
12195 # done is to assume that that indent is a particular number of spaces based on
12196 # what it is in my Linux system; people can always resize their windows if
12197 # necessary, but this is obviously less than desirable, but the best that can
12199 my $automatic_pod_indent = 8;
12201 # Try to format so that uses fewest lines, but few long left column entries
12202 # slide into the right column. An experiment on 5.1 data yielded the
12203 # following percentages that didn't cut into the other side along with the
12204 # associated first-column widths
12206 # 80% not too bad except for a few blocks
12207 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12209 my $indent_info_column = 27; # 75% of lines didn't have overlap
12211 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12212 # The 3 is because of:
12213 # 1 for the leading space to tell the pod formatter to
12216 # 1 for the space between the flag and the main data
12218 sub format_pod_line ($$$;$$) {
12219 # Take a pod line and return it, formatted properly
12221 my $first_column_width = shift;
12222 my $entry = shift; # Contents of left column
12223 my $info = shift; # Contents of right column
12225 my $status = shift || ""; # Any flag
12227 my $loose_match = shift; # Boolean.
12228 $loose_match = 1 unless defined $loose_match;
12230 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12233 $flags .= $STRICTER if ! $loose_match;
12235 $flags .= $status if $status;
12237 # There is a blank in the left column to cause the pod formatter to
12238 # output the line as-is.
12239 return sprintf " %-*s%-*s %s\n",
12240 # The first * in the format is replaced by this, the -1 is
12241 # to account for the leading blank. There isn't a
12242 # hard-coded blank after this to separate the flags from
12243 # the rest of the line, so that in the unlikely event that
12244 # multiple flags are shown on the same line, they both
12245 # will get displayed at the expense of that separation,
12246 # but since they are left justified, a blank will be
12247 # inserted in the normal case.
12251 # The other * in the format is replaced by this number to
12252 # cause the first main column to right fill with blanks.
12253 # The -1 is for the guaranteed blank following it.
12254 $first_column_width - $FILLER - 1,
12259 my @zero_match_tables; # List of tables that have no matches in this release
12261 sub make_table_pod_entries($) {
12262 # This generates the entries for the pod file for a given table.
12263 # Also done at this time are any children tables. The output looks like:
12264 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12266 my $input_table = shift; # Table the entry is for
12267 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12269 # Generate parent and all its children at the same time.
12270 return if $input_table->parent != $input_table;
12272 my $property = $input_table->property;
12273 my $type = $property->type;
12274 my $full_name = $property->full_name;
12276 my $count = $input_table->count;
12277 my $string_count = clarify_number($count);
12278 my $status = $input_table->status;
12279 my $status_info = $input_table->status_info;
12281 my $entry_for_first_table; # The entry for the first table output.
12282 # Almost certainly, it is the parent.
12284 # For each related table (including itself), we will generate a pod entry
12285 # for each name each table goes by
12286 foreach my $table ($input_table, $input_table->children) {
12288 # utf8_heavy.pl cannot deal with null string property values, so don't
12290 next if $table->name eq "";
12292 # First, gather all the info that applies to this table as a whole.
12294 push @zero_match_tables, $table if $count == 0;
12296 my $table_property = $table->property;
12298 # The short name has all the underscores removed, while the full name
12299 # retains them. Later, we decide whether to output a short synonym
12300 # for the full one, we need to compare apples to apples, so we use the
12301 # short name's length including underscores.
12302 my $table_property_short_name_length;
12303 my $table_property_short_name
12304 = $table_property->short_name(\$table_property_short_name_length);
12305 my $table_property_full_name = $table_property->full_name;
12307 # Get how much savings there is in the short name over the full one
12308 # (delta will always be <= 0)
12309 my $table_property_short_delta = $table_property_short_name_length
12310 - length($table_property_full_name);
12311 my @table_description = $table->description;
12312 my @table_note = $table->note;
12314 # Generate an entry for each alias in this table.
12315 my $entry_for_first_alias; # saves the first one encountered.
12316 foreach my $alias ($table->aliases) {
12318 # Skip if not to go in pod.
12319 next unless $alias->make_pod_entry;
12321 # Start gathering all the components for the entry
12322 my $name = $alias->name;
12324 my $entry; # Holds the left column, may include extras
12325 my $entry_ref; # To refer to the left column's contents from
12326 # another entry; has no extras
12328 # First the left column of the pod entry. Tables for the $perl
12329 # property always use the single form.
12330 if ($table_property == $perl) {
12331 $entry = "\\p{$name}";
12332 $entry_ref = "\\p{$name}";
12334 else { # Compound form.
12336 # Only generate one entry for all the aliases that mean true
12337 # or false in binary properties. Append a '*' to indicate
12338 # some are missing. (The heading comment notes this.)
12339 my $wild_card_mark;
12340 if ($type == $BINARY) {
12341 next if $name ne 'N' && $name ne 'Y';
12342 $wild_card_mark = '*';
12345 $wild_card_mark = "";
12348 # Colon-space is used to give a little more space to be easier
12351 . $table_property_full_name
12352 . ": $name$wild_card_mark}";
12354 # But for the reference to this entry, which will go in the
12355 # right column, where space is at a premium, use equals
12357 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12360 # Then the right (info) column. This is stored as components of
12361 # an array for the moment, then joined into a string later. For
12362 # non-internal only properties, begin the info with the entry for
12363 # the first table we encountered (if any), as things are ordered
12364 # so that that one is the most descriptive. This leads to the
12365 # info column of an entry being a more descriptive version of the
12368 if ($name =~ /^_/) {
12370 '(For internal use by Perl, not necessarily stable)';
12372 elsif ($entry_for_first_alias) {
12373 push @info, $entry_for_first_alias;
12376 # If this entry is equivalent to another, add that to the info,
12377 # using the first such table we encountered
12378 if ($entry_for_first_table) {
12380 push @info, "(= $entry_for_first_table)";
12383 push @info, $entry_for_first_table;
12387 # If the name is a large integer, add an equivalent with an
12388 # exponent for better readability
12389 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12390 push @info, sprintf "(= %.1e)", $name
12393 my $parenthesized = "";
12394 if (! $entry_for_first_alias) {
12396 # This is the first alias for the current table. The alias
12397 # array is ordered so that this is the fullest, most
12398 # descriptive alias, so it gets the fullest info. The other
12399 # aliases are mostly merely pointers to this one, using the
12400 # information already added above.
12402 # Display any status message, but only on the parent table
12403 if ($status && ! $entry_for_first_table) {
12404 push @info, $status_info;
12407 # Put out any descriptive info
12408 if (@table_description || @table_note) {
12409 push @info, join "; ", @table_description, @table_note;
12412 # Look to see if there is a shorter name we can point people
12414 my $standard_name = standardize($name);
12416 my $proposed_short = $table->short_name;
12417 if (defined $proposed_short) {
12418 my $standard_short = standardize($proposed_short);
12420 # If the short name is shorter than the standard one, or
12421 # even it it's not, but the combination of it and its
12422 # short property name (as in \p{prop=short} ($perl doesn't
12423 # have this form)) saves at least two characters, then,
12424 # cause it to be listed as a shorter synonym.
12425 if (length $standard_short < length $standard_name
12426 || ($table_property != $perl
12427 && (length($standard_short)
12428 - length($standard_name)
12429 + $table_property_short_delta) # (<= 0)
12432 $short_name = $proposed_short;
12433 if ($table_property != $perl) {
12434 $short_name = $table_property_short_name
12437 $short_name = "\\p{$short_name}";
12441 # And if this is a compound form name, see if there is a
12442 # single form equivalent
12444 if ($table_property != $perl) {
12446 # Special case the binary N tables, so that will print
12447 # \P{single}, but use the Y table values to populate
12448 # 'single', as we haven't populated the N table.
12451 if ($type == $BINARY
12452 && $input_table == $property->table('No'))
12454 $test_table = $property->table('Yes');
12458 $test_table = $input_table;
12462 # Look for a single form amongst all the children.
12463 foreach my $table ($test_table->children) {
12464 next if $table->property != $perl;
12465 my $proposed_name = $table->short_name;
12466 next if ! defined $proposed_name;
12468 # Don't mention internal-only properties as a possible
12469 # single form synonym
12470 next if substr($proposed_name, 0, 1) eq '_';
12472 $proposed_name = "\\$p\{$proposed_name}";
12473 if (! defined $single_form
12474 || length($proposed_name) < length $single_form)
12476 $single_form = $proposed_name;
12478 # The goal here is to find a single form; not the
12479 # shortest possible one. We've already found a
12480 # short name. So, stop at the first single form
12481 # found, which is likely to be closer to the
12488 # Ouput both short and single in the same parenthesized
12489 # expression, but with only one of 'Single', 'Short' if there
12491 if ($short_name || $single_form || $table->conflicting) {
12492 $parenthesized .= '(';
12493 $parenthesized .= "Short: $short_name" if $short_name;
12494 if ($short_name && $single_form) {
12495 $parenthesized .= ', ';
12497 elsif ($single_form) {
12498 $parenthesized .= 'Single: ';
12500 $parenthesized .= $single_form if $single_form;
12505 # Warn if this property isn't the same as one that a
12506 # semi-casual user might expect. The other components of this
12507 # parenthesized structure are calculated only for the first entry
12508 # for this table, but the conflicting is deemed important enough
12509 # to go on every entry.
12510 my $conflicting = join " NOR ", $table->conflicting;
12511 if ($conflicting) {
12512 $parenthesized .= '(' if ! $parenthesized;
12513 $parenthesized .= '; ' if $parenthesized ne '(';
12514 $parenthesized .= "NOT $conflicting";
12516 $parenthesized .= ')' if $parenthesized;
12518 push @info, $parenthesized if $parenthesized;
12520 if ($table_property != $perl && $table->perl_extension) {
12521 push @info, '(Perl extension)';
12523 push @info, "($string_count)" if $output_range_counts;
12525 # Now, we have both the entry and info so add them to the
12526 # list of all the properties.
12527 push @match_properties,
12528 format_pod_line($indent_info_column,
12532 $alias->loose_match);
12534 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12535 } # End of looping through the aliases for this table.
12537 if (! $entry_for_first_table) {
12538 $entry_for_first_table = $entry_for_first_alias;
12540 } # End of looping through all the related tables
12544 sub pod_alphanumeric_sort {
12545 # Sort pod entries alphanumerically.
12547 # The first few character columns are filler, plus the '\p{'; and get rid
12548 # of all the trailing stuff, starting with the trailing '}', so as to sort
12549 # on just 'Name=Value'
12550 (my $a = lc $a) =~ s/^ .*? { //x;
12552 (my $b = lc $b) =~ s/^ .*? { //x;
12555 # Determine if the two operands are both internal only or both not.
12556 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12557 # should be the underscore that begins internal only
12558 my $a_is_internal = (substr($a, 0, 1) eq '_');
12559 my $b_is_internal = (substr($b, 0, 1) eq '_');
12561 # Sort so the internals come last in the table instead of first (which the
12562 # leading underscore would otherwise indicate).
12563 if ($a_is_internal != $b_is_internal) {
12564 return 1 if $a_is_internal;
12568 # Determine if the two operands are numeric property values or not.
12569 # A numeric property will look like xyz: 3. But the number
12570 # can begin with an optional minus sign, and may have a
12571 # fraction or rational component, like xyz: 3/2. If either
12572 # isn't numeric, use alphabetic sort.
12573 my ($a_initial, $a_number) =
12574 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12575 return $a cmp $b unless defined $a_number;
12576 my ($b_initial, $b_number) =
12577 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
12578 return $a cmp $b unless defined $b_number;
12580 # Here they are both numeric, but use alphabetic sort if the
12581 # initial parts don't match
12582 return $a cmp $b if $a_initial ne $b_initial;
12584 # Convert rationals to floating for the comparison.
12585 $a_number = eval $a_number if $a_number =~ qr{/};
12586 $b_number = eval $b_number if $b_number =~ qr{/};
12588 return $a_number <=> $b_number;
12592 # Create the .pod file. This generates the various subsections and then
12593 # combines them in one big HERE document.
12595 return unless defined $pod_directory;
12596 print "Making pod file\n" if $verbosity >= $PROGRESS;
12598 my $exception_message =
12599 '(Any exceptions are individually noted beginning with the word NOT.)';
12601 if (-e 'Blocks.txt') {
12603 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
12604 # if the global $has_In_conflicts indicates we have them.
12605 push @match_properties, format_pod_line($indent_info_column,
12608 . (($has_In_conflicts)
12609 ? " $exception_message"
12611 @block_warning = << "END";
12613 Matches in the Block property have shortcuts that begin with 'In_'. For
12614 example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward
12615 compatibility, if there is no conflict with another shortcut, these may also
12616 be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous
12617 such conflicting shortcuts. Use of these forms for Block is discouraged, and
12618 are flagged as such, not only because of the potential confusion as to what is
12619 meant, but also because a later release of Unicode may preempt the shortcut,
12620 and your program would no longer be correct. Use the 'In_' form instead to
12621 avoid this, or even more clearly, use the compound form, e.g.,
12622 \\p{blk:latin1}. See L<perlunicode/"Blocks"> for more information about this.
12625 my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12626 $text = "$exception_message $text" if $has_Is_conflicts;
12628 # And the 'Is_ line';
12629 push @match_properties, format_pod_line($indent_info_column,
12633 # Sort the properties array for output. It is sorted alphabetically
12634 # except numerically for numeric properties, and only output unique lines.
12635 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12637 my $formatted_properties = simple_fold(\@match_properties,
12639 # indent succeeding lines by two extra
12640 # which looks better
12641 $indent_info_column + 2,
12643 # shorten the line length by how much
12644 # the formatter indents, so the folded
12645 # line will fit in the space
12646 # presumably available
12647 $automatic_pod_indent);
12648 # Add column headings, indented to be a little more centered, but not
12650 $formatted_properties = format_pod_line($indent_info_column,
12654 . $formatted_properties;
12656 # Generate pod documentation lines for the tables that match nothing
12658 if (@zero_match_tables) {
12659 @zero_match_tables = uniques(@zero_match_tables);
12660 $zero_matches = join "\n\n",
12661 map { $_ = '=item \p{' . $_->complete_name . "}" }
12662 sort { $a->complete_name cmp $b->complete_name }
12663 uniques(@zero_match_tables);
12665 $zero_matches = <<END;
12667 =head2 Legal \\p{} and \\P{} constructs that match no characters
12669 Unicode has some property-value pairs that currently don't match anything.
12670 This happens generally either because they are obsolete, or for symmetry with
12671 other forms, but no language has yet been encoded that uses them. In this
12672 version of Unicode, the following match zero code points:
12683 # Generate list of properties that we don't accept, grouped by the reasons
12684 # why. This is so only put out the 'why' once, and then list all the
12685 # properties that have that reason under it.
12687 my %why_list; # The keys are the reasons; the values are lists of
12688 # properties that have the key as their reason
12690 # For each property, add it to the list that are suppressed for its reason
12691 # The sort will cause the alphabetically first properties to be added to
12692 # each list first, so each list will be sorted.
12693 foreach my $property (sort keys %why_suppressed) {
12694 push @{$why_list{$why_suppressed{$property}}}, $property;
12697 # For each reason (sorted by the first property that has that reason)...
12698 my @bad_re_properties;
12699 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12702 # Add to the output, all the properties that have that reason. Start
12703 # with an empty line.
12704 push @bad_re_properties, "\n\n";
12706 my $has_item = 0; # Flag if actually output anything.
12707 foreach my $name (@{$why_list{$why}}) {
12709 # Split compound names into $property and $table components
12710 my $property = $name;
12712 if ($property =~ / (.*) = (.*) /x) {
12717 # This release of Unicode may not have a property that is
12718 # suppressed, so don't reference a non-existent one.
12719 $property = property_ref($property);
12720 next if ! defined $property;
12722 # And since this list is only for match tables, don't list the
12723 # ones that don't have match tables.
12724 next if ! $property->to_create_match_tables;
12726 # Find any abbreviation, and turn it into a compound name if this
12727 # is a property=value pair.
12728 my $short_name = $property->name;
12729 $short_name .= '=' . $property->table($table)->name if $table;
12731 # And add the property as an item for the reason.
12732 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12736 # And add the reason under the list of properties, if such a list
12737 # actually got generated. Note that the header got added
12738 # unconditionally before. But pod ignores extra blank lines, so no
12740 push @bad_re_properties, "\n$why\n" if $has_item;
12742 } # End of looping through each reason.
12744 # Generate a list of the properties whose map table we output, from the
12745 # global @map_properties.
12746 my @map_tables_actually_output;
12747 my $info_indent = 20; # Left column is narrower than \p{} table.
12748 foreach my $property (@map_properties) {
12750 # Get the path to the file; don't output any not in the standard
12752 my @path = $property->file_path;
12753 next if $path[0] ne $map_directory;
12754 shift @path; # Remove the standard name
12756 my $file = join '/', @path; # In case is in sub directory
12757 my $info = $property->full_name;
12758 my $short_name = $property->name;
12759 if ($info ne $short_name) {
12760 $info .= " ($short_name)";
12762 foreach my $more_info ($property->description,
12764 $property->status_info)
12766 next unless $more_info;
12768 $info .= ". $more_info";
12770 push @map_tables_actually_output, format_pod_line($info_indent,
12773 $property->status);
12776 # Sort alphabetically, and fold for output
12777 @map_tables_actually_output = sort
12778 pod_alphanumeric_sort @map_tables_actually_output;
12779 @map_tables_actually_output
12780 = simple_fold(\@map_tables_actually_output,
12783 $automatic_pod_indent);
12785 # Generate a list of the formats that can appear in the map tables.
12786 my @map_table_formats;
12787 foreach my $format (sort keys %map_table_formats) {
12788 push @map_table_formats, " $format $map_table_formats{$format}\n";
12791 # Everything is ready to assemble.
12792 my @OUT = << "END";
12797 To change this file, edit $0 instead.
12803 $pod_file - Index of Unicode Version $string_version properties in Perl
12807 There are many properties in Unicode, and Perl provides access to almost all of
12808 them, as well as some additional extensions and short-cut synonyms.
12810 And just about all of the few that aren't accessible through the Perl
12811 core are accessible through the modules: Unicode::Normalize and
12812 Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12814 This document merely lists all available properties and does not attempt to
12815 explain what each property really means. There is a brief description of each
12816 Perl extension. There is some detail about Blocks, Scripts, General_Category,
12817 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12818 Unicode properties, refer to the Unicode standard. A good starting place is
12819 L<$unicode_reference_url>. More information on the Perl extensions is in
12820 L<perlrecharclass>.
12822 Note that you can define your own properties; see
12823 L<perlunicode/"User-Defined Character Properties">.
12825 =head1 Properties accessible through \\p{} and \\P{}
12827 The Perl regular expression \\p{} and \\P{} constructs give access to most of
12828 the Unicode character properties. The table below shows all these constructs,
12829 both single and compound forms.
12831 B<Compound forms> consist of two components, separated by an equals sign or a
12832 colon. The first component is the property name, and the second component is
12833 the particular value of the property to match against, for example,
12834 '\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12835 whose Script property is Greek.
12837 B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12838 their equivalent compound forms. The table shows these equivalences. (In our
12839 example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12840 There are also a few Perl-defined single forms that are not shortcuts for a
12841 compound form. One such is \\p{Word}. These are also listed in the table.
12843 In parsing these constructs, Perl always ignores Upper/lower case differences
12844 everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as
12845 '\\p{greek}'. But note that changing the case of the 'p' or 'P' before the
12846 left brace completely changes the meaning of the construct, from "match" (for
12847 '\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for
12848 improved legibility.
12850 Also, white space, hyphens, and underscores are also normally ignored
12851 everywhere between the {braces}, and hence can be freely added or removed
12852 even if the C</x> modifier hasn't been specified on the regular expression.
12853 But $a_bold_stricter at the beginning of an entry in the table below
12854 means that tighter (stricter) rules are used for that entry:
12858 =item Single form (\\p{name}) tighter rules:
12860 White space, hyphens, and underscores ARE significant
12865 =item * white space adjacent to a non-word character
12867 =item * underscores separating digits in numbers
12871 That means, for example, that you can freely add or remove white space
12872 adjacent to (but within) the braces without affecting the meaning.
12874 =item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12876 The tighter rules given above for the single form apply to everything to the
12877 right of the colon or equals; the looser rules still apply to everything to
12880 That means, for example, that you can freely add or remove white space
12881 adjacent to (but within) the braces and the colon or equal sign.
12885 Some properties are considered obsolete, but still available. There are
12886 several varieties of obsolesence:
12892 Properties marked with $a_bold_obsolete in the table are considered
12893 obsolete. At the time of this writing (Unicode version 5.2) there is no
12894 information in the Unicode standard about the implications of a property being
12899 Obsolete properties may be stabilized. This means that they are not actively
12900 maintained by Unicode, and will not be extended as new characters are added to
12901 the standard. Such properties are marked with $a_bold_stabilized in the
12902 table. At the time of this writing (Unicode version 5.2) there is no further
12903 information in the Unicode standard about the implications of a property being
12908 Obsolete properties may be deprecated. This means that their use is strongly
12909 discouraged, so much so that a warning will be issued if used, unless the
12910 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12911 statement. $A_bold_deprecated flags each such entry in the table, and
12912 the entry there for the longest, most descriptive version of the property will
12913 give the reason it is deprecated, and perhaps advice. Perl may issue such a
12914 warning, even for properties that aren't officially deprecated by Unicode,
12915 when there used to be characters or code points that were matched by them, but
12916 no longer. This is to warn you that your program may not work like it did on
12917 earlier Unicode releases.
12919 A deprecated property may be made unavailable in a future Perl version, so it
12920 is best to move away from them.
12924 Some Perl extensions are present for backwards compatibility and are
12925 discouraged from being used, but not obsolete. $A_bold_discouraged
12926 flags each such entry in the table.
12930 The table below has two columns. The left column contains the \\p{}
12931 constructs to look up, possibly preceeded by the flags mentioned above; and
12932 the right column contains information about them, like a description, or
12933 synonyms. It shows both the single and compound forms for each property that
12934 has them. If the left column is a short name for a property, the right column
12935 will give its longer, more descriptive name; and if the left column is the
12936 longest name, the right column will show any equivalent shortest name, in both
12937 single and compound forms if applicable.
12939 The right column will also caution you if a property means something different
12940 than what might normally be expected.
12942 All single forms are Perl extensions; a few compound forms are as well, and
12945 Numbers in (parentheses) indicate the total number of code points matched by
12946 the property. For emphasis, those properties that match no code points at all
12947 are listed as well in a separate section following the table.
12949 There is no description given for most non-Perl defined properties (See
12950 $unicode_reference_url for that).
12952 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12953 combinations. For example, entries like:
12955 \\p{Gc: *} \\p{General_Category: *}
12957 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12958 for the latter is also valid for the former. Similarly,
12962 means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12963 \\p{IsFoo} are also valid and all mean the same thing. And similarly,
12964 \\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here
12965 is restricted to something not beginning with an underscore.
12967 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12968 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
12969 'N*' to indicate this, and doesn't have separate entries for the other
12970 possibilities. Note that not all properties which have values 'Yes' and 'No'
12971 are binary, and they have all their values spelled out without using this wild
12972 card, and a C<NOT> clause in their description that highlights their not being
12973 binary. These also require the compound form to match them, whereas true
12974 binary properties have both single and compound forms available.
12976 Note that all non-essential underscores are removed in the display of the
12983 =item B<*> is a wild-card
12985 =item B<(\\d+)> in the info column gives the number of code points matched by
12988 =item B<$DEPRECATED> means this is deprecated.
12990 =item B<$OBSOLETE> means this is obsolete.
12992 =item B<$STABILIZED> means this is stabilized.
12994 =item B<$STRICTER> means tighter (stricter) name matching applies.
12996 =item B<$DISCOURAGED> means use of this form is discouraged.
13000 $formatted_properties
13004 =head1 Properties not accessible through \\p{} and \\P{}
13006 A few properties are accessible in Perl via various function calls only.
13008 Lowercase_Mapping lc() and lcfirst()
13009 Titlecase_Mapping ucfirst()
13010 Uppercase_Mapping uc()
13012 Case_Folding is accessible through the /i modifier in regular expressions.
13014 The Name property is accessible through the \\N{} interpolation in
13015 double-quoted strings and regular expressions, but both usages require a C<use
13016 charnames;> to be specified, which also contains related functions viacode(),
13017 vianame(), and string_vianame().
13019 =head1 Unicode regular expression properties that are NOT accepted by Perl
13021 Perl will generate an error for a few character properties in Unicode when
13022 used in a regular expression. The non-Unihan ones are listed below, with the
13023 reasons they are not accepted, perhaps with work-arounds. The short names for
13024 the properties are listed enclosed in (parentheses).
13032 An installation can choose to allow any of these to be matched by changing the
13033 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
13034 and then re-running F<$0>. (C<\%Config> is available from the Config module).
13036 =head1 Files in the I<To> directory (for serious hackers only)
13038 All Unicode properties are really mappings (in the mathematical sense) from
13039 code points to their respective values. As part of its build process,
13040 Perl constructs tables containing these mappings for all properties that it
13041 deals with. But only a few of these are written out into files.
13042 Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
13043 (%Config is available from the Config module).
13045 Those ones written are ones needed by Perl internally during execution, or for
13046 which there is some demand, and those for which there is no access through the
13047 Perl core. Generally, properties that can be used in regular expression
13048 matching do not have their map tables written, like Script. Nor are the
13049 simplistic properties that have a better, more complete version, such as
13050 Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
13052 None of the properties in the I<To> directory are currently directly
13053 accessible through the Perl core, although some may be accessed indirectly.
13054 For example, the uc() function implements the Uppercase_Mapping property and
13055 uses the F<Upper.pl> file found in this directory.
13057 The available files in the current installation, with their properties (short
13058 names in parentheses), and any flags or comments about them, are:
13060 @map_tables_actually_output
13062 An installation can choose to change which files are generated by changing the
13063 controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
13064 and then re-running F<$0>.
13066 Each of these files defines two hash entries to help reading programs decipher
13067 it. One of them looks like this:
13069 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
13071 where 'NAME' is a name to indicate the property. For backwards compatibility,
13072 this is not necessarily the property's official Unicode name. (The 'To' is
13073 also for backwards compatibility.) The hash entry gives the format of the
13074 mapping fields of the table, currently one of the following:
13078 This format applies only to the entries in the main body of the table.
13079 Entries defined in hashes or ones that are missing from the list can have a
13082 The value that the missing entries have is given by the other SwashInfo hash
13083 entry line; it looks like this:
13085 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
13087 This example line says that any Unicode code points not explicitly listed in
13088 the file have the value 'NaN' under the property indicated by NAME. If the
13089 value is the special string C<< <code point> >>, it means that the value for
13090 any missing code point is the code point itself. This happens, for example,
13091 in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
13092 character 'A', are missing because the uppercase of 'A' is itself.
13096 L<$unicode_reference_url>
13104 # And write it. The 0 means no utf8.
13105 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
13109 sub make_Heavy () {
13110 # Create and write Heavy.pl, which passes info about the tables to
13117 # This file is for the use of utf8_heavy.pl
13119 # Maps property names in loose standard form to its standard name
13120 \%utf8::loose_property_name_of = (
13123 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13124 push @heavy, <<END;
13127 # Maps property, table to file for those using stricter matching
13128 \%utf8::stricter_to_file_of = (
13130 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13131 push @heavy, <<END;
13134 # Maps property, table to file for those using loose matching
13135 \%utf8::loose_to_file_of = (
13137 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13138 push @heavy, <<END;
13141 # Maps floating point to fractional form
13142 \%utf8::nv_floating_to_rational = (
13144 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13145 push @heavy, <<END;
13148 # If a floating point number doesn't have enough digits in it to get this
13149 # close to a fraction, it isn't considered to be that fraction even if all the
13150 # digits it does have match.
13151 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
13153 # Deprecated tables to generate a warning for. The key is the file containing
13154 # the table, so as to avoid duplication, as many property names can map to the
13155 # file, but we only need one entry for all of them.
13156 \%utf8::why_deprecated = (
13159 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13160 push @heavy, <<END;
13166 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
13170 sub write_all_tables() {
13171 # Write out all the tables generated by this program to files, as well as
13172 # the supporting data structures, pod file, and .t file.
13174 my @writables; # List of tables that actually get written
13175 my %match_tables_to_write; # Used to collapse identical match tables
13176 # into one file. Each key is a hash function
13177 # result to partition tables into buckets.
13178 # Each value is an array of the tables that
13179 # fit in the bucket.
13181 # For each property ...
13182 # (sort so that if there is an immutable file name, it has precedence, so
13183 # some other property can't come in and take over its file name. If b's
13184 # file name is defined, will return 1, meaning to take it first; don't
13185 # care if both defined, as they had better be different anyway)
13187 foreach my $property (sort { defined $b->file } property_ref('*')) {
13188 my $type = $property->type;
13190 # And for each table for that property, starting with the mapping
13193 foreach my $table($property,
13195 # and all the match tables for it (if any), sorted so
13196 # the ones with the shortest associated file name come
13197 # first. The length sorting prevents problems of a
13198 # longer file taking a name that might have to be used
13199 # by a shorter one. The alphabetic sorting prevents
13200 # differences between releases
13201 sort { my $ext_a = $a->external_name;
13202 return 1 if ! defined $ext_a;
13203 my $ext_b = $b->external_name;
13204 return -1 if ! defined $ext_b;
13205 my $cmp = length $ext_a <=> length $ext_b;
13207 # Return result if lengths not equal
13208 return $cmp if $cmp;
13210 # Alphabetic if lengths equal
13211 return $ext_a cmp $ext_b
13212 } $property->tables
13216 # Here we have a table associated with a property. It could be
13217 # the map table (done first for each property), or one of the
13218 # other tables. Determine which type.
13219 my $is_property = $table->isa('Property');
13221 my $name = $table->name;
13222 my $complete_name = $table->complete_name;
13224 # See if should suppress the table if is empty, but warn if it
13225 # contains something.
13226 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
13227 keys %why_suppress_if_empty_warn_if_not;
13229 # Calculate if this table should have any code points associated
13231 my $expected_empty =
13233 # $perl should be empty, as well as properties that we just
13234 # don't do anything with
13236 && ($table == $perl
13237 || grep { $complete_name eq $_ }
13238 @unimplemented_properties
13242 # Match tables in properties we skipped populating should be
13244 || (! $is_property && ! $property->to_create_match_tables)
13246 # Tables and properties that are expected to have no code
13247 # points should be empty
13248 || $suppress_if_empty_warn_if_not
13251 # Set a boolean if this table is the complement of an empty binary
13253 my $is_complement_of_empty_binary =
13254 $type == $BINARY &&
13255 (($table == $property->table('Y')
13256 && $property->table('N')->is_empty)
13257 || ($table == $property->table('N')
13258 && $property->table('Y')->is_empty));
13261 # Some tables should match everything
13262 my $expected_full =
13264 ? # All these types of map tables will be full because
13265 # they will have been populated with defaults
13266 ($type == $ENUM || $type == $BINARY)
13268 : # A match table should match everything if its method
13270 ($table->matches_all
13272 # The complement of an empty binary table will match
13274 || $is_complement_of_empty_binary
13278 if ($table->is_empty) {
13281 if ($suppress_if_empty_warn_if_not) {
13282 $table->set_status($SUPPRESSED,
13283 $why_suppress_if_empty_warn_if_not{$complete_name});
13286 # Suppress expected empty tables.
13287 next TABLE if $expected_empty;
13289 # And setup to later output a warning for those that aren't
13290 # known to be allowed to be empty. Don't do the warning if
13291 # this table is a child of another one to avoid duplicating
13292 # the warning that should come from the parent one.
13293 if (($table == $property || $table->parent == $table)
13294 && $table->status ne $SUPPRESSED
13295 && ! grep { $complete_name =~ /^$_$/ }
13296 @tables_that_may_be_empty)
13298 push @unhandled_properties, "$table";
13301 elsif ($expected_empty) {
13303 if ($suppress_if_empty_warn_if_not) {
13304 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
13307 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
13310 my $count = $table->count;
13311 if ($expected_full) {
13312 if ($count != $MAX_UNICODE_CODEPOINTS) {
13313 Carp::my_carp("$table matches only "
13314 . clarify_number($count)
13315 . " Unicode code points but should match "
13316 . clarify_number($MAX_UNICODE_CODEPOINTS)
13318 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
13319 . "). Proceeding anyway.");
13322 # Here is expected to be full. If it is because it is the
13323 # complement of an (empty) binary table that is to be
13324 # suppressed, then suppress this one as well.
13325 if ($is_complement_of_empty_binary) {
13326 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
13327 my $opposing = $property->table($opposing_name);
13328 my $opposing_status = $opposing->status;
13329 if ($opposing_status) {
13330 $table->set_status($opposing_status,
13331 $opposing->status_info);
13335 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
13336 if ($table == $property || $table->leader == $table) {
13337 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
13341 if ($table->status eq $SUPPRESSED) {
13342 if (! $is_property) {
13343 my @children = $table->children;
13344 foreach my $child (@children) {
13345 if ($child->status ne $SUPPRESSED) {
13346 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
13353 if (! $is_property) {
13355 # Several things need to be done just once for each related
13356 # group of match tables. Do them on the parent.
13357 if ($table->parent == $table) {
13359 # Add an entry in the pod file for the table; it also does
13361 make_table_pod_entries($table) if defined $pod_directory;
13363 # See if the the table matches identical code points with
13364 # something that has already been output. In that case,
13365 # no need to have two files with the same code points in
13366 # them. We use the table's hash() method to store these
13367 # in buckets, so that it is quite likely that if two
13368 # tables are in the same bucket they will be identical, so
13369 # don't have to compare tables frequently. The tables
13370 # have to have the same status to share a file, so add
13371 # this to the bucket hash. (The reason for this latter is
13372 # that Heavy.pl associates a status with a file.)
13373 my $hash = $table->hash . ';' . $table->status;
13375 # Look at each table that is in the same bucket as this
13377 foreach my $comparison (@{$match_tables_to_write{$hash}})
13379 if ($table->matches_identically_to($comparison)) {
13380 $table->set_equivalent_to($comparison,
13386 # Here, not equivalent, add this table to the bucket.
13387 push @{$match_tables_to_write{$hash}}, $table;
13392 # Here is the property itself.
13393 # Don't write out or make references to the $perl property
13394 next if $table == $perl;
13396 if ($type != $STRING) {
13398 # There is a mapping stored of the various synonyms to the
13399 # standardized name of the property for utf8_heavy.pl.
13400 # Also, the pod file contains entries of the form:
13401 # \p{alias: *} \p{full: *}
13402 # rather than show every possible combination of things.
13404 my @property_aliases = $property->aliases;
13406 # The full name of this property is stored by convention
13407 # first in the alias array
13408 my $full_property_name =
13409 '\p{' . $property_aliases[0]->name . ': *}';
13410 my $standard_property_name = standardize($table->name);
13412 # For each synonym ...
13413 for my $i (0 .. @property_aliases - 1) {
13414 my $alias = $property_aliases[$i];
13415 my $alias_name = $alias->name;
13416 my $alias_standard = standardize($alias_name);
13418 # Set the mapping for utf8_heavy of the alias to the
13420 if (exists ($loose_property_name_of{$alias_standard}))
13422 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
13425 $loose_property_name_of{$alias_standard}
13426 = $standard_property_name;
13429 # Now for the pod entry for this alias. Skip if not
13430 # outputting a pod; skip the first one, which is the
13431 # full name so won't have an entry like: '\p{full: *}
13432 # \p{full: *}', and skip if don't want an entry for
13435 || ! defined $pod_directory
13436 || ! $alias->make_pod_entry;
13438 my $rhs = $full_property_name;
13439 if ($property != $perl && $table->perl_extension) {
13440 $rhs .= ' (Perl extension)';
13442 push @match_properties,
13443 format_pod_line($indent_info_column,
13444 '\p{' . $alias->name . ': *}',
13448 } # End of non-string-like property code
13451 # Don't output a mapping file if not desired.
13452 next if ! $property->to_output_map;
13455 # Here, we know we want to write out the table, but don't do it
13456 # yet because there may be other tables that come along and will
13457 # want to share the file, and the file's comments will change to
13458 # mention them. So save for later.
13459 push @writables, $table;
13461 } # End of looping through the property and all its tables.
13462 } # End of looping through all properties.
13464 # Now have all the tables that will have files written for them. Do it.
13465 foreach my $table (@writables) {
13468 my $property = $table->property;
13469 my $is_property = ($table == $property);
13470 if (! $is_property) {
13472 # Match tables for the property go in lib/$subdirectory, which is
13473 # the property's name. Don't use the standard file name for this,
13474 # as may get an unfamiliar alias
13475 @directory = ($matches_directory, $property->external_name);
13479 @directory = $table->directory;
13480 $filename = $table->file;
13483 # Use specified filename if avaliable, or default to property's
13484 # shortest name. We need an 8.3 safe filename (which means "an 8
13485 # safe" filename, since after the dot is only 'pl', which is < 3)
13486 # The 2nd parameter is if the filename shouldn't be changed, and
13487 # it shouldn't iff there is a hard-coded name for this table.
13488 $filename = construct_filename(
13489 $filename || $table->external_name,
13490 ! $filename, # mutable if no filename
13493 register_file_for_name($table, \@directory, $filename);
13495 # Only need to write one file when shared by more than one
13497 next if ! $is_property && $table->leader != $table;
13499 # Construct a nice comment to add to the file
13500 $table->set_final_comment;
13506 # Write out the pod file
13512 make_property_test_script() if $make_test_script;
13516 my @white_space_separators = ( # This used only for making the test script.
13523 sub generate_separator($) {
13524 # This used only for making the test script. It generates the colon or
13525 # equal separator between the property and property value, with random
13526 # white space surrounding the separator
13530 return "" if $lhs eq ""; # No separator if there's only one (the r) side
13532 # Choose space before and after randomly
13533 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13534 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
13536 # And return the whole complex, half the time using a colon, half the
13538 return $spaces_before
13539 . (rand() < 0.5) ? '=' : ':'
13543 sub generate_tests($$$$$) {
13544 # This used only for making the test script. It generates test cases that
13545 # are expected to compile successfully in perl. Note that the lhs and
13546 # rhs are assumed to already be as randomized as the caller wants.
13548 my $lhs = shift; # The property: what's to the left of the colon
13549 # or equals separator
13550 my $rhs = shift; # The property value; what's to the right
13551 my $valid_code = shift; # A code point that's known to be in the
13552 # table given by lhs=rhs; undef if table is
13554 my $invalid_code = shift; # A code point known to not be in the table;
13555 # undef if the table is all code points
13556 my $warning = shift;
13558 # Get the colon or equal
13559 my $separator = generate_separator($lhs);
13561 # The whole 'property=value'
13562 my $name = "$lhs$separator$rhs";
13565 # Create a complete set of tests, with complements.
13566 if (defined $valid_code) {
13567 push @output, <<"EOC"
13568 Expect(1, $valid_code, '\\p{$name}', $warning);
13569 Expect(0, $valid_code, '\\p{^$name}', $warning);
13570 Expect(0, $valid_code, '\\P{$name}', $warning);
13571 Expect(1, $valid_code, '\\P{^$name}', $warning);
13574 if (defined $invalid_code) {
13575 push @output, <<"EOC"
13576 Expect(0, $invalid_code, '\\p{$name}', $warning);
13577 Expect(1, $invalid_code, '\\p{^$name}', $warning);
13578 Expect(1, $invalid_code, '\\P{$name}', $warning);
13579 Expect(0, $invalid_code, '\\P{^$name}', $warning);
13585 sub generate_error($$$) {
13586 # This used only for making the test script. It generates test cases that
13587 # are expected to not only not match, but to be syntax or similar errors
13589 my $lhs = shift; # The property: what's to the left of the
13590 # colon or equals separator
13591 my $rhs = shift; # The property value; what's to the right
13592 my $already_in_error = shift; # Boolean; if true it's known that the
13593 # unmodified lhs and rhs will cause an error.
13594 # This routine should not force another one
13595 # Get the colon or equal
13596 my $separator = generate_separator($lhs);
13598 # Since this is an error only, don't bother to randomly decide whether to
13599 # put the error on the left or right side; and assume that the rhs is
13600 # loosely matched, again for convenience rather than rigor.
13601 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13603 my $property = $lhs . $separator . $rhs;
13606 Error('\\p{$property}');
13607 Error('\\P{$property}');
13611 # These are used only for making the test script
13612 # XXX Maybe should also have a bad strict seps, which includes underscore.
13614 my @good_loose_seps = (
13621 my @bad_loose_seps = (
13626 sub randomize_stricter_name {
13627 # This used only for making the test script. Take the input name and
13628 # return a randomized, but valid version of it under the stricter matching
13632 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13634 # If the name looks like a number (integer, floating, or rational), do
13636 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13639 my $separator = $3;
13641 # If there isn't a sign, part of the time add a plus
13642 # Note: Not testing having any denominator having a minus sign
13644 $sign = '+' if rand() <= .3;
13647 # And add 0 or more leading zeros.
13648 $name = $sign . ('0' x int rand(10)) . $number;
13650 if (defined $separator) {
13651 my $extra_zeros = '0' x int rand(10);
13653 if ($separator eq '.') {
13655 # Similarly, add 0 or more trailing zeros after a decimal
13657 $name .= $extra_zeros;
13661 # Or, leading zeros before the denominator
13662 $name =~ s,/,/$extra_zeros,;
13667 # For legibility of the test, only change the case of whole sections at a
13668 # time. To do this, first split into sections. The split returns the
13671 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13672 trace $section if main::DEBUG && $to_trace;
13674 if (length $section > 1 && $section !~ /\D/) {
13676 # If the section is a sequence of digits, about half the time
13677 # randomly add underscores between some of them.
13680 # Figure out how many underscores to add. max is 1 less than
13681 # the number of digits. (But add 1 at the end to make sure
13682 # result isn't 0, and compensate earlier by subtracting 2
13684 my $num_underscores = int rand(length($section) - 2) + 1;
13686 # And add them evenly throughout, for convenience, not rigor
13688 my $spacing = (length($section) - 1)/ $num_underscores;
13689 my $temp = $section;
13691 for my $i (1 .. $num_underscores) {
13692 $section .= substr($temp, 0, $spacing, "") . '_';
13696 push @sections, $section;
13700 # Here not a sequence of digits. Change the case of the section
13702 my $switch = int rand(4);
13703 if ($switch == 0) {
13704 push @sections, uc $section;
13706 elsif ($switch == 1) {
13707 push @sections, lc $section;
13709 elsif ($switch == 2) {
13710 push @sections, ucfirst $section;
13713 push @sections, $section;
13717 trace "returning", join "", @sections if main::DEBUG && $to_trace;
13718 return join "", @sections;
13721 sub randomize_loose_name($;$) {
13722 # This used only for making the test script
13725 my $want_error = shift; # if true, make an error
13726 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13728 $name = randomize_stricter_name($name);
13731 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13732 for my $part (split /[-\s_]+/, $name) {
13734 if ($want_error and rand() < 0.3) {
13735 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13739 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13742 push @parts, $part;
13744 my $new = join("", @parts);
13745 trace "$name => $new" if main::DEBUG && $to_trace;
13748 if (rand() >= 0.5) {
13749 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13752 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
13758 # Used to make sure don't generate duplicate test cases.
13759 my %test_generated;
13761 sub make_property_test_script() {
13762 # This used only for making the test script
13763 # this written directly -- it's huge.
13765 print "Making test script\n" if $verbosity >= $PROGRESS;
13767 # This uses randomness to test different possibilities without testing all
13768 # possibilities. To ensure repeatability, set the seed to 0. But if
13769 # tests are added, it will perturb all later ones in the .t file
13772 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13774 # Keep going down an order of magnitude
13775 # until find that adding this quantity to
13776 # 1 remains 1; but put an upper limit on
13777 # this so in case this algorithm doesn't
13778 # work properly on some platform, that we
13779 # won't loop forever.
13781 my $min_floating_slop = 1;
13782 while (1+ $min_floating_slop != 1
13785 my $next = $min_floating_slop / 10;
13786 last if $next == 0; # If underflows,
13788 $min_floating_slop = $next;
13791 # It doesn't matter whether the elements of this array contain single lines
13792 # or multiple lines. main::write doesn't count the lines.
13795 foreach my $property (property_ref('*')) {
13796 foreach my $table ($property->tables) {
13798 # Find code points that match, and don't match this table.
13799 my $valid = $table->get_valid_code_point;
13800 my $invalid = $table->get_invalid_code_point;
13801 my $warning = ($table->status eq $DEPRECATED)
13805 # Test each possible combination of the property's aliases with
13806 # the table's. If this gets to be too many, could do what is done
13807 # in the set_final_comment() for Tables
13808 my @table_aliases = $table->aliases;
13809 my @property_aliases = $table->property->aliases;
13810 my $max = max(scalar @table_aliases, scalar @property_aliases);
13811 for my $j (0 .. $max - 1) {
13813 # The current alias for property is the next one on the list,
13814 # or if beyond the end, start over. Similarly for table
13816 = $property_aliases[$j % @property_aliases]->name;
13818 $property_name = "" if $table->property == $perl;
13819 my $table_alias = $table_aliases[$j % @table_aliases];
13820 my $table_name = $table_alias->name;
13821 my $loose_match = $table_alias->loose_match;
13823 # If the table doesn't have a file, any test for it is
13824 # already guaranteed to be in error
13825 my $already_error = ! $table->file_path;
13827 # Generate error cases for this alias.
13828 push @output, generate_error($property_name,
13832 # If the table is guaranteed to always generate an error,
13833 # quit now without generating success cases.
13834 next if $already_error;
13836 # Now for the success cases.
13838 if ($loose_match) {
13840 # For loose matching, create an extra test case for the
13842 my $standard = standardize($table_name);
13844 # $test_name should be a unique combination for each test
13845 # case; used just to avoid duplicate tests
13846 my $test_name = "$property_name=$standard";
13848 # Don't output duplicate test cases.
13849 if (! exists $test_generated{$test_name}) {
13850 $test_generated{$test_name} = 1;
13851 push @output, generate_tests($property_name,
13858 $random = randomize_loose_name($table_name)
13860 else { # Stricter match
13861 $random = randomize_stricter_name($table_name);
13864 # Now for the main test case for this alias.
13865 my $test_name = "$property_name=$random";
13866 if (! exists $test_generated{$test_name}) {
13867 $test_generated{$test_name} = 1;
13868 push @output, generate_tests($property_name,
13875 # If the name is a rational number, add tests for the
13876 # floating point equivalent.
13877 if ($table_name =~ qr{/}) {
13879 # Calculate the float, and find just the fraction.
13880 my $float = eval $table_name;
13881 my ($whole, $fraction)
13882 = $float =~ / (.*) \. (.*) /x;
13884 # Starting with one digit after the decimal point,
13885 # create a test for each possible precision (number of
13886 # digits past the decimal point) until well beyond the
13887 # native number found on this machine. (If we started
13888 # with 0 digits, it would be an integer, which could
13889 # well match an unrelated table)
13891 for my $i (1 .. $min_floating_slop + 3) {
13892 my $table_name = sprintf("%.*f", $i, $float);
13893 if ($i < $MIN_FRACTION_LENGTH) {
13895 # If the test case has fewer digits than the
13896 # minimum acceptable precision, it shouldn't
13897 # succeed, so we expect an error for it.
13898 # E.g., 2/3 = .7 at one decimal point, and we
13899 # shouldn't say it matches .7. We should make
13900 # it be .667 at least before agreeing that the
13901 # intent was to match 2/3. But at the
13902 # less-than- acceptable level of precision, it
13903 # might actually match an unrelated number.
13904 # So don't generate a test case if this
13905 # conflating is possible. In our example, we
13906 # don't want 2/3 matching 7/10, if there is
13907 # a 7/10 code point.
13909 (keys %nv_floating_to_rational)
13912 if abs($table_name - $existing)
13913 < $MAX_FLOATING_SLOP;
13915 push @output, generate_error($property_name,
13917 1 # 1 => already an error
13922 # Here the number of digits exceeds the
13923 # minimum we think is needed. So generate a
13924 # success test case for it.
13925 push @output, generate_tests($property_name,
13943 (map {"Test_X('$_');\n"} @backslash_X_tests),
13948 # This is a list of the input files and how to handle them. The files are
13949 # processed in their order in this list. Some reordering is possible if
13950 # desired, but the v0 files should be first, and the extracted before the
13951 # others except DAge.txt (as data in an extracted file can be over-ridden by
13952 # the non-extracted. Some other files depend on data derived from an earlier
13953 # file, like UnicodeData requires data from Jamo, and the case changing and
13954 # folding requires data from Unicode. Mostly, it safest to order by first
13955 # version releases in (except the Jamo). DAge.txt is read before the
13956 # extracted ones because of the rarely used feature $compare_versions. In the
13957 # unlikely event that there were ever an extracted file that contained the Age
13958 # property information, it would have to go in front of DAge.
13960 # The version strings allow the program to know whether to expect a file or
13961 # not, but if a file exists in the directory, it will be processed, even if it
13962 # is in a version earlier than expected, so you can copy files from a later
13963 # release into an earlier release's directory.
13964 my @input_file_objects = (
13965 Input_file->new('PropertyAliases.txt', v0,
13966 Handler => \&process_PropertyAliases,
13968 Input_file->new(undef, v0, # No file associated with this
13969 Progress_Message => 'Finishing property setup',
13970 Handler => \&finish_property_setup,
13972 Input_file->new('PropValueAliases.txt', v0,
13973 Handler => \&process_PropValueAliases,
13974 Has_Missings_Defaults => $NOT_IGNORED,
13976 Input_file->new('DAge.txt', v3.2.0,
13977 Has_Missings_Defaults => $NOT_IGNORED,
13980 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13981 Property => 'General_Category',
13983 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13984 Property => 'Canonical_Combining_Class',
13985 Has_Missings_Defaults => $NOT_IGNORED,
13987 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13988 Property => 'Numeric_Type',
13989 Has_Missings_Defaults => $NOT_IGNORED,
13991 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13992 Property => 'East_Asian_Width',
13993 Has_Missings_Defaults => $NOT_IGNORED,
13995 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13996 Property => 'Line_Break',
13997 Has_Missings_Defaults => $NOT_IGNORED,
13999 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14000 Property => 'Bidi_Class',
14001 Has_Missings_Defaults => $NOT_IGNORED,
14003 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14004 Property => 'Decomposition_Type',
14005 Has_Missings_Defaults => $NOT_IGNORED,
14007 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14008 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14009 Property => 'Numeric_Value',
14010 Each_Line_Handler => \&filter_numeric_value_line,
14011 Has_Missings_Defaults => $NOT_IGNORED,
14013 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14014 Property => 'Joining_Group',
14015 Has_Missings_Defaults => $NOT_IGNORED,
14018 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14019 Property => 'Joining_Type',
14020 Has_Missings_Defaults => $NOT_IGNORED,
14022 Input_file->new('Jamo.txt', v2.0.0,
14023 Property => 'Jamo_Short_Name',
14024 Each_Line_Handler => \&filter_jamo_line,
14026 Input_file->new('UnicodeData.txt', v1.1.5,
14027 Pre_Handler => \&setup_UnicodeData,
14029 # We clean up this file for some early versions.
14030 Each_Line_Handler => [ (($v_version lt v2.0.0 )
14032 : ($v_version eq v2.1.5)
14033 ? \&filter_v2_1_5_ucd
14036 # And the main filter
14037 \&filter_UnicodeData_line,
14039 EOF_Handler => \&EOF_UnicodeData,
14041 Input_file->new('ArabicShaping.txt', v2.0.0,
14042 Each_Line_Handler =>
14043 [ ($v_version lt 4.1.0)
14044 ? \&filter_old_style_arabic_shaping
14046 \&filter_arabic_shaping_line,
14048 Has_Missings_Defaults => $NOT_IGNORED,
14050 Input_file->new('Blocks.txt', v2.0.0,
14051 Property => 'Block',
14052 Has_Missings_Defaults => $NOT_IGNORED,
14053 Each_Line_Handler => \&filter_blocks_lines
14055 Input_file->new('PropList.txt', v2.0.0,
14056 Each_Line_Handler => (($v_version lt v3.1.0)
14057 ? \&filter_old_style_proplist
14060 Input_file->new('Unihan.txt', v2.0.0,
14061 Pre_Handler => \&setup_unihan,
14063 Each_Line_Handler => \&filter_unihan_line,
14065 Input_file->new('SpecialCasing.txt', v2.1.8,
14066 Each_Line_Handler => \&filter_special_casing_line,
14067 Pre_Handler => \&setup_special_casing,
14070 'LineBreak.txt', v3.0.0,
14071 Has_Missings_Defaults => $NOT_IGNORED,
14072 Property => 'Line_Break',
14073 # Early versions had problematic syntax
14074 Each_Line_Handler => (($v_version lt v3.1.0)
14075 ? \&filter_early_ea_lb
14078 Input_file->new('EastAsianWidth.txt', v3.0.0,
14079 Property => 'East_Asian_Width',
14080 Has_Missings_Defaults => $NOT_IGNORED,
14081 # Early versions had problematic syntax
14082 Each_Line_Handler => (($v_version lt v3.1.0)
14083 ? \&filter_early_ea_lb
14086 Input_file->new('CompositionExclusions.txt', v3.0.0,
14087 Property => 'Composition_Exclusion',
14089 Input_file->new('BidiMirroring.txt', v3.0.1,
14090 Property => 'Bidi_Mirroring_Glyph',
14092 Input_file->new("NormalizationTest.txt", v3.0.1,
14095 Input_file->new('CaseFolding.txt', v3.0.1,
14096 Pre_Handler => \&setup_case_folding,
14097 Each_Line_Handler =>
14098 [ ($v_version lt v3.1.0)
14099 ? \&filter_old_style_case_folding
14101 \&filter_case_folding_line
14103 Post_Handler => \&post_fold,
14105 Input_file->new('DCoreProperties.txt', v3.1.0,
14106 # 5.2 changed this file
14107 Has_Missings_Defaults => (($v_version ge v5.2.0)
14111 Input_file->new('Scripts.txt', v3.1.0,
14112 Property => 'Script',
14113 Has_Missings_Defaults => $NOT_IGNORED,
14115 Input_file->new('DNormalizationProps.txt', v3.1.0,
14116 Has_Missings_Defaults => $NOT_IGNORED,
14117 Each_Line_Handler => (($v_version lt v4.0.1)
14118 ? \&filter_old_style_normalization_lines
14121 Input_file->new('HangulSyllableType.txt', v4.0.0,
14122 Has_Missings_Defaults => $NOT_IGNORED,
14123 Property => 'Hangul_Syllable_Type'),
14124 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
14125 Property => 'Word_Break',
14126 Has_Missings_Defaults => $NOT_IGNORED,
14128 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
14129 Property => 'Grapheme_Cluster_Break',
14130 Has_Missings_Defaults => $NOT_IGNORED,
14132 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
14133 Handler => \&process_GCB_test,
14135 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
14138 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
14141 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
14144 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
14145 Property => 'Sentence_Break',
14146 Has_Missings_Defaults => $NOT_IGNORED,
14148 Input_file->new('NamedSequences.txt', v4.1.0,
14149 Handler => \&process_NamedSequences
14151 Input_file->new('NameAliases.txt', v5.0.0,
14152 Property => 'Name_Alias',
14154 Input_file->new("BidiTest.txt", v5.2.0,
14157 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
14159 Each_Line_Handler => \&filter_unihan_line,
14161 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
14163 Each_Line_Handler => \&filter_unihan_line,
14165 Input_file->new('UnihanIRGSources.txt', v5.2.0,
14167 Pre_Handler => \&setup_unihan,
14168 Each_Line_Handler => \&filter_unihan_line,
14170 Input_file->new('UnihanNumericValues.txt', v5.2.0,
14172 Each_Line_Handler => \&filter_unihan_line,
14174 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
14176 Each_Line_Handler => \&filter_unihan_line,
14178 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
14180 Each_Line_Handler => \&filter_unihan_line,
14182 Input_file->new('UnihanReadings.txt', v5.2.0,
14184 Each_Line_Handler => \&filter_unihan_line,
14186 Input_file->new('UnihanVariants.txt', v5.2.0,
14188 Each_Line_Handler => \&filter_unihan_line,
14192 # End of all the preliminaries.
14195 if ($compare_versions) {
14196 Carp::my_carp(<<END
14197 Warning. \$compare_versions is set. Output is not suitable for production
14202 # Put into %potential_files a list of all the files in the directory structure
14203 # that could be inputs to this program, excluding those that we should ignore.
14204 # Use absolute file names because it makes it easier across machine types.
14205 my @ignored_files_full_names = map { File::Spec->rel2abs(
14206 internal_file_to_platform($_))
14207 } keys %ignored_files;
14210 return unless /\.txt$/i; # Some platforms change the name's case
14211 my $full = lc(File::Spec->rel2abs($_));
14212 $potential_files{$full} = 1
14213 if ! grep { $full eq lc($_) } @ignored_files_full_names;
14216 }, File::Spec->curdir());
14218 my @mktables_list_output_files;
14219 my $old_start_time = 0;
14221 if (! -e $file_list) {
14222 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
14223 $write_unchanged_files = 1;
14224 } elsif ($write_unchanged_files) {
14225 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
14228 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
14230 if (! open $file_handle, "<", $file_list) {
14231 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
14237 # Read and parse mktables.lst, placing the results from the first part
14238 # into @input, and the second part into @mktables_list_output_files
14239 for my $list ( \@input, \@mktables_list_output_files ) {
14240 while (<$file_handle>) {
14241 s/^ \s+ | \s+ $//xg;
14242 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
14243 $old_start_time = $1;
14245 next if /^ \s* (?: \# .* )? $/x;
14247 my ( $file ) = split /\t/;
14248 push @$list, $file;
14250 @$list = uniques(@$list);
14254 # Look through all the input files
14255 foreach my $input (@input) {
14256 next if $input eq 'version'; # Already have checked this.
14258 # Ignore if doesn't exist. The checking about whether we care or
14259 # not is done via the Input_file object.
14260 next if ! file_exists($input);
14262 # The paths are stored with relative names, and with '/' as the
14263 # delimiter; convert to absolute on this machine
14264 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
14265 $potential_files{$full} = 1
14266 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
14270 close $file_handle;
14275 # Here wants to process all .txt files in the directory structure.
14276 # Convert them to full path names. They are stored in the platform's
14279 foreach my $object (@input_file_objects) {
14280 my $file = $object->file;
14281 next unless defined $file;
14282 push @known_files, File::Spec->rel2abs($file);
14285 my @unknown_input_files;
14286 foreach my $file (keys %potential_files) {
14287 next if grep { lc($file) eq lc($_) } @known_files;
14289 # Here, the file is unknown to us. Get relative path name
14290 $file = File::Spec->abs2rel($file);
14291 push @unknown_input_files, $file;
14293 # What will happen is we create a data structure for it, and add it to
14294 # the list of input files to process. First get the subdirectories
14296 my (undef, $directories, undef) = File::Spec->splitpath($file);
14297 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14298 my @directories = File::Spec->splitdir($directories);
14300 # If the file isn't extracted (meaning none of the directories is the
14301 # extracted one), just add it to the end of the list of inputs.
14302 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
14303 push @input_file_objects, Input_file->new($file, v0);
14307 # Here, the file is extracted. It needs to go ahead of most other
14308 # processing. Search for the first input file that isn't a
14309 # special required property (that is, find one whose first_release
14310 # is non-0), and isn't extracted. Also, the Age property file is
14311 # processed before the extracted ones, just in case
14312 # $compare_versions is set.
14313 for (my $i = 0; $i < @input_file_objects; $i++) {
14314 if ($input_file_objects[$i]->first_released ne v0
14315 && lc($input_file_objects[$i]->file) ne 'dage.txt'
14316 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
14318 splice @input_file_objects, $i, 0,
14319 Input_file->new($file, v0);
14326 if (@unknown_input_files) {
14327 print STDERR simple_fold(join_lines(<<END
14329 The following files are unknown as to how to handle. Assuming they are
14330 typical property files. You'll know by later error messages if it worked or
14333 ) . " " . join(", ", @unknown_input_files) . "\n\n");
14335 } # End of looking through directory structure for more .txt files.
14337 # Create the list of input files from the objects we have defined, plus
14339 my @input_files = 'version';
14340 foreach my $object (@input_file_objects) {
14341 my $file = $object->file;
14342 next if ! defined $file; # Not all objects have files
14343 next if $object->optional && ! -e $file;
14344 push @input_files, $file;
14347 if ( $verbosity >= $VERBOSE ) {
14348 print "Expecting ".scalar( @input_files )." input files. ",
14349 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
14352 # We set $most_recent to be the most recently changed input file, including
14353 # this program itself (done much earlier in this file)
14354 foreach my $in (@input_files) {
14355 next unless -e $in; # Keep going even if missing a file
14356 my $mod_time = (stat $in)[9];
14357 $most_recent = $mod_time if $mod_time > $most_recent;
14359 # See that the input files have distinct names, to warn someone if they
14360 # are adding a new one
14362 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
14363 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14364 my @directories = File::Spec->splitdir($directories);
14365 my $base = $file =~ s/\.txt$//;
14366 construct_filename($file, 'mutable', \@directories);
14370 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
14371 || ! scalar @mktables_list_output_files # or if no outputs known
14372 || $old_start_time < $most_recent; # or out-of-date
14374 # Now we check to see if any output files are older than youngest, if
14375 # they are, we need to continue on, otherwise we can presumably bail.
14377 foreach my $out (@mktables_list_output_files) {
14378 if ( ! file_exists($out)) {
14379 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
14383 #local $to_trace = 1 if main::DEBUG;
14384 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14385 if ( (stat $out)[9] <= $most_recent ) {
14386 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
14387 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
14394 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
14397 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
14399 # Ready to do the major processing. First create the perl pseudo-property.
14400 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
14402 # Process each input file
14403 foreach my $file (@input_file_objects) {
14407 # Finish the table generation.
14409 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
14412 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
14415 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
14416 add_perl_synonyms();
14418 print "Writing tables\n" if $verbosity >= $PROGRESS;
14419 write_all_tables();
14421 # Write mktables.lst
14422 if ( $file_list and $make_list ) {
14424 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14425 foreach my $file (@input_files, @files_actually_output) {
14426 my (undef, $directories, $file) = File::Spec->splitpath($file);
14427 my @directories = File::Spec->splitdir($directories);
14428 $file = join '/', @directories, $file;
14432 if (! open $ofh,">",$file_list) {
14433 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
14437 my $localtime = localtime $start_time;
14438 print $ofh <<"END";
14440 # $file_list -- File list for $0.
14442 # Autogenerated starting on $start_time ($localtime)
14444 # - First section is input files
14445 # ($0 itself is not listed but is automatically considered an input)
14446 # - Section seperator is /^=+\$/
14447 # - Second section is a list of output files.
14448 # - Lines matching /^\\s*#/ are treated as comments
14449 # which along with blank lines are ignored.
14455 print $ofh "$_\n" for sort(@input_files);
14456 print $ofh "\n=================================\n# Output files:\n\n";
14457 print $ofh "$_\n" for sort @files_actually_output;
14458 print $ofh "\n# ",scalar(@input_files)," input files\n",
14459 "# ",scalar(@files_actually_output)+1," output files\n\n",
14462 or Carp::my_carp("Failed to close $ofh: $!");
14464 print "Filelist has ",scalar(@input_files)," input files and ",
14465 scalar(@files_actually_output)+1," output files\n"
14466 if $verbosity >= $VERBOSE;
14470 # Output these warnings unless -q explicitly specified.
14471 if ($verbosity >= $NORMAL_VERBOSITY) {
14472 if (@unhandled_properties) {
14473 print "\nProperties and tables that unexpectedly have no code points\n";
14474 foreach my $property (sort @unhandled_properties) {
14475 print $property, "\n";
14479 if (%potential_files) {
14480 print "\nInput files that are not considered:\n";
14481 foreach my $file (sort keys %potential_files) {
14482 print File::Spec->abs2rel($file), "\n";
14485 print "\nAll done\n" if $verbosity >= $VERBOSE;
14489 # TRAILING CODE IS USED BY make_property_test_script()
14495 # If run outside the normal test suite on an ASCII platform, you can
14496 # just create a latin1_to_native() function that just returns its
14497 # inputs, because that's the only function used from test.pl
14500 # Test qr/\X/ and the \p{} regular expression constructs. This file is
14501 # constructed by mktables from the tables it generates, so if mktables is
14502 # buggy, this won't necessarily catch those bugs. Tests are generated for all
14503 # feasible properties; a few aren't currently feasible; see
14504 # is_code_point_usable() in mktables for details.
14506 # Standard test packages are not used because this manipulates SIG_WARN. It
14507 # exits 0 if every non-skipped test succeeded; -1 if any failed.
14513 my $expected = shift;
14516 my $warning_type = shift; # Type of warning message, like 'deprecated'
14518 my $line = (caller)[2];
14519 $ord = ord(latin1_to_native(chr($ord)));
14521 # Convert the code point to hex form
14522 my $string = sprintf "\"\\x{%04X}\"", $ord;
14526 # The first time through, use all warnings. If the input should generate
14527 # a warning, add another time through with them turned off
14528 push @tests, "no warnings '$warning_type';" if $warning_type;
14530 foreach my $no_warnings (@tests) {
14532 # Store any warning messages instead of outputting them
14533 local $SIG{__WARN__} = $SIG{__WARN__};
14534 my $warning_message;
14535 $SIG{__WARN__} = sub { $warning_message = $_[0] };
14539 # A string eval is needed because of the 'no warnings'.
14540 # Assumes no parens in the regular expression
14541 my $result = eval "$no_warnings
14542 my \$RegObj = qr($regex);
14543 $string =~ \$RegObj ? 1 : 0";
14544 if (not defined $result) {
14545 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
14548 elsif ($result ^ $expected) {
14549 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
14552 elsif ($warning_message) {
14553 if (! $warning_type || ($warning_type && $no_warnings)) {
14554 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14558 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14561 elsif ($warning_type && ! $no_warnings) {
14562 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14566 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14575 if (eval { 'x' =~ qr/$regex/; 1 }) {
14577 my $line = (caller)[2];
14578 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
14581 my $line = (caller)[2];
14582 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14587 # GCBTest.txt character that separates grapheme clusters
14588 my $breakable_utf8 = my $breakable = chr(0xF7);
14589 utf8::upgrade($breakable_utf8);
14591 # GCBTest.txt character that indicates that the adjoining code points are part
14592 # of the same grapheme cluster
14593 my $nobreak_utf8 = my $nobreak = chr(0xD7);
14594 utf8::upgrade($nobreak_utf8);
14597 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
14598 # Each such line is a sequence of code points given by their hex numbers,
14599 # separated by the two characters defined just before this subroutine that
14600 # indicate that either there can or cannot be a break between the adjacent
14601 # code points. If there isn't a break, that means the sequence forms an
14602 # extended grapheme cluster, which means that \X should match the whole
14603 # thing. If there is a break, \X should stop there. This is all
14604 # converted by this routine into a match:
14605 # $string =~ /(\X)/,
14606 # Each \X should match the next cluster; and that is what is checked.
14608 my $template = shift;
14610 my $line = (caller)[2];
14612 # The line contains characters above the ASCII range, but in Latin1. It
14613 # may or may not be in utf8, and if it is, it may or may not know it. So,
14614 # convert these characters to 8 bits. If knows is in utf8, simply
14616 if (utf8::is_utf8($template)) {
14617 utf8::downgrade($template);
14620 # Otherwise, if it is in utf8, but doesn't know it, the next lines
14621 # convert the two problematic characters to their 8-bit equivalents.
14622 # If it isn't in utf8, they don't harm anything.
14624 $template =~ s/$nobreak_utf8/$nobreak/g;
14625 $template =~ s/$breakable_utf8/$breakable/g;
14628 # Get rid of the leading and trailing breakables
14629 $template =~ s/^ \s* $breakable \s* //x;
14630 $template =~ s/ \s* $breakable \s* $ //x;
14632 # And no-breaks become just a space.
14633 $template =~ s/ \s* $nobreak \s* / /xg;
14635 # Split the input into segments that are breakable between them.
14636 my @segments = split /\s*$breakable\s*/, $template;
14639 my $display_string = "";
14641 my @should_display;
14643 # Convert the code point sequence in each segment into a Perl string of
14645 foreach my $segment (@segments) {
14646 my @code_points = split /\s+/, $segment;
14647 my $this_string = "";
14648 my $this_display = "";
14649 foreach my $code_point (@code_points) {
14650 $this_string .= latin1_to_native(chr(hex $code_point));
14651 $this_display .= "\\x{$code_point}";
14654 # The next cluster should match the string in this segment.
14655 push @should_match, $this_string;
14656 push @should_display, $this_display;
14657 $string .= $this_string;
14658 $display_string .= $this_display;
14661 # If a string can be represented in both non-ut8 and utf8, test both cases
14663 for my $to_upgrade (0 .. 1) {
14667 # If already in utf8, would just be a repeat
14668 next UPGRADE if utf8::is_utf8($string);
14670 utf8::upgrade($string);
14673 # Finally, do the \X match.
14674 my @matches = $string =~ /(\X)/g;
14676 # Look through each matched cluster to verify that it matches what we
14678 my $min = (@matches < @should_match) ? @matches : @should_match;
14679 for my $i (0 .. $min - 1) {
14681 if ($matches[$i] eq $should_match[$i]) {
14682 print "ok $Tests - ";
14684 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14686 print "And \\X #", $i + 1,
14688 print " correctly matched $should_display[$i]; line $line\n";
14690 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14691 unpack("U*", $matches[$i]));
14692 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14694 " should have matched $should_display[$i]",
14695 " but instead matched $matches[$i]",
14696 ". Abandoning rest of line $line\n";
14701 # And the number of matches should equal the number of expected matches.
14703 if (@matches == @should_match) {
14704 print "ok $Tests - Nothing was left over; line $line\n";
14706 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14714 print "1..$Tests\n";
14715 exit($Fails ? -1 : 0);
14718 Error('\p{Script=InGreek}'); # Bug #69018
14719 Test_X("1100 $nobreak 1161"); # Bug #70940
14720 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
14721 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
14722 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726