#!/usr/bin/perl -w

# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
# Any files created or read by this program should be listed in 'mktables.lst'
# Use -makelist to regenerate it.

# Needs 'no overloading' to run faster on miniperl.  Code commented out at the
# subroutine objaddr can be used instead to work as far back (untested) as
# 5.8: needs pack "U".  But almost all occurrences of objaddr have been
# removed in favor of using 'no overloading'.  You also would have to go
# through and replace occurrences like:
#       my $addr = do { no overloading; pack 'J', $self; }
# with
#       my $addr = main::objaddr $self;
# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
# that instituted the change to main::objaddr, and subsequent commits that
# changed 0+$self to pack 'J', $self.)

my $start_time;
BEGIN { # Get the time the script started running; do it at compilation to
        # get it as close as possible
    $start_time= time;
}

require 5.010_001;
use strict;
use warnings;
use Carp;
use Config;
use File::Find;
use File::Path;
use File::Spec;
use Text::Tabs;

# Compile every pattern in this file with /aa (ASCII-safe) semantics, so
# \d, \w, \s etc. match only ASCII regardless of the data being processed.
use re "/aa";

sub DEBUG () { 0 }  # Set to 0 for production; 1 for development

# True when the perl being built was itself compiled with -DDEBUGGING.
my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;

##########################################################################
#
# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
# from the Unicode database files (lib/unicore/.../*.txt).  It also generates
# a pod file and .t files, depending on option parameters.
# # The structure of this file is: # First these introductory comments; then # code needed for everywhere, such as debugging stuff; then # code to handle input parameters; then # data structures likely to be of external interest (some of which depend on # the input parameters, so follows them; then # more data structures and subroutine and package (class) definitions; then # the small actual loop to process the input files and finish up; then # a __DATA__ section, for the .t tests # # This program works on all releases of Unicode so far. The outputs have been # scrutinized most intently for release 5.1. The others have been checked for # somewhat more than just sanity. It can handle all non-provisional Unicode # character properties in those releases. # # This program is mostly about Unicode character (or code point) properties. # A property describes some attribute or quality of a code point, like if it # is lowercase or not, its name, what version of Unicode it was first defined # in, or what its uppercase equivalent is. Unicode deals with these disparate # possibilities by making all properties into mappings from each code point # into some corresponding value. In the case of it being lowercase or not, # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each # property maps each Unicode code point to a single value, called a "property # value". (Some more recently defined properties, map a code point to a set # of values.) # # When using a property in a regular expression, what is desired isn't the # mapping of the code point to its property's value, but the reverse (or the # mathematical "inverse relation"): starting with the property value, "Does a # code point map to it?" These are written in a "compound" form: # \p{property=value}, e.g., \p{category=punctuation}. 
# This program generates
# files containing the lists of code points that map to each such regular
# expression property value, one file per list.
#
# There is also a single form shortcut that Perl adds for many of the commonly
# used properties.  This happens for all binary properties, plus script,
# general_category, and block properties.
#
# Thus the outputs of this program are files.  There are map files, mostly in
# the 'To' directory; and there are list files for use in regular expression
# matching, all in subdirectories of the 'lib' directory, with each
# subdirectory being named for the property that the lists in it are for.
# Bookkeeping, test, and documentation files are also generated.

my $matches_directory = 'lib';  # Where match (\p{}) files go.
my $map_directory = 'To';       # Where map files go.

# DATA STRUCTURES
#
# The major data structures of this program are Property, of course, but also
# Table.  There are two kinds of tables, very similar to each other.
# "Match_Table" is the data structure giving the list of code points that have
# a particular property value, mentioned above.  There is also a "Map_Table"
# data structure which gives the property's mapping from code point to value.
# There are two structures because the match tables need to be combined in
# various ways, such as constructing unions, intersections, complements, etc.,
# and the map ones don't.  And there would be problems, perhaps subtle, if
# a map table were inadvertently operated on in some of those ways.
# The use of separate classes with operations defined on one but not the other
# prevents accidentally confusing the two.
#
# At the heart of each table's data structure is a "Range_List", which is just
# an ordered list of "Ranges", plus ancillary information, and methods to
# operate on them.  A Range is a compact way to store property information.
# Each range has a starting code point, an ending code point, and a value that # is meant to apply to all the code points between the two end points, # inclusive. For a map table, this value is the property value for those # code points. Two such ranges could be written like this: # 0x41 .. 0x5A, 'Upper', # 0x61 .. 0x7A, 'Lower' # # Each range also has a type used as a convenience to classify the values. # Most ranges in this program will be Type 0, or normal, but there are some # ranges that have a non-zero type. These are used only in map tables, and # are for mappings that don't fit into the normal scheme of things. Mappings # that require a hash entry to communicate with utf8.c are one example; # another example is mappings for charnames.pm to use which indicate a name # that is algorithmically determinable from its code point (and the reverse). # These are used to significantly compact these tables, instead of listing # each one of the tens of thousands individually. # # In a match table, the value of a range is irrelevant (and hence the type as # well, which will always be 0), and arbitrarily set to the null string. # Using the example above, there would be two match tables for those two # entries, one named Upper would contain the 0x41..0x5A range, and the other # named Lower would contain 0x61..0x7A. # # Actually, there are two types of range lists, "Range_Map" is the one # associated with map tables, and "Range_List" with match tables. # Again, this is so that methods can be defined on one and not the others so # as to prevent operating on them in incorrect ways. # # Eventually, most tables are written out to files to be read by utf8_heavy.pl # in the perl core. All tables could in theory be written, but some are # suppressed because there is no current practical use for them. It is easy # to change which get written by changing various lists that are near the top # of the actual code in this file. 
The table data structures contain enough # ancillary information to allow them to be treated as separate entities for # writing, such as the path to each one's file. There is a heading in each # map table that gives the format of its entries, and what the map is for all # the code points missing from it. (This allows tables to be more compact.) # # The Property data structure contains one or more tables. All properties # contain a map table (except the $perl property which is a # pseudo-property containing only match tables), and any properties that # are usable in regular expression matches also contain various matching # tables, one for each value the property can have. A binary property can # have two values, True and False (or Y and N, which are preferred by Unicode # terminology). Thus each of these properties will have a map table that # takes every code point and maps it to Y or N (but having ranges cuts the # number of entries in that table way down), and two match tables, one # which has a list of all the code points that map to Y, and one for all the # code points that map to N. (For each binary property, a third table is also # generated for the pseudo Perl property. It contains the identical code # points as the Y table, but can be written in regular expressions, not in the # compound form, but in a "single" form like \p{IsUppercase}.) Many # properties are binary, but some properties have several possible values, # some have many, and properties like Name have a different value for every # named code point. Those will not, unless the controlling lists are changed, # have their match tables written out. But all the ones which can be used in # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally # a property would have either its map table or its match tables written but # not both. Again, what gets written is controlled by lists which can easily # be changed. 
# Starting in 5.14, advantage was taken of this, and all the map
# tables needed to reconstruct the Unicode db are now written out, while
# suppressing the Unicode .txt files that contain the data.  Our tables are
# much more compact than the .txt files, so a significant space savings was
# achieved.  Also, tables are not written out that are trivially derivable
# from tables that do get written.  So, there typically is no file containing
# the code points not matched by a binary property (the table for \P{} versus
# lowercase \p{}), since you just need to invert the True table to get the
# False table.
#
# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
# how many match tables there are and the content of the maps.  This 'Type' is
# different than a range 'Type', so don't get confused by the two concepts
# having the same name.
#
# For information about the Unicode properties, see Unicode's UAX44 document:

my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';

# As stated earlier, this program will work on any release of Unicode so far.
# Most obvious problems in earlier data have NOT been corrected except when
# necessary to make Perl or this program work reasonably, and to keep out
# potential security issues.  For example, no folding information was given in
# early releases, so this program substitutes lower case instead, just so that
# a regular expression with the /i option will do something that actually
# gives the right results in many cases.  There are also a couple other
# corrections for version 1.1.5, commented at the point they are made.  As an
# example of corrections that weren't made (but could be) is this statement
# from DerivedAge.txt: "The supplementary private use code points and the
# non-character code points were assigned in version 2.0, but not specifically
# listed in the UCD until versions 3.0 and 3.1 respectively."
(To be precise # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is # further down in these introductory comments. # # This program works on all non-provisional properties as of the current # Unicode release, though the files for some are suppressed for various # reasons. You can change which are output by changing lists in this program. # # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's # loose matchings rules (from Unicode TR18): # # The recommended names for UCD properties and property values are in # PropertyAliases.txt [Prop] and PropertyValueAliases.txt # [PropValue]. There are both abbreviated names and longer, more # descriptive names. It is strongly recommended that both names be # recognized, and that loose matching of property names be used, # whereby the case distinctions, whitespace, hyphens, and underbar # are ignored. # # The program still allows Fuzzy to override its determination of if loose # matching should be used, but it isn't currently used, as it is no longer # needed; the calculations it makes are good enough. # # SUMMARY OF HOW IT WORKS: # # Process arguments # # A list is constructed containing each input file that is to be processed # # Each file on the list is processed in a loop, using the associated handler # code for each: # The PropertyAliases.txt and PropValueAliases.txt files are processed # first. These files name the properties and property values. # Objects are created of all the property and property value names # that the rest of the input should expect, including all synonyms. # The other input files give mappings from properties to property # values. That is, they list code points and say what the mapping # is under the given property. Some files give the mappings for # just one property; and some for many. This program goes through # each file and populates the properties and their map tables from # them. 
Some properties are listed in more than one file, and # Unicode has set up a precedence as to which has priority if there # is a conflict. Thus the order of processing matters, and this # program handles the conflict possibility by processing the # overriding input files last, so that if necessary they replace # earlier values. # After this is all done, the program creates the property mappings not # furnished by Unicode, but derivable from what it does give. # The tables of code points that match each property value in each # property that is accessible by regular expressions are created. # The Perl-defined properties are created and populated. Many of these # require data determined from the earlier steps # Any Perl-defined synonyms are created, and name clashes between Perl # and Unicode are reconciled and warned about. # All the properties are written to files # Any other files are written, and final warnings issued. # # For clarity, a number of operators have been overloaded to work on tables: # ~ means invert (take all characters not in the set). The more # conventional '!' is not used because of the possibility of confusing # it with the actual boolean operation. # + means union # - means subtraction # & means intersection # The precedence of these is the order listed. Parentheses should be # copiously used. These are not a general scheme. The operations aren't # defined for a number of things, deliberately, to avoid getting into trouble. # Operations are done on references and affect the underlying structures, so # that the copy constructors for them have been overloaded to not return a new # clone, but the input object itself. # # The bool operator is deliberately not overloaded to avoid confusion with # "should it mean if the object merely exists, or also is non-empty?". # # WHY CERTAIN DESIGN DECISIONS WERE MADE # # This program needs to be able to run under miniperl. 
Therefore, it uses a # minimum of other modules, and hence implements some things itself that could # be gotten from CPAN # # This program uses inputs published by the Unicode Consortium. These can # change incompatibly between releases without the Perl maintainers realizing # it. Therefore this program is now designed to try to flag these. It looks # at the directories where the inputs are, and flags any unrecognized files. # It keeps track of all the properties in the files it handles, and flags any # that it doesn't know how to handle. It also flags any input lines that # don't match the expected syntax, among other checks. # # It is also designed so if a new input file matches one of the known # templates, one hopefully just needs to add it to a list to have it # processed. # # As mentioned earlier, some properties are given in more than one file. In # particular, the files in the extracted directory are supposedly just # reformattings of the others. But they contain information not easily # derivable from the other files, including results for Unihan, which this # program doesn't ordinarily look at, and for unassigned code points. They # also have historically had errors or been incomplete. In an attempt to # create the best possible data, this program thus processes them first to # glean information missing from the other files; then processes those other # files to override any errors in the extracted ones. Much of the design was # driven by this need to store things and then possibly override them. # # It tries to keep fatal errors to a minimum, to generate something usable for # testing purposes. It always looks for files that could be inputs, and will # warn about any that it doesn't know how to handle (the -q option suppresses # the warning). # # Why is there more than one type of range? # This simplified things. There are some very specialized code points that # have to be handled specially for output, such as Hangul syllable names. 
# By creating a range type (done late in the development process), it # allowed this to be stored with the range, and overridden by other input. # Originally these were stored in another data structure, and it became a # mess trying to decide if a second file that was for the same property was # overriding the earlier one or not. # # Why are there two kinds of tables, match and map? # (And there is a base class shared by the two as well.) As stated above, # they actually are for different things. Development proceeded much more # smoothly when I (khw) realized the distinction. Map tables are used to # give the property value for every code point (actually every code point # that doesn't map to a default value). Match tables are used for regular # expression matches, and are essentially the inverse mapping. Separating # the two allows more specialized methods, and error checks so that one # can't just take the intersection of two map tables, for example, as that # is nonsensical. # # What about 'fate' and 'status'. The concept of a table's fate was created # late when it became clear that something more was needed. The difference # between this and 'status' is unclean, and could be improved if someone # wanted to spend the effort. # # DEBUGGING # # This program is written so it will run under miniperl. Occasionally changes # will cause an error where the backtrace doesn't work well under miniperl. # To diagnose the problem, you can instead run it under regular perl, if you # have one compiled. # # There is a good trace facility. To enable it, first sub DEBUG must be set # to return true. Then a line like # # local $to_trace = 1 if main::DEBUG; # # can be added to enable tracing in its lexical scope (plus dynamic) or until # you insert another line: # # local $to_trace = 0 if main::DEBUG; # # To actually trace, use a line like "trace $a, @b, %c, ...; # # Some of the more complex subroutines already have trace statements in them. 
# Permanent trace statements should be like: # # trace ... if main::DEBUG && $to_trace; # # If there is just one or a few files that you're debugging, you can easily # cause most everything else to be skipped. Change the line # # my $debug_skip = 0; # # to 1, and every file whose object is in @input_file_objects and doesn't have # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping # Jamo.txt or UnicodeData.txt will likely cause fatal errors. # # To compare the output tables, it may be useful to specify the -annotate # flag. This causes the tables to expand so there is one entry for each # non-algorithmically named code point giving, currently its name, and its # graphic representation if printable (and you have a font that knows about # it). This makes it easier to see what the particular code points are in # each output table. The tables are usable, but because they don't have # ranges (for the most part), a Perl using them will run slower. Non-named # code points are annotated with a description of their status, and contiguous # ones with the same description will be output as a range rather than # individually. Algorithmically named characters are also output as ranges, # except when there are just a few contiguous ones. # # FUTURE ISSUES # # The program would break if Unicode were to change its names so that # interior white space, underscores, or dashes differences were significant # within property and property value names. # # It might be easier to use the xml versions of the UCD if this program ever # would need heavy revision, and the ability to handle old versions was not # required. # # There is the potential for name collisions, in that Perl has chosen names # that Unicode could decide it also likes. There have been such collisions in # the past, with mostly Perl deciding to adopt the Unicode definition of the # name. 
# However in the 5.2 Unicode beta testing, there were a number of such
# collisions, which were withdrawn before the final release, because of
# Perl's and others' protests.  These all involved new properties which began
# with 'Is'.  Based on the protests, Unicode is unlikely to try that again.
# Also, many of the Perl-defined synonyms, like Any, Word, etc, are listed in
# a Unicode document, so they are unlikely to be used by Unicode for another
# purpose.  However, they might try something beginning with 'In', or use any
# of the other Perl-defined properties.  This program will warn you of name
# collisions, and refuse to generate tables with them, but manual
# intervention will be required in this event.  One scheme that could be
# implemented, if necessary, would be to have this program generate another
# file, or add a field to mktables.lst that gives the date of first
# definition of a property.  Each new release of Unicode would use that file
# as a basis for the next iteration.  And the Perl synonym addition code
# could sort based on the age of the property, so older properties get
# priority, and newer ones that clash would be refused; hence existing code
# would not be impacted, and some other synonym would have to be used for the
# new property.  This is ugly, and manual intervention would certainly be
# easier to do in the short run; let's hope it never comes to this.
#
# A NOTE ON UNIHAN
#
# This program can generate tables from the Unihan database.  But it doesn't
# by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
# version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
# database was split into 8 different files, all beginning with the letters
# 'Unihan'.  This program will read those file(s) if present, but it needs to
# know which of the many properties in the file(s) should have tables created
# for them.
It will create tables for any properties listed in # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the # @cjk_properties array and the @cjk_property_values array. Thus, if a # property you want is not in those files of the release you are building # against, you must add it to those two arrays. Starting in 4.0, the # Unicode_Radical_Stroke was listed in those files, so if the Unihan database # is present in the directory, a table will be generated for that property. # In 5.2, several more properties were added. For your convenience, the two # arrays are initialized with all the 6.0 listed properties that are also in # earlier releases. But these are commented out. You can just uncomment the # ones you want, or use them as a template for adding entries for other # properties. # # You may need to adjust the entries to suit your purposes. setup_unihan(), # and filter_unihan_line() are the functions where this is done. This program # already does some adjusting to make the lines look more like the rest of the # Unicode DB; You can see what that is in filter_unihan_line() # # There is a bug in the 3.2 data file in which some values for the # kPrimaryNumeric property have commas and an unexpected comment. A filter # could be added for these; or for a particular installation, the Unihan.txt # file could be edited to fix them. # # HOW TO ADD A FILE TO BE PROCESSED # # A new file from Unicode needs to have an object constructed for it in # @input_file_objects, probably at the end or at the end of the extracted # ones. The program should warn you if its name will clash with others on # restrictive file systems, like DOS. If so, figure out a better name, and # add lines to the README.perl file giving that. If the file is a character # property, it should be in the format that Unicode has implicitly # standardized for such files for the more recently introduced ones. 
# If so, the Input_file constructor for @input_file_objects can just be the # file name and release it first appeared in. If not, then it should be # possible to construct an each_line_handler() to massage the line into the # standardized form. # # For non-character properties, more code will be needed. You can look at # the existing entries for clues. # # UNICODE VERSIONS NOTES # # The Unicode UCD has had a number of errors in it over the versions. And # these remain, by policy, in the standard for that version. Therefore it is # risky to correct them, because code may be expecting the error. So this # program doesn't generally make changes, unless the error breaks the Perl # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value # for U+1105, which causes real problems for the algorithms for Jamo # calculations, so it is changed here. # # But it isn't so clear cut as to what to do about concepts that are # introduced in a later release; should they extend back to earlier releases # where the concept just didn't exist? It was easier to do this than to not, # so that's what was done. For example, the default value for code points not # in the files for various properties was probably undefined until changed by # some version. No_Block for blocks is such an example. This program will # assign No_Block even in Unicode versions that didn't have it. This has the # benefit that code being written doesn't have to special case earlier # versions; and the detriment that it doesn't match the Standard precisely for # the affected versions. # # Here are some observations about some of the issues in early versions: # # Prior to version 3.0, there were 3 character decompositions. These are not # handled by Unicode::Normalize, nor will it compile when presented a version # that has them. However, you can trivially get it to compile by simply # ignoring those decompositions, by changing the croak to a carp. 
At the time # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads # # croak("Weird Canonical Decomposition of U+$h"); # # Simply change to a carp. It will compile, but will not know about any three # character decomposition. # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out # that the reason is that the CJK block starting at 4E00 was removed from # PropList, and was not put back in until 3.1.0. The Perl extension (the # single property name \p{alpha}) has the correct values. But the compound # form is simply not generated until 3.1, as it can be argued that prior to # this release, this was not an official property. The comments for # filter_old_style_proplist() give more details. # # Unicode introduced the synonym Space for White_Space in 4.1. Perl has # always had a \p{Space}. In release 3.2 only, they are not synonymous. The # reason is that 3.2 introduced U+205F=medium math space, which was not # classed as white space, but Perl figured out that it should have been. 4.0 # reclassified it correctly. # # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB # became 202, and ATBL was left with no code points, as all the ones that # mapped to 202 stayed mapped to 202. Thus if your program used the numeric # name for the class, it would not have been affected, but if it used the # mnemonic, it would have been. # # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code # points which eventually came to have this script property value, instead # mapped to "Unknown". But in the next release all these code points were # moved to \p{sc=common} instead. # # The default for missing code points for BidiClass is complicated. Starting # in 3.1.1, the derived file DBidiClass.txt handles this, but this program # tries to do the best it can for earlier releases. 
# It is done in
# process_PropertyAliases()
#
# In version 2.1.2, the entry in UnicodeData.txt:
#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
# should instead be
#   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
# Without this change, there are casing problems for this character.
#
##############################################################################

my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
                        # and errors

# Target maximum width for output lines -- presumably used when wrapping
# generated text; confirm against the code that consumes it.
my $MAX_LINE_WIDTH = 78;

# Debugging aid to skip most files so as to not be distracted by them when
# concentrating on the ones being debugged.  Add
# non_skip => 1,
# to the constructor for those files you want processed when you set this.
# Files with a first version number of 0 are special: they are always
# processed regardless of the state of this flag.  Generally, Jamo.txt and
# UnicodeData.txt must not be skipped if you want this program to not die
# before normal completion.
my $debug_skip = 0;

# Normally these are suppressed.
my $write_Unicode_deprecated_tables = 0;

# Set to 1 to enable tracing.
our $to_trace = 0;

{ # Closure for trace: debugging aid
    my $print_caller = 1;       # ?
# Include calling subroutine name
    my $main_with_colon = 'main::';
    my $main_colon_length = length($main_with_colon);

    # trace: debugging aid.  Prints its arguments to STDERR on one line,
    # prefixed by the line number and subroutine name of the nearest caller
    # that is not trace itself (found by walking up the call stack).  ARRAY
    # and HASH references are expanded via simple_dumper() (defined elsewhere
    # in this file); other references are stringified.  Does nothing unless
    # the global $to_trace flag is set.  Returns nothing.
    sub trace {
        return unless $to_trace;        # Do nothing if global flag not set

        my @input = @_;

        local $DB::trace = 0;
        $DB::trace = 0;                 # Quiet 'used only once' message

        my $line_number;

        # Loop looking up the stack to get the first non-trace caller
        my $caller_line;
        my $caller_name;
        my $i = 0;
        do {
            # The line number reported is the one from the frame below the
            # caller frame currently being examined.
            $line_number = $caller_line;
            (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
            $caller = $main_with_colon unless defined $caller;
            $caller_name = $caller;

            # get rid of pkg
            $caller_name =~ s/.*:://;
            if (substr($caller_name, 0, $main_colon_length)
                eq $main_with_colon)
            {
                $caller_name = substr($caller_name, $main_colon_length);
            }

        } until ($caller_name ne 'trace');

        # If the stack was empty, we were called from the top level
        $caller_name = 'main' if ($caller_name eq ""
                                  || $caller_name eq 'trace');

        # Assemble the output, space-separating the pieces unless a space is
        # already present at the junction.
        my $output = "";
        foreach my $string (@input) {
            #print STDERR __LINE__, ": ", join ", ", @input, "\n";
            if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
                $output .= simple_dumper($string);
            }
            else {
                $string = "$string" if ref $string;
                $string = $UNDEF unless defined $string;
                chomp $string;
                $string = '""' if $string eq "";
                $output .= " " if $output ne ""
                                && $string ne ""
                                && substr($output, -1, 1) ne " "
                                && substr($string, 0, 1) ne " ";
                $output .= $string;
            }
        }

        print STDERR sprintf "%4d: ", $line_number if defined $line_number;
        print STDERR "$caller_name: " if $print_caller;
        print STDERR $output, "\n";
        return;
    }
}

# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
# by the code points introduced in the later version.  Change the 0 to a
# string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
# code points introduced in that release and earlier will be used; later ones
# are thrown away.
# You use the version number of the earliest one you want to
# compare; then run this program on directory structures containing each
# release, and compare the outputs.  These outputs will therefore include only
# the code points common to both releases, and you can see the changes caused
# just by the underlying release semantic changes.  For versions earlier than
# 3.2, you must copy a version of DAge.txt into the directory.
my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
my $compare_versions = DEBUG
                       && $string_compare_versions
                       && pack "C*", split /\./, $string_compare_versions;

# uniques: returns its argument list with duplicates removed, preserving the
# original order (the first occurrence of each value is kept).
sub uniques {
    # Returns non-duplicated input values.  From "Perl Best Practices:
    # Encapsulated Cleverness".  p. 455 in first edition.

    my %seen;

    # Arguably this breaks encapsulation, if the goal is to permit multiple
    # distinct objects to stringify to the same value, and be interchangeable.
    # However, for this program, no two objects stringify identically, and all
    # lists passed to this function are either objects or strings.  So this
    # doesn't affect correctness, but it does give a couple of percent speedup.
    no overloading;

    return grep { ! $seen{$_}++ } @_;
}

# Normalize our own path for any messages that include it.
$0 = File::Spec->canonpath($0);

# Defaults for the command-line options processed below.
my $make_test_script = 0;      # ? Should we output a test script
my $make_norm_test_script = 0; # ? Should we output a normalization test script
my $write_unchanged_files = 0; # ? Should we update the output files even if
                               #        we don't think they have changed
my $use_directory = "";        # ? Should we chdir somewhere.
my $pod_directory;             # input directory to store the pod file.
my $pod_file = 'perluniprops';
my $t_path;                    # Path to the .t test file
my $file_list = 'mktables.lst'; # File to store input and output file names.
                               # This is used to speed up the build, by not
                               # executing the main body of the program if
                               # nothing on the list has changed since the
                               # previous build
my $make_list = 1;             # ? Should we write $file_list.
Set to always # make a list so that when the pumpking is # preparing a release, s/he won't have to do # special things my $glob_list = 0; # ? Should we try to include unknown .txt files # in the input. my $output_range_counts = $debugging_build; # ? Should we include the number # of code points in ranges in # the output my $annotate = 0; # ? Should character names be in the output # Verbosity levels; 0 is quiet my $NORMAL_VERBOSITY = 1; my $PROGRESS = 2; my $VERBOSE = 3; my $verbosity = $NORMAL_VERBOSITY; # Process arguments while (@ARGV) { my $arg = shift @ARGV; if ($arg eq '-v') { $verbosity = $VERBOSE; } elsif ($arg eq '-p') { $verbosity = $PROGRESS; $| = 1; # Flush buffers as we go. } elsif ($arg eq '-q') { $verbosity = 0; } elsif ($arg eq '-w') { $write_unchanged_files = 1; # update the files even if havent changed } elsif ($arg eq '-check') { my $this = shift @ARGV; my $ok = shift @ARGV; if ($this ne $ok) { print "Skipping as check params are not the same.\n"; exit(0); } } elsif ($arg eq '-P' && defined ($pod_directory = shift)) { -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; } elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) { $make_test_script = 1; } elsif ($arg eq '-makenormtest') { $make_norm_test_script = 1; } elsif ($arg eq '-makelist') { $make_list = 1; } elsif ($arg eq '-C' && defined ($use_directory = shift)) { -d $use_directory or croak "Unknown directory '$use_directory'"; } elsif ($arg eq '-L') { # Existence not tested until have chdir'd $file_list = shift; } elsif ($arg eq '-globlist') { $glob_list = 1; } elsif ($arg eq '-c') { $output_range_counts = ! $output_range_counts } elsif ($arg eq '-annotate') { $annotate = 1; $debugging_build = 1; $output_range_counts = 1; } else { my $with_c = 'with'; $with_c .= 'out' if $output_range_counts; # Complements the state croak <file_name_is_absolute($pod_directory)) { $pod_directory = File::Spec->rel2abs($pod_directory); } if ($t_path && ! 
File::Spec->file_name_is_absolute($t_path)) { $t_path = File::Spec->rel2abs($t_path); } chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { $pod_directory = File::Spec->abs2rel($pod_directory); } if ($t_path && File::Spec->file_name_is_absolute($t_path)) { $t_path = File::Spec->abs2rel($t_path); } } # Get Unicode version into regular and v-string. This is done now because # various tables below get populated based on it. These tables are populated # here to be near the top of the file, and so easily seeable by those needing # to modify things. open my $VERSION, "<", "version" or croak "$0: can't open required file 'version': $!\n"; my $string_version = <$VERSION>; close $VERSION; chomp $string_version; my $v_version = pack "C*", split /\./, $string_version; # v string # The following are the complete names of properties with property values that # are known to not match any code points in some versions of Unicode, but that # may change in the future so they should be matchable, hence an empty file is # generated for them. my @tables_that_may_be_empty = ( 'Joining_Type=Left_Joining', ); push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' if $v_version ge v4.1.0; push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' if $v_version ge v6.0.0; push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' if $v_version ge v6.1.0; # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. This makes generation of # documentation easier. my %why_suppressed; # No file generated for these. # Files aren't generated for empty extraneous properties. This is arguable. 
# Extraneous properties generally come about because a property is no longer # used in a newer version of Unicode. If we generated a file without code # points, programs that used to work on that property will still execute # without errors. It just won't ever match (or will always match, with \P{}). # This means that the logic is now likely wrong. I (khw) think its better to # find this out by getting an error message. Just move them to the table # above to change this behavior my %why_suppress_if_empty_warn_if_not = ( # It is the only property that has ever officially been removed from the # Standard. The database never contained any code points for it. 'Special_Case_Condition' => 'Obsolete', # Apparently never official, but there were code points in some versions of # old-style PropList.txt 'Non_Break' => 'Obsolete', ); # These would normally go in the warn table just above, but they were changed # a long time before this program was written, so warnings about them are # moot. if ($v_version gt v3.2.0) { push @tables_that_may_be_empty, 'Canonical_Combining_Class=Attached_Below_Left' } # These are listed in the Property aliases file in 6.0, but Unihan is ignored # unless explicitly added. if ($v_version ge v5.2.0) { my $unihan = 'Unihan; remove from list if using Unihan'; foreach my $table (qw ( kAccountingNumeric kOtherNumeric kPrimaryNumeric kCompatibilityVariant kIICore kIRG_GSource kIRG_HSource kIRG_JSource kIRG_KPSource kIRG_MSource kIRG_KSource kIRG_TSource kIRG_USource kIRG_VSource kRSUnicode )) { $why_suppress_if_empty_warn_if_not{$table} = $unihan; } } # Enum values for to_output_map() method in the Map_Table package. my $EXTERNAL_MAP = 1; my $INTERNAL_MAP = 2; my $OUTPUT_ADJUSTED = 3; # To override computed values for writing the map tables for these properties. 
# The default for enum map tables is to write them out, so that the Unicode # .txt files can be removed, but all the data to compute any property value # for any code point is available in a more compact form. my %global_to_output_map = ( # Needed by UCD.pm, but don't want to publicize that it exists, so won't # get stuck supporting it if things change. Since it is a STRING # property, it normally would be listed in the pod, but INTERNAL_MAP # suppresses that. Unicode_1_Name => $INTERNAL_MAP, Present_In => 0, # Suppress, as easily computed from Age Block => 0, # Suppress, as Blocks.txt is retained. # Suppress, as mapping can be found instead from the # Perl_Decomposition_Mapping file Decomposition_Type => 0, ); # Properties that this program ignores. my @unimplemented_properties; # With this release, it is automatically handled if the Unihan db is # downloaded push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0; # There are several types of obsolete properties defined by Unicode. These # must be hand-edited for every new Unicode release. my %why_deprecated; # Generates a deprecated warning message if used. my %why_stabilized; # Documentation only my %why_obsolete; # Documentation only { # Closure my $simple = 'Perl uses the more complete version of this property'; my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; my $other_properties = 'other properties'; my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. 
For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; %why_deprecated = ( 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 'Jamo_Short_Name' => $contributory, 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 'Other_Alphabetic' => $contributory, 'Other_Default_Ignorable_Code_Point' => $contributory, 'Other_Grapheme_Extend' => $contributory, 'Other_ID_Continue' => $contributory, 'Other_ID_Start' => $contributory, 'Other_Lowercase' => $contributory, 'Other_Math' => $contributory, 'Other_Uppercase' => $contributory, 'Expands_On_NFC' => $why_no_expand, 'Expands_On_NFD' => $why_no_expand, 'Expands_On_NFKC' => $why_no_expand, 'Expands_On_NFKD' => $why_no_expand, ); %why_suppressed = ( # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which # contains the same information, but without the algorithmically # determinable Hangul syllables'. This file is not published, so it's # existence is not noted in the comment. 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()', 'Indic_Matra_Category' => "Provisional", 'Indic_Syllabic_Category' => "Provisional", # Don't suppress ISO_Comment, as otherwise special handling is needed # to differentiate between it and gc=c, which can be written as 'isc', # which is the same characters as ISO_Comment's short name. 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()", 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()", 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 'Simple_Titlecase_Mapping' => "$simple. 
Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful', ); foreach my $property ( # The following are suppressed because they were made contributory # or deprecated by Unicode before Perl ever thought about # supporting them. 'Jamo_Short_Name', 'Grapheme_Link', 'Expands_On_NFC', 'Expands_On_NFD', 'Expands_On_NFKC', 'Expands_On_NFKD', # The following are suppressed because they have been marked # as deprecated for a sufficient amount of time 'Other_Alphabetic', 'Other_Default_Ignorable_Code_Point', 'Other_Grapheme_Extend', 'Other_ID_Continue', 'Other_ID_Start', 'Other_Lowercase', 'Other_Math', 'Other_Uppercase', ) { $why_suppressed{$property} = $why_deprecated{$property}; } # Customize the message for all the 'Other_' properties foreach my $property (keys %why_deprecated) { next if (my $main_property = $property) !~ s/^Other_//; $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; } } if ($write_Unicode_deprecated_tables) { foreach my $property (keys %why_suppressed) { delete $why_suppressed{$property} if $property =~ / ^ Other | Grapheme /x; } } if ($v_version ge 4.0.0) { $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; if ($v_version ge 6.0.0) { $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14'; } } if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; if ($v_version ge 6.0.0) { $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; } } # Probably obsolete forever if ($v_version ge v4.1.0) { 
$why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; } if ($v_version ge v6.0.0) { $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"'; $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"'; } # This program can create files for enumerated-like properties, such as # 'Numeric_Type'. This file would be the same format as for a string # property, with a mapping from code point to its value, so you could look up, # for example, the script a code point is in. But no one so far wants this # mapping, or they have found another way to get it since this is a new # feature. So no file is generated except if it is in this list. my @output_mapped_properties = split "\n", < ## @missing: 0000..10FFFF; cjkIICore; ## @missing: 0000..10FFFF; cjkIRG_GSource; ## @missing: 0000..10FFFF; cjkIRG_HSource; ## @missing: 0000..10FFFF; cjkIRG_JSource; ## @missing: 0000..10FFFF; cjkIRG_KPSource; ## @missing: 0000..10FFFF; cjkIRG_KSource; ## @missing: 0000..10FFFF; cjkIRG_TSource; ## @missing: 0000..10FFFF; cjkIRG_USource; ## @missing: 0000..10FFFF; cjkIRG_VSource; ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN ## @missing: 0000..10FFFF; cjkRSUnicode; END # The input files don't list every code point. Those not listed are to be # defaulted to some value. Below are hard-coded what those values are for # non-binary properties as of 5.1. Starting in 5.0, there are # machine-parsable comment lines in the files the give the defaults; so this # list shouldn't have to be extended. The claim is that all missing entries # for binary properties will default to 'N'. 
# Unicode tried to change that in
# 5.2, but the beta period produced enough protest that they backed off.
#
# The defaults for the fields that appear in UnicodeData.txt in this hash must
# be in the form that it expects.  The others may be synonyms.
my $CODE_POINT = '';    # Sentinel map value; presumably means "maps to the
                        # code point itself" -- NOTE(review): confirm against
                        # the map-handling code, which is outside this chunk.

# Default property values for code points not listed in the input files, for
# each non-binary property (binary properties default to 'N').  Keys are
# property names; values are the default in the form the parser expects.
my %default_mapping = (
    Age => "Unassigned",
    # Bidi_Class => Complicated; set in code
    Bidi_Mirroring_Glyph => "",
    Block => 'No_Block',
    Canonical_Combining_Class => 0,
    Case_Folding => $CODE_POINT,
    Decomposition_Mapping => $CODE_POINT,
    Decomposition_Type => 'None',
    East_Asian_Width => "Neutral",
    FC_NFKC_Closure => $CODE_POINT,
    General_Category => 'Cn',
    Grapheme_Cluster_Break => 'Other',
    Hangul_Syllable_Type => 'NA',
    ISO_Comment => "",
    Jamo_Short_Name => "",
    Joining_Group => "No_Joining_Group",
    # Joining_Type => Complicated; set in code
    kIICore => 'N',   # Is converted to binary
    #Line_Break => Complicated; set in code
    Lowercase_Mapping => $CODE_POINT,
    Name => "",
    Name_Alias => "",
    NFC_QC => 'Yes',
    NFD_QC => 'Yes',
    NFKC_QC => 'Yes',
    NFKD_QC => 'Yes',
    Numeric_Type => 'None',
    Numeric_Value => 'NaN',
    # Script's default changed between Unicode versions
    Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
    Sentence_Break => 'Other',
    Simple_Case_Folding => $CODE_POINT,
    Simple_Lowercase_Mapping => $CODE_POINT,
    Simple_Titlecase_Mapping => $CODE_POINT,
    Simple_Uppercase_Mapping => $CODE_POINT,
    Titlecase_Mapping => $CODE_POINT,
    Unicode_1_Name => "",
    Unicode_Radical_Stroke => "",
    Uppercase_Mapping => $CODE_POINT,
    Word_Break => 'Other',
);

# Below are files that Unicode furnishes, but this program ignores, and why.

# NormalizationCorrections.txt requires some more explanation.  It documents
# the cumulative fixes to erroneous normalizations in earlier Unicode
# versions.  Its main purpose is so that someone running on an earlier version
# can use this file to override what got published in that earlier release.
# It would be easy for mktables to read and handle this file.  But all the
# corrections in it should already be in the other files for the release it
# is.
To get it to actually mean something useful, someone would have to be # using an earlier Unicode release, and copy it to the files for that release # and recomplile. So far there has been no demand to do that, so this hasn't # been implemented. my %ignored_files = ( 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points', 'Index.txt' => 'Alphabetical index of Unicode characters', 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F and recompile perl', 'NamesList.txt' => 'Annotated list of characters', 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base', 'Props.txt' => 'Only in very early releases; is a subset of F (which is used instead)', 'ReadMe.txt' => 'Documentation', 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L', 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', ); my %skipped_files; # List of files that we skip ### End of externally interesting definitions, except for @input_file_objects my $HEADER=<<"EOF"; # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! # This file is machine-generated by $0 from the Unicode # database, Version $string_version. Any changes made here will be lost! EOF my $INTERNAL_ONLY_HEADER = <<"EOF"; # !!!!!!! INTERNAL PERL USE ONLY !!!!!!! # This file is for internal use by core Perl only. 
The format and even the # name or existence of this file are subject to change without notice. Don't # use it directly. EOF my $DEVELOPMENT_ONLY=<<"EOF"; # !!!!!!! DEVELOPMENT USE ONLY !!!!!!! # This file contains information artificially constrained to code points # present in Unicode release $string_compare_versions. # IT CANNOT BE RELIED ON. It is for use during development only and should # not be used for production. EOF my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; # Matches legal code point. 4-6 hex numbers, If there are 6, the first # two must be 10; if there are 5, the first must not be a 0. Written this way # to decrease backtracking. The first regex allows the code point to be at # the end of a word, but to work properly, the word shouldn't end with a valid # hex character. The second one won't match a code point at the end of a # word, and doesn't have the run-on issue my $run_on_code_point_re = qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; my $code_point_re = qr/\b$run_on_code_point_re/; # This matches the beginning of the line in the Unicode db files that give the # defaults for code points not listed (i.e., missing) in the file. The code # depends on this ending with a semi-colon, so it can assume it is a valid # field when the line is split() by semi-colons my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/; # Property types. Unicode has more types, but these are sufficient for our # purposes. my $UNKNOWN = -1; # initialized to illegal value my $NON_STRING = 1; # Either binary or enum my $BINARY = 2; my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal # tables, additional true and false tables are # generated so that false is anything matching the # default value, and true is everything else. 
my $ENUM = 4; # Include catalog my $STRING = 5; # Anything else: string or misc # Some input files have lines that give default values for code points not # contained in the file. Sometimes these should be ignored. my $NO_DEFAULTS = 0; # Must evaluate to false my $NOT_IGNORED = 1; my $IGNORED = 2; # Range types. Each range has a type. Most ranges are type 0, for normal, # and will appear in the main body of the tables in the output files, but # there are other types of ranges as well, listed below, that are specially # handled. There are pseudo-types as well that will never be stored as a # type, but will affect the calculation of the type. # 0 is for normal, non-specials my $MULTI_CP = 1; # Sequence of more than code point my $HANGUL_SYLLABLE = 2; my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. my $NULL = 4; # The map is to the null string; utf8.c can't # handle these, nor is there an accepted syntax # for them in \p{} constructs my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would # otherwise be $MULTI_CP type are instead type 0 # process_generic_property_file() can accept certain overrides in its input. # Each of these must begin AND end with $CMD_DELIM. my $CMD_DELIM = "\a"; my $REPLACE_CMD = 'replace'; # Override the Replace my $MAP_TYPE_CMD = 'map_type'; # Override the Type my $NO = 0; my $YES = 1; # Values for the Replace argument to add_range. # $NO # Don't replace; add only the code points not # already present. my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in # the comments at the subroutine definition. my $UNCONDITIONALLY = 2; # Replace without conditions. my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if # already there my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if # already there my $CROAK = 6; # Die with an error if is already there # Flags to give property statuses. 
The phrases are to remind maintainers that # if the flag is changed, the indefinite article referring to it in the # documentation may need to be as well. my $NORMAL = ""; my $DEPRECATED = 'D'; my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; my $DISCOURAGED = 'X'; my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; my $STRICTER = 'T'; my $a_bold_stricter = "a 'B<$STRICTER>'"; my $A_bold_stricter = "A 'B<$STRICTER>'"; my $STABILIZED = 'S'; my $a_bold_stabilized = "an 'B<$STABILIZED>'"; my $A_bold_stabilized = "An 'B<$STABILIZED>'"; my $OBSOLETE = 'O'; my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; my %status_past_participles = ( $DISCOURAGED => 'discouraged', $STABILIZED => 'stabilized', $OBSOLETE => 'obsolete', $DEPRECATED => 'deprecated', ); # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be # externally documented. my $ORDINARY = 0; # The normal fate. my $MAP_PROXIED = 1; # The map table for the property isn't written out, # but there is a file written that can be used to # reconstruct this table my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is # for Perl's internal use only my $SUPPRESSED = 3; # The file for this table is not written out, and as a # result, we don't bother to do many computations on # it. my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the # computations anyway, as the values are needed for # things to work. This happens when we have Perl # extensions that depend on Unicode tables that # wouldn't normally be in a given Unicode version. 
# The format of the values of the tables: my $EMPTY_FORMAT = ""; my $BINARY_FORMAT = 'b'; my $DECIMAL_FORMAT = 'd'; my $FLOAT_FORMAT = 'f'; my $INTEGER_FORMAT = 'i'; my $HEX_FORMAT = 'x'; my $RATIONAL_FORMAT = 'r'; my $STRING_FORMAT = 's'; my $ADJUST_FORMAT = 'a'; my $DECOMP_STRING_FORMAT = 'c'; my $STRING_WHITE_SPACE_LIST = 'sw'; my %map_table_formats = ( $BINARY_FORMAT => 'binary', $DECIMAL_FORMAT => 'single decimal digit', $FLOAT_FORMAT => 'floating point number', $INTEGER_FORMAT => 'integer', $HEX_FORMAT => 'non-negative hex whole number; a code point', $RATIONAL_FORMAT => 'rational: an integer or a fraction', $STRING_FORMAT => 'string', $ADJUST_FORMAT => 'some entries need adjustment', $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' ); # Unicode didn't put such derived files in a separate directory at first. my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; my $AUXILIARY = 'auxiliary'; # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl # and into UCD.pl for the use of UCD.pm my %loose_to_file_of; # loosely maps table names to their respective # files my %stricter_to_file_of; # same; but for stricter mapping. my %loose_property_to_file_of; # Maps a loose property name to its map file my %file_to_swash_name; # Maps the file name to its corresponding key name # in the hash %utf8::SwashInfo my %nv_floating_to_rational; # maps numeric values floating point numbers to # their rational equivalent my %loose_property_name_of; # Loosely maps (non_string) property names to # standard form my %string_property_loose_to_name; # Same, for string properties. 
my %loose_defaults; # keys are of form "prop=value", where 'prop' is # the property name in standard loose form, and # 'value' is the default value for that property, # also in standard loose form. my %loose_to_standard_value; # loosely maps table names to the canonical # alias for them my %ambiguous_names; # keys are alias names (in standard form) that # have more than one possible meaning. my %prop_aliases; # Keys are standard property name; values are each # one's aliases my %prop_value_aliases; # Keys of top level are standard property name; # values are keys to another hash, Each one is # one of the property's values, in standard form. # The values are that prop-val's aliases. my %ucd_pod; # Holds entries that will go into the UCD section of the pod # Most properties are immune to caseless matching, otherwise you would get # nonsensical results, as properties are a function of a code point, not # everything that is caselessly equivalent to that code point. For example, # Changes_When_Case_Folded('s') should be false, whereas caselessly it would # be true because 's' and 'S' are equivalent caselessly. However, # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we # extend that concept to those very few properties that are like this. Each # such property will match the full range caselessly. They are hard-coded in # the program; it's not worth trying to make it general as it's extremely # unlikely that they will ever change. my %caseless_equivalent_to; # These constants names and values were taken from the Unicode standard, # version 5.1, section 3.12. They are used in conjunction with Hangul # syllables. 
The '_string' versions are so generated tables can retain the # hex format, which is the more familiar value my $SBase_string = "0xAC00"; my $SBase = CORE::hex $SBase_string; my $LBase_string = "0x1100"; my $LBase = CORE::hex $LBase_string; my $VBase_string = "0x1161"; my $VBase = CORE::hex $VBase_string; my $TBase_string = "0x11A7"; my $TBase = CORE::hex $TBase_string; my $SCount = 11172; my $LCount = 19; my $VCount = 21; my $TCount = 28; my $NCount = $VCount * $TCount; # For Hangul syllables; These store the numbers from Jamo.txt in conjunction # with the above published constants. my %Jamo; my %Jamo_L; # Leading consonants my %Jamo_V; # Vowels my %Jamo_T; # Trailing consonants # For code points whose name contains its ordinal as a '-ABCD' suffix. # The key is the base name of the code point, and the value is an # array giving all the ranges that use this base name. Each range # is actually a hash giving the 'low' and 'high' values of it. my %names_ending_in_code_point; my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes # removed from the names # Inverse mapping. The list of ranges that have these kinds of # names. Each element contains the low, high, and base names in an # anonymous hash. my @code_points_ending_in_code_point; # To hold Unicode's normalization test suite my @normalization_tests; # Boolean: does this Unicode version have the hangul syllables, and are we # writing out a table for them? my $has_hangul_syllables = 0; # Does this Unicode version have code points whose names end in their # respective code points, and are we writing out a table for them? 0 for no; # otherwise points to first property that a table is needed for them, so that # if multiple tables are needed, we don't create duplicates my $needing_code_points_ending_in_code_point = 0; my @backslash_X_tests; # List of tests read in for testing \X my @unhandled_properties; # Will contain a list of properties found in # the input that we didn't process. 
my @match_properties;   # Properties that have match tables, to be
                        # listed in the pod
my @map_properties;     # Properties that get map files written
my @named_sequences;    # NamedSequences.txt contents.
my %potential_files;    # Generated list of all .txt files in the directory
                        # structure so we can warn if something is being
                        # ignored.
my @files_actually_output;  # List of files we generated.
my @more_Names;         # Some code point names are compound; this is used
                        # to store the extra components of them.
my $MIN_FRACTION_LENGTH = 3;    # How many digits of a floating point number
                                # at the minimum before we consider it
                                # equivalent to a candidate rational
my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH;  # And in floating terms

# These store references to certain commonly used property objects
my $gc;
my $perl;
my $block;
my $perl_charname;
my $print;
my $Any;
my $script;

# Are there conflicting names because of beginning with 'In_', or 'Is_'
my $has_In_conflicts = 0;
my $has_Is_conflicts = 0;

sub internal_file_to_platform ($) {
    # Convert our file paths which have '/' separators to those of the
    # platform.  An undefined input yields undef.

    my $file = shift;
    return undef if ! defined $file;

    my @components = split '/', $file;
    return File::Spec->join(@components);
}

sub file_exists ($) {
    # platform independent '-e'.  This program internally uses slash as a
    # path separator; undefined names are simply reported as non-existent.

    my $file = shift;
    return 0 unless defined $file;
    return -e internal_file_to_platform($file);
}

sub objaddr($) {
    # Returns the address of the blessed input object.
    # It doesn't check for blessedness because that would do a string eval
    # every call, and the program is structured so that this is never called
    # for a non-blessed object.

    # Turn off overloading locally; otherwise numifying below could invoke
    # an overloaded conversion instead of yielding the address.
    my $addr = do { no overloading; pack 'J', $_[0]; };
    return $addr;
}

# These are used only if $annotate is true.
# The entire range of Unicode characters is examined to populate these
But most can be skipped, as they # have the same descriptive phrases, such as being unassigned my @viacode; # Contains the 1 million character names my @printable; # boolean: And are those characters printable? my @annotate_char_type; # Contains a type of those characters, specifically # for the purposes of annotation. my $annotate_ranges; # A map of ranges of code points that have the same # name for the purposes of annotation. They map to the # upper edge of the range, so that the end point can # be immediately found. This is used to skip ahead to # the end of a range, and avoid processing each # individual code point in it. my $unassigned_sans_noncharacters; # A Range_List of the unassigned # characters, but excluding those which are # also noncharacter code points # The annotation types are an extension of the regular range types, though # some of the latter are folded into one. Make the new types negative to # avoid conflicting with the regular types my $SURROGATE_TYPE = -1; my $UNASSIGNED_TYPE = -2; my $PRIVATE_USE_TYPE = -3; my $NONCHARACTER_TYPE = -4; my $CONTROL_TYPE = -5; my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program sub populate_char_info ($) { # Used only with the $annotate option. Populates the arrays with the # input code point's info that are needed for outputting more detailed # comments. If calling context wants a return, it is the end point of # any contiguous range of characters that share essentially the same info my $i = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; $viacode[$i] = $perl_charname->value_of($i) || ""; # A character is generally printable if Unicode says it is, # but below we make sure that most Unicode general category 'C' types # aren't. 
$printable[$i] = $print->contains($i); $annotate_char_type[$i] = $perl_charname->type_of($i) || 0; # Only these two regular types are treated specially for annotations # purposes $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME && $annotate_char_type[$i] != $HANGUL_SYLLABLE; # Give a generic name to all code points that don't have a real name. # We output ranges, if applicable, for these. Also calculate the end # point of the range. my $end; if (! $viacode[$i]) { my $nonchar; if ($gc-> table('Private_use')->contains($i)) { $viacode[$i] = 'Private Use'; $annotate_char_type[$i] = $PRIVATE_USE_TYPE; $printable[$i] = 0; $end = $gc->table('Private_Use')->containing_range($i)->end; } elsif ((defined ($nonchar = Property::property_ref('Noncharacter_Code_Point')) && $nonchar->table('Y')->contains($i))) { $viacode[$i] = 'Noncharacter'; $annotate_char_type[$i] = $NONCHARACTER_TYPE; $printable[$i] = 0; $end = property_ref('Noncharacter_Code_Point')->table('Y')-> containing_range($i)->end; } elsif ($gc-> table('Control')->contains($i)) { $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control'; $annotate_char_type[$i] = $CONTROL_TYPE; $printable[$i] = 0; } elsif ($gc-> table('Unassigned')->contains($i)) { $annotate_char_type[$i] = $UNASSIGNED_TYPE; $printable[$i] = 0; if ($v_version lt v2.0.0) { # No blocks in earliest releases $viacode[$i] = 'Unassigned'; $end = $gc-> table('Unassigned')->containing_range($i)->end; } else { $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i); # Because we name the unassigned by the blocks they are in, it # can't go past the end of that block, and it also can't go # past the unassigned range it is in. The special table makes # sure that the non-characters, which are unassigned, are # separated out. 
$end = min($block->containing_range($i)->end, $unassigned_sans_noncharacters-> containing_range($i)->end); } } elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases $viacode[$i] = $gc->value_of($i); $annotate_char_type[$i] = $UNKNOWN_TYPE; $printable[$i] = 0; } elsif ($gc-> table('Surrogate')->contains($i)) { $viacode[$i] = 'Surrogate'; $annotate_char_type[$i] = $SURROGATE_TYPE; $printable[$i] = 0; $end = $gc->table('Surrogate')->containing_range($i)->end; } else { Carp::my_carp_bug("Can't figure out how to annotate " . sprintf("U+%04X", $i) . ". Proceeding anyway."); $viacode[$i] = 'UNKNOWN'; $annotate_char_type[$i] = $UNKNOWN_TYPE; $printable[$i] = 0; } } # Here, has a name, but if it's one in which the code point number is # appended to the name, do that. elsif ($annotate_char_type[$i] == $CP_IN_NAME) { $viacode[$i] .= sprintf("-%04X", $i); $end = $perl_charname->containing_range($i)->end; } # And here, has a name, but if it's a hangul syllable one, replace it with # the correct name from the Unicode algorithm elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) { use integer; my $SIndex = $i - $SBase; my $L = $LBase + $SIndex / $NCount; my $V = $VBase + ($SIndex % $NCount) / $TCount; my $T = $TBase + $SIndex % $TCount; $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}"; $viacode[$i] .= $Jamo{$T} if $T != $TBase; $end = $perl_charname->containing_range($i)->end; } return if ! defined wantarray; return $i if ! defined $end; # If not a range, return the input # Save this whole range so can find the end point quickly $annotate_ranges->add_map($i, $end, $end); return $end; } # Commented code below should work on Perl 5.8. ## This 'require' doesn't necessarily work in miniperl, and even if it does, ## the native perl version of it (which is what would operate under miniperl) ## is extremely slow, as it does a string eval every call. 
#my $has_fast_scalar_util = $^X !~ /miniperl/
#                            && defined eval "require Scalar::Util";
#
#sub objaddr($) {
#    # Returns the address of the blessed input object.  Uses the XS version if
#    # available.  It doesn't check for blessedness because that would do a
#    # string eval every call, and the program is structured so that this is
#    # never called for a non-blessed object.
#
#    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
#
#    # Check at least that is a ref.
#    my $pkg = ref($_[0]) or return undef;
#
#    # Change to a fake package to defeat any overloaded stringify
#    bless $_[0], 'main::Fake';
#
#    # Numifying a ref gives its address.
#    my $addr = pack 'J', $_[0];
#
#    # Return to original class
#    bless $_[0], $pkg;
#    return $addr;
#}

sub max ($$) {
    # Return the larger of the two inputs.  (Hand-rolled because List::Util
    # may not be usable under miniperl.)  Locals avoid sort's $a/$b globals.
    my ($first, $second) = @_;
    return ($first >= $second) ? $first : $second;
}

sub min ($$) {
    # Return the smaller of the two inputs.
    my ($first, $second) = @_;
    return ($first <= $second) ? $first : $second;
}

sub clarify_number ($) {
    # Returns the input number with underscores inserted every 3 digits in
    # large (5 digits or more) numbers.  Input must be entirely digits; this
    # is not checked.
    my $number = shift;

    # 4 or fewer digits: returned untouched, matching the historical
    # behavior of not decorating small numbers.
    return $number if length($number) <= 4;

    # Repeatedly split a trailing group of 3 digits off the leading run of
    # digits; stops once no 3 consecutive digits follow the leading run.
    1 while $number =~ s/ ^ (\d+) (\d{3}) /$1_$2/x;
    return $number;
}


package Carp;

# These routines give a uniform treatment of messages in this program.  They
# are placed in the Carp package to cause the stack trace to not include them,
# although an alternative would be to use another package and set @CARP_NOT
# for it.

our $Verbose = 1 if main::DEBUG;  # Useful info when debugging

# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
# and overload trying to load Scalar::Util under miniperl.
See
# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
undef $overload::VERSION;

sub my_carp {
    # Uniform message formatter/emitter for this program.  Folds and
    # prefixes $message with the program name; if called in non-void
    # context, returns the formatted message instead of warning.
    my $message = shift || "";
    my $nofold = shift || 0;    # true => skip the line-folding step

    if ($message) {
        $message = main::join_lines($message);
        $message =~ s/^$0: *//;     # Remove initial program name
        $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
        $message = "\n$0: $message;";

        # Fold the message with program name, semi-colon end punctuation
        # (which looks good with the message that carp appends to it), and a
        # hanging indent for continuation lines.
        $message = main::simple_fold($message, "", 4) unless $nofold;
        $message =~ s/\n$//;        # Remove the trailing nl so what carp
                                    # appends is to the same line
    }

    return $message if defined wantarray;   # If a caller just wants the msg

    carp $message;
    return;
}

sub my_carp_bug {
    # This is called when it is clear that the problem is caused by a bug in
    # this program.
    my $message = shift;
    $message =~ s/^$0: *//;
    # (Fixed: the report address is perlbug@perl.org; it previously read
    # "perbug", which is not a deliverable address.)
    $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perlbug\@perl.org:\n$message");
    carp $message;
    return;
}

sub carp_too_few_args {
    # Diagnostic helper: called (under DEBUG) when a generated accessor was
    # itself passed too few arguments.  Takes a ref to the offending @_ and
    # the minimum count expected.
    if (@_ != 2) {
        my_carp_bug("Wrong number of arguments: to 'carp_too_few_args'. No action taken.");
        return;
    }

    my $args_ref = shift;
    my $count = shift;

    # join() is parenthesized so the trailing concatenation isn't absorbed
    # into join's argument list; unparenthesized, the '.' operator put
    # @$args_ref in scalar context and the message showed the element count
    # instead of the elements themselves.
    my_carp_bug("Need at least $count arguments to "
                . (caller 1)[3]
                . ". Instead got: '"
                . join(', ', @$args_ref)
                . "'. No action taken.");
    return;
}

sub carp_extra_args {
    # Diagnostic helper: warn about unrecognized/extra parameters.  Accepts
    # a ref to a hash (name => value pairs) or to an array of leftovers.
    my $args_ref = shift;
    my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;

    unless (ref $args_ref) {
        my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
        return;
    }

    my ($package, $file, $line) = caller;
    my $subroutine = (caller 1)[3];

    my $list;
    if (ref $args_ref eq 'HASH') {
        # Substitute a printable placeholder for undefined values.  (Note
        # this mutates the caller's hash, as the original code did.)
        foreach my $key (keys %$args_ref) {
            $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
        }
        # List every key => value pair.  (The previous 'each %$args_ref'
        # returned only the first pair — and disturbed the hash's iterator —
        # so all but one offending parameter were silently dropped from the
        # message.)  Sorted for deterministic output.
        $list = join ', ', map { "$_ => $args_ref->{$_}" } sort keys %$args_ref;
    }
    elsif (ref $args_ref eq 'ARRAY') {
        foreach my $arg (@$args_ref) {
            $arg = $UNDEF unless defined $arg;
        }
        $list = join ', ', @$args_ref;
    }
    else {
        my_carp_bug("Can't cope with ref " . ref($args_ref) . " . argument to 'carp_extra_args'. Not checking arguments.");
        return;
    }

    my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
    return;
}

package main;

{ # Closure

    # This program uses the inside-out method for objects, as recommended in
    # "Perl Best Practices".  This closure aids in generating those.  There
    # are two routines.  setup_package() is called once per package to set
    # things up, and then set_access() is called for each hash representing a
    # field in the object.  These routines arrange for the object to be
    # properly destroyed when no longer used, and for standard accessor
    # functions to be generated.  If you need more complex accessors, just
    # write your own and leave those accesses out of the call to set_access().
    # More details below.

    my %constructor_fields; # fields that are to be used in constructors; see
                            # below

    # The values of this hash will be the package names as keys to other
    # hashes containing the name of each field in the package as keys, and
    # references to their respective hashes as values.
    my %package_fields;

    sub setup_package {
        # Sets up the package, creating standard DESTROY and dump methods
        # (unless already defined).  The dump method is used in debugging by
        # simple_dumper().
        # The optional parameters are:
        #   a)  a reference to a hash, that gets populated by later
        #       set_access() calls with one of the accesses being
        #       'constructor'.  The caller can then refer to this, but it is
        #       not otherwise used by these two routines.
        #   b)  a reference to a callback routine to call during destruction
        #       of the object, before any fields are actually destroyed

        my %args = @_;
        my $constructor_ref = delete $args{'Constructor_Fields'};
        my $destroy_callback = delete $args{'Destroy_Callback'};
        Carp::carp_extra_args(\@_) if main::DEBUG && %args;

        my %fields;
        my $package = (caller)[0];

        $package_fields{$package} = \%fields;
        $constructor_fields{$package} = $constructor_ref;

        unless ($package->can('DESTROY')) {
            my $destroy_name = "${package}::DESTROY";
            no strict "refs";

            # Use typeglob to give the anonymous subroutine the name we want
            *$destroy_name = sub {
                my $self = shift;
                # 'no overloading' so the pack gets the object's address, not
                # an overloaded numification; the address is the key into
                # every inside-out field hash.
                my $addr = do { no overloading; pack 'J', $self; };

                $self->$destroy_callback if $destroy_callback;
                # Delete this object's entry from each field hash, so that
                # inside-out objects don't leak their field storage.
                foreach my $field (keys %{$package_fields{$package}}) {
                    #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
                    delete $package_fields{$package}{$field}{$addr};
                }
                return;
            }
        }

        unless ($package->can('dump')) {
            my $dump_name = "${package}::dump";
            no strict "refs";
            *$dump_name = sub {
                my $self = shift;
                return dump_inside_out($self, $package_fields{$package}, @_);
            }
        }
        return;
    }

    sub set_access {
        # Arrange for the input field to be garbage collected when no longer
        # needed.  Also, creates standard accessor functions for the field
        # based on the optional parameters-- none if none of these parameters:
        #   'addable'    creates an 'add_NAME()' accessor function.
        #   'readable' or 'readable_array'    creates a 'NAME()' accessor
        #       function.
        #   'settable'   creates a 'set_NAME()' accessor function.
        #   'constructor' doesn't create an accessor function, but adds the
        #       field to the hash that was previously passed to
        #       setup_package();
        # Any of the accesses can be abbreviated down, so that 'a', 'ad',
        # 'add' etc. all mean 'addable'.
        # The read accessor function will work on both array and scalar
        # values.  If another accessor in the parameter list is 'a', the read
        # access assumes an array.  You can also force it to be array access
        # by specifying 'readable_array' instead of 'readable'
        #
        # A sort-of 'protected' access can be set-up by preceding the addable,
        # readable or settable with some initial portion of 'protected_' (but,
        # the underscore is required), like 'p_a', 'pro_set', etc.  The
        # "protection" is only by convention.  All that happens is that the
        # accessor functions' names begin with an underscore.  So instead of
        # calling set_foo, the call is _set_foo.  (Real protection could be
        # accomplished by having a new subroutine, end_package, called at the
        # end of each package, and then storing the __LINE__ ranges and
        # checking them on every accessor.  But that is way overkill.)

        # We create anonymous subroutines as the accessors and then use
        # typeglobs to assign them to the proper package and name

        my $name = shift;   # Name of the field
        my $field = shift;  # Reference to the inside-out hash containing the
                            # field

        my $package = (caller)[0];

        if (! exists $package_fields{$package}) {
            croak "$0: Must call 'setup_package' before 'set_access'";
        }

        # Stash the field so DESTROY can get it.
        $package_fields{$package}{$name} = $field;

        # Remaining arguments are the accessors.  For each...
        foreach my $access (@_) {
            # Deliberately shadows the loop variable with a lower-cased copy
            # so the original @_ element is left untouched.
            my $access = lc $access;

            my $protected = "";

            # Match the input as far as it goes.
            if ($access =~ /^(p[^_]*)_/) {
                $protected = $1;
                if (substr('protected_', 0, length $protected) eq $protected) {

                    # Add 1 for the underscore not included in $protected
                    $access = substr($access, length($protected) + 1);
                    $protected = '_';
                }
                else {
                    $protected = "";
                }
            }

            if (substr('addable', 0, length $access) eq $access) {
                my $subname = "${package}::${protected}add_$name";
                no strict "refs";

                # add_ accessor.  Don't add if already there, which we
                # determine using 'eq' for scalars and '==' otherwise.
                *$subname = sub {
                    use strict "refs";
                    return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
                    my $self = shift;
                    my $value = shift;
                    my $addr = do { no overloading; pack 'J', $self; };
                    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
                    # References are compared by address ('=='), everything
                    # else by string equality.
                    if (ref $value) {
                        return if grep { $value == $_ } @{$field->{$addr}};
                    }
                    else {
                        return if grep { $value eq $_ } @{$field->{$addr}};
                    }
                    push @{$field->{$addr}}, $value;
                    return;
                }
            }
            elsif (substr('constructor', 0, length $access) eq $access) {
                if ($protected) {
                    Carp::my_carp_bug("Can't set-up 'protected' constructors")
                }
                else {
                    $constructor_fields{$package}{$name} = $field;
                }
            }
            elsif (substr('readable_array', 0, length $access) eq $access) {

                # Here has read access.  If one of the other parameters for
                # access is array, or this one specifies array (by being more
                # than just 'readable_'), then create a subroutine that
                # assumes the data is an array.  Otherwise just a scalar
                my $subname = "${package}::${protected}$name";
                # NOTE(review): grep /^a/i matches ANY other access spec
                # beginning with 'a' (i.e. 'addable' abbreviations) — that is
                # how "another accessor is 'a' implies array" is implemented.
                if (grep { /^a/i } @_ or length($access) > length('readable_')) {
                    no strict "refs";
                    *$subname = sub {
                        use strict "refs";
                        Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
                        my $addr = do { no overloading; pack 'J', $_[0]; };
                        if (ref $field->{$addr} ne 'ARRAY') {
                            my $type = ref $field->{$addr};
                            $type = 'scalar' unless $type;
                            Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
                            return;
                        }
                        # Scalar context gets the element count.
                        return scalar @{$field->{$addr}} unless wantarray;

                        # Make a copy; had problems with caller modifying the
                        # original otherwise
                        my @return = @{$field->{$addr}};
                        return @return;
                    }
                }
                else {

                    # Here not an array value, a simpler function.
                    no strict "refs";
                    *$subname = sub {
                        use strict "refs";
                        Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;

                        no overloading;
                        return $field->{pack 'J', $_[0]};
                    }
                }
            }
            elsif (substr('settable', 0, length $access) eq $access) {
                my $subname = "${package}::${protected}set_$name";
                no strict "refs";
                *$subname = sub {
                    use strict "refs";
                    if (main::DEBUG) {
                        return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
                        Carp::carp_extra_args(\@_) if @_ > 2;
                    }
                    # $self is $_[0]; $value is $_[1]
                    no overloading;
                    $field->{pack 'J', $_[0]} = $_[1];
                    return;
                }
            }
            else {
                Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
            }
        }
        return;
    }
}

package Input_file;

# All input files use this object, which stores various attributes about them,
# and provides for convenient, uniform handling.  The run method wraps the
# processing.  It handles all the bookkeeping of opening, reading, and closing
# the file, returning only significant input lines.
#
# Each object gets a handler which processes the body of the file, and is
# called by run().  Most should use the generic, default handler, which has
# code scrubbed to handle things you might not expect.  A handler should
# basically be a while(next_line()) {...} loop.
#
# You can also set up handlers to
#   1) call before the first line is read for pre processing
#   2) call to adjust each line of the input before the main handler gets them
#   3) call upon EOF before the main handler exits its loop
#   4) call at the end for post processing
#
# $_ is used to store the input line, and is to be filtered by the
# each_line_handler()s.  So, if the format of the line is not in the desired
# format for the main handler, these are used to do that adjusting.  They can
# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
# so the $_ output of one is used as the input to the next.  None of the other
# handlers are stackable, but could easily be changed to be so.
#
# Most of the handlers can call insert_lines() or insert_adjusted_lines()
# which insert the parameters as lines to be processed before the next input
# file line is read.  This allows the EOF handler to flush buffers, for
# example.  The difference between the two routines is that the lines inserted
# by insert_lines() are subjected to the each_line_handler()s.  (So if you
# called it from such a handler, you would get infinite recursion.)  Lines
# inserted by insert_adjusted_lines() go directly to the main handler without
# any adjustments.  If the post-processing handler calls any of these, there
# will be no effect.  Some error checking for these conditions could be added,
# but it hasn't been done.
#
# carp_bad_line() should be called to warn of bad input lines, which clears $_
# to prevent further processing of the line.  This routine will output the
# message as a warning once, and then keep a count of the lines that have the
# same message, and output that count at the end of the file's processing.
# This keeps the number of messages down to a manageable amount.
#
# get_missings() should be called to retrieve any @missing input lines.
# Messages will be raised if this isn't done if the options aren't to ignore
# missings.

sub trace { return main::trace(@_); }

{ # Closure
    # Keep track of fields that are to be put into the constructor.
    my %constructor_fields;

    main::setup_package(Constructor_Fields => \%constructor_fields);

    my %file; # Input file name, required
    main::set_access('file', \%file, qw{ c r });

    my %first_released; # Unicode version file was first released in, required
    main::set_access('first_released', \%first_released, qw{ c r });

    my %handler;
    # Subroutine to process the input file, defaults to
    # 'process_generic_property_file'
    main::set_access('handler', \%handler, qw{ c });

    my %property;
    # name of property this file is for.  defaults to none, meaning not
    # applicable, or is otherwise determinable, for example, from each line.
    main::set_access('property', \%property, qw{ c r });

    my %optional;
    # If this is true, the file is optional.  If not present, no warning is
    # output.  If it is present, the string given by this parameter is
    # evaluated, and if false the file is not processed.
    main::set_access('optional', \%optional, 'c', 'r');

    my %non_skip;
    # This is used for debugging, to skip processing of all but a few input
    # files.  Add 'non_skip => 1' to the constructor for those files you want
    # processed when you set the $debug_skip global.
    main::set_access('non_skip', \%non_skip, 'c');

    my %skip;
    # This is used to skip processing of this input file semi-permanently,
    # when it evaluates to true.  The value should be the reason the file is
    # being skipped.  It is used for files that we aren't planning to process
    # anytime soon, but want to allow to be in the directory and not raise a
    # message that we are not handling.  Mostly for test files.  This is in
    # contrast to the non_skip element, which is supposed to be used very
    # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
    # pretty much will never look at can be placed in the global
    # %ignored_files instead.  Ones used here will be added to %skipped files
    main::set_access('skip', \%skip, 'c');

    my %each_line_handler;
    # list of subroutines to look at and filter each non-comment line in the
    # file.  defaults to none.  The subroutines are called in order, each is
    # to adjust $_ for the next one, and the final one adjusts it for
    # 'handler'
    main::set_access('each_line_handler', \%each_line_handler, 'c');

    my %has_missings_defaults;
    # ? Are there lines in the file giving default values for code points
    # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
    # the norm, but IGNORED means it has such lines, but the handler doesn't
    # use them.  Having these three states allows us to catch changes to the
    # UCD that this program should track
    main::set_access('has_missings_defaults', \%has_missings_defaults, qw{ c r });

    my %pre_handler;
    # Subroutine to call before doing anything else in the file.  If undef, no
    # such handler is called.
    main::set_access('pre_handler', \%pre_handler, qw{ c });

    my %eof_handler;
    # Subroutine to call upon getting an EOF on the input file, but before
    # that is returned to the main handler.  This is to allow buffers to be
    # flushed.  The handler is expected to call insert_lines() or
    # insert_adjusted() with the buffered material
    main::set_access('eof_handler', \%eof_handler, qw{ c r });

    my %post_handler;
    # Subroutine to call after all the lines of the file are read in and
    # processed.  If undef, no such handler is called.
    main::set_access('post_handler', \%post_handler, qw{ c });

    my %progress_message;
    # Message to print to display progress in lieu of the standard one
    main::set_access('progress_message', \%progress_message, qw{ c });

    my %handle;
    # cache open file handle, internal.  Is undef if file hasn't been
    # processed at all, empty if has;
    main::set_access('handle', \%handle);

    my %added_lines;
    # cache of lines added virtually to the file, internal
    main::set_access('added_lines', \%added_lines);

    my %errors;
    # cache of errors found, internal
    main::set_access('errors', \%errors);

    my %missings;
    # storage of '@missing' defaults lines
    main::set_access('missings', \%missings);

    sub new {
        # Constructor.  Two positional parameters (file name, first Unicode
        # release containing the file), then key => value pairs matching the
        # fields registered in %constructor_fields above.
        my $class = shift;

        # Inside-out object: a blessed ref to an anonymous scalar; all state
        # lives in the closure hashes above, keyed by the object's address.
        my $self = bless \do{ my $anonymous_scalar }, $class;
        my $addr = do { no overloading; pack 'J', $self; };

        # Set defaults
        $handler{$addr} = \&main::process_generic_property_file;
        $non_skip{$addr} = 0;
        $skip{$addr} = 0;
        $has_missings_defaults{$addr} = $NO_DEFAULTS;
        $handle{$addr} = undef;
        $added_lines{$addr} = [ ];
        $each_line_handler{$addr} = [ ];
        $errors{$addr} = { };
        $missings{$addr} = [ ];

        # Two positional parameters.
        return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
        $file{$addr} = main::internal_file_to_platform(shift);
        $first_released{$addr} = shift;

        # The rest of the arguments are key => value pairs
        # %constructor_fields has been set up earlier to list all possible
        # ones.  Either set or push, depending on how the default has been set
        # up just above.
        my %args = @_;
        foreach my $key (keys %args) {
            my $argument = $args{$key};

            # Note that the fields are the lower case of the constructor keys
            my $hash = $constructor_fields{lc $key};
            if (! defined $hash) {
                Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
                next;
            }
            # A field whose default is an array ref accumulates values
            # (flattening an array-ref argument); scalar fields just get set.
            if (ref $hash->{$addr} eq 'ARRAY') {
                if (ref $argument eq 'ARRAY') {
                    foreach my $argument (@{$argument}) {
                        next if ! defined $argument;
                        push @{$hash->{$addr}}, $argument;
                    }
                }
                else {
                    push @{$hash->{$addr}}, $argument if defined $argument;
                }
            }
            else {
                $hash->{$addr} = $argument;
            }
            delete $args{$key};
        };

        # If the file has a property for it, it means that the property is not
        # listed in the file's entries.  So add a handler to the list of line
        # handlers to insert the property name into the lines, to provide a
        # uniform interface to the final processing subroutine.
        # the final code doesn't have to worry about that.
        if ($property{$addr}) {
            push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
        }

        if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
            print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
        }

        # If skipping, set to optional, and add to list of ignored files,
        # including its reason
        if ($skip{$addr}) {
            $optional{$addr} = 1;
            $skipped_files{$file{$addr}} = $skip{$addr}
        }

        return $self;
    }

    # Objects stringify via _operator_stringify; concatenation is delegated
    # to main's generic handlers.  fallback => 0 disallows other operations.
    use overload
        fallback => 0,
        qw("") => "_operator_stringify",
        "." => \&main::_operator_dot,
        ".=" => \&main::_operator_dot_equal,
        ;

    sub _operator_stringify {
        my $self = shift;

        return __PACKAGE__ . " object for " .
$self->file;
    }

    # flag to make sure extracted files are processed early
    my $seen_non_extracted_non_age = 0;

    sub run {
        # Process the input object $self.  This opens and closes the file and
        # calls all the handlers for it.  Currently, this can only be called
        # once per file, as it destroy's the EOF handler
        my $self = shift;
        Carp::carp_extra_args(\@_) if main::DEBUG && @_;

        my $addr = do { no overloading; pack 'J', $self; };

        my $file = $file{$addr};

        # Don't process if not expecting this file (because released later
        # than this Unicode version), and isn't there.  This means if someone
        # copies it into an earlier version's directory, we will go ahead and
        # process it.
        return if $first_released{$addr} gt $v_version && ! -e $file;

        # If in debugging mode and this file doesn't have the non-skip
        # flag set, and isn't one of the critical files, skip it.
        if ($debug_skip
            && $first_released{$addr} ne v0
            && ! $non_skip{$addr})
        {
            print "Skipping $file in debugging\n" if $verbosity;
            return;
        }

        # File could be optional
        if ($optional{$addr}) {
            return unless -e $file;
            # NOTE(review): string eval of the 'optional' condition stored in
            # the constructor — trusted, program-internal input only.
            my $result = eval $optional{$addr};
            if (! defined $result) {
                Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
                return;
            }
            if (! $result) {
                if ($verbosity) {
                    print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
                }
                return;
            }
        }

        if (! defined $file || ! -e $file) {

            # If the file doesn't exist, see if have internal data for it
            # (based on first_released being 0).
            if ($first_released{$addr} eq v0) {
                $handle{$addr} = 'pretend_is_open';
            }
            else {
                if (! $optional{$addr}  # File could be optional
                    && $v_version ge $first_released{$addr})
                {
                    print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
                }
                return;
            }
        }
        else {

            # Here, the file exists.  Some platforms may change the case of
            # its name
            if ($seen_non_extracted_non_age) {
                if ($file =~ /$EXTRACTED/i) {
                    # NOTE(review): the statement below appears garbled — a
                    # here-doc body and several statements (including the
                    # declaration of $fkey, apparently File::Spec->rel2abs)
                    # seem to be missing between 'join_lines(<' and
                    # 'rel2abs'.  Recover this span from the upstream
                    # mktables before relying on this branch.
                    Carp::my_carp_bug(main::join_lines(<rel2abs($file);
            my $expecting = delete $potential_files{lc($fkey)};

            Carp::my_carp("Was not expecting '$file'.") if ! $expecting && ! defined $handle{$addr};

            # Having deleted from expected files, we can quit if not to do
            # anything.  Don't print progress unless really want verbosity
            if ($skip{$addr}) {
                print "Skipping $file.\n" if $verbosity >= $VERBOSE;
                return;
            }

            # Open the file, converting the slashes used in this program
            # into the proper form for the OS
            my $file_handle;
            if (not open $file_handle, "<", $file) {
                Carp::my_carp("Can't open $file. Skipping: $!");
                return 0;
            }
            $handle{$addr} = $file_handle; # Cache the open file handle
        }

        if ($verbosity >= $PROGRESS) {
            if ($progress_message{$addr}) {
                print "$progress_message{$addr}\n";
            }
            else {
                # If using a virtual file, say so.
                print "Processing ", (-e $file)
                                     ? $file
                                     : "substitute $file",
                                     "\n";
            }
        }

        # Call any special handler for before the file.
        &{$pre_handler{$addr}}($self) if $pre_handler{$addr};

        # Then the main handler
        &{$handler{$addr}}($self);

        # Then any special post-file handler.
        &{$post_handler{$addr}}($self) if $post_handler{$addr};

        # If any errors have been accumulated, output the counts (as the first
        # error message in each class was output when it was encountered).
        if ($errors{$addr}) {
            my $total = 0;
            my $types = 0;
            foreach my $error (keys %{$errors{$addr}}) {
                $total += $errors{$addr}->{$error};
                delete $errors{$addr}->{$error};
                $types++;
            }
            if ($total > 1) {
                my $message = "A total of $total lines had errors in $file. ";
                $message .= ($types == 1)
                            ? '(Only the first one was displayed.)'
                            : '(Only the first of each type was displayed.)';
                Carp::my_carp($message);
            }
        }

        if (@{$missings{$addr}}) {
            Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
        }

        # If a real file handle, close it.
        close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if ref $handle{$addr};
        $handle{$addr} = "";    # Uses empty to indicate that has already seen
                                # the file, as opposed to undef

        return;
    }

    sub next_line {
        # Sets $_ to be the next logical input line, if any.  Returns non-zero
        # if such a line exists.  'logical' means that any lines that have
        # been added via insert_lines() will be returned in $_ before the file
        # is read again.
        my $self = shift;
        Carp::carp_extra_args(\@_) if main::DEBUG && @_;

        my $addr = do { no overloading; pack 'J', $self; };

        # Here the file is open (or if the handle is not a ref, is an open
        # 'virtual' file).  Get the next line; any inserted lines get priority
        # over the file itself.
        my $adjusted;

        LINE:
        while (1) { # Loop until find non-comment, non-empty line
            #local $to_trace = 1 if main::DEBUG;
            # Inserted lines come as [ already-adjusted-flag, text ] pairs;
            # already-adjusted ones bypass the each_line_handler()s below.
            my $inserted_ref = shift @{$added_lines{$addr}};
            if (defined $inserted_ref) {
                ($adjusted, $_) = @{$inserted_ref};
                trace $adjusted, $_ if main::DEBUG && $to_trace;

                return 1 if $adjusted;
            }
            else {
                last if ! ref $handle{$addr}; # Don't read unless is real file
                last if ! defined ($_ = readline $handle{$addr});
            }
            chomp;
            trace $_ if main::DEBUG && $to_trace;

            # See if this line is the comment line that defines what property
            # value that code points that are not listed in the file should
            # have.  The format or existence of these lines is not guaranteed
            # by Unicode since they are comments, but the documentation says
            # that this was added for machine-readability, so probably won't
            # change.  This works starting in Unicode Version 5.0.  They look
            # like:
            #
            # @missing: 0000..10FFFF; Not_Reordered
            # @missing: 0000..10FFFF; Decomposition_Mapping;
            # @missing: 0000..10FFFF; ; NaN
            #
            # Save the line for a later get_missings() call.
            if (/$missing_defaults_prefix/) {
                if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
                    $self->carp_bad_line("Unexpected \@missing line.
Assuming no missing entries"); } elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { my @defaults = split /\s* ; \s*/x, $_; # The first field is the @missing, which ends in a # semi-colon, so can safely shift. shift @defaults; # Some of these lines may have empty field placeholders # which get in the way. An example is: # @missing: 0000..10FFFF; ; NaN # Remove them. Process starting from the top so the # splice doesn't affect things still to be looked at. for (my $i = @defaults - 1; $i >= 0; $i--) { next if $defaults[$i] ne ""; splice @defaults, $i, 1; } # What's left should be just the property (maybe) and the # default. Having only one element means it doesn't have # the property. my $default; my $property; if (@defaults >= 1) { if (@defaults == 1) { $default = $defaults[0]; } else { $property = $defaults[0]; $default = $defaults[1]; } } if (@defaults < 1 || @defaults > 2 || ($default =~ /^$/i && $default !~ /^$/i && $default !~ /^