X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/e904f99525ffc0cd5f09346758a1931019c2f0b0..3e20195b283514bb1c378fb4385e76da91a2ba15:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 5615aee..67ee162 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -1,831 +1,14301 @@ #!/usr/bin/perl -w +# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!! +# Any files created or read by this program should be listed in 'mktables.lst' +# Use -makelist to regenerate it. + +# Needs 'no overloading' to run faster on miniperl. Code commented out at the +# subroutine objaddr can be used instead to work as far back (untested) as +# 5.8: needs pack "U". But almost all occurrences of objaddr have been +# removed in favor of using 'no overloading'. You also would have to go +# through and replace occurrences like: +# my $addr = do { no overloading; pack 'J', $self; } +# with +# my $addr = main::objaddr $self; +# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b +# that instituted the change to main::objaddr, and subsequent commits that +# changed 0+$self to pack 'J', $self.) + +require 5.010_001; +use strict; +use warnings; +use Carp; +use File::Find; +use File::Path; +use File::Spec; +use Text::Tabs; + +sub DEBUG () { 0 } # Set to 0 for production; 1 for development + +########################################################################## +# +# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), +# from the Unicode database files (lib/unicore/.../*.txt), It also generates +# a pod file and a .t file +# +# The structure of this file is: +# First these introductory comments; then +# code needed for everywhere, such as debugging stuff; then +# code to handle input parameters; then +# data structures likely to be of external interest (some of which depend on +# the input parameters, so follows them; then +# more data structures and subroutine and package (class) definitions; then +# the small actual loop to process the input files and finish up; then +# a __DATA__ section, for the .t tests +# +# This program works on all releases of Unicode through at least 5.2. The +# outputs have been scrutinized most intently for release 5.1. The others +# have been checked for somewhat more than just sanity. It can handle all +# existing Unicode character properties in those releases. +# +# This program is mostly about Unicode character (or code point) properties. +# A property describes some attribute or quality of a code point, like if it +# is lowercase or not, its name, what version of Unicode it was first defined +# in, or what its uppercase equivalent is. Unicode deals with these disparate +# possibilities by making all properties into mappings from each code point +# into some corresponding value. In the case of it being lowercase or not, +# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each +# property maps each Unicode code point to a single value, called a "property +# value". (Hence each Unicode property is a true mathematical function with +# exactly one value per code point.) +# +# When using a property in a regular expression, what is desired isn't the +# mapping of the code point to its property's value, but the reverse (or the +# mathematical "inverse relation"): starting with the property value, "Does a +# code point map to it?" These are written in a "compound" form: +# \p{property=value}, e.g., \p{category=punctuation}. This program generates +# files containing the lists of code points that map to each such regular +# expression property value, one file per list # -# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) -# from the Unicode database files (lib/unicore/*.txt). +# There is also a single form shortcut that Perl adds for many of the commonly +# used properties. This happens for all binary properties, plus script, +# general_category, and block properties. # +# Thus the outputs of this program are files. There are map files, mostly in +# the 'To' directory; and there are list files for use in regular expression +# matching, all in subdirectories of the 'lib' directory, with each +# subdirectory being named for the property that the lists in it are for. +# Bookkeeping, test, and documentation files are also generated. -use strict; +my $matches_directory = 'lib'; # Where match (\p{}) files go. +my $map_directory = 'To'; # Where map files go. + +# DATA STRUCTURES +# +# The major data structures of this program are Property, of course, but also +# Table. There are two kinds of tables, very similar to each other. +# "Match_Table" is the data structure giving the list of code points that have +# a particular property value, mentioned above. There is also a "Map_Table" +# data structure which gives the property's mapping from code point to value. +# There are two structures because the match tables need to be combined in +# various ways, such as constructing unions, intersections, complements, etc., +# and the map ones don't. And there would be problems, perhaps subtle, if +# a map table were inadvertently operated on in some of those ways. +# The use of separate classes with operations defined on one but not the other +# prevents accidentally confusing the two. +# +# At the heart of each table's data structure is a "Range_List", which is just +# an ordered list of "Ranges", plus ancillary information, and methods to +# operate on them. A Range is a compact way to store property information. +# Each range has a starting code point, an ending code point, and a value that +# is meant to apply to all the code points between the two end points, +# inclusive. For a map table, this value is the property value for those +# code points. Two such ranges could be written like this: +# 0x41 .. 0x5A, 'Upper', +# 0x61 .. 0x7A, 'Lower' +# +# Each range also has a type used as a convenience to classify the values. +# Most ranges in this program will be Type 0, or normal, but there are some +# ranges that have a non-zero type. These are used only in map tables, and +# are for mappings that don't fit into the normal scheme of things. Mappings +# that require a hash entry to communicate with utf8.c are one example; +# another example is mappings for charnames.pm to use which indicate a name +# that is algorithmically determinable from its code point (and vice-versa). +# These are used to significantly compact these tables, instead of listing +# each one of the tens of thousands individually. +# +# In a match table, the value of a range is irrelevant (and hence the type as +# well, which will always be 0), and arbitrarily set to the null string. +# Using the example above, there would be two match tables for those two +# entries, one named Upper would contain the 0x41..0x5A range, and the other +# named Lower would contain 0x61..0x7A. +# +# Actually, there are two types of range lists, "Range_Map" is the one +# associated with map tables, and "Range_List" with match tables. +# Again, this is so that methods can be defined on one and not the other so as +# to prevent operating on them in incorrect ways. +# +# Eventually, most tables are written out to files to be read by utf8_heavy.pl +# in the perl core. All tables could in theory be written, but some are +# suppressed because there is no current practical use for them. It is easy +# to change which get written by changing various lists that are near the top +# of the actual code in this file. The table data structures contain enough +# ancillary information to allow them to be treated as separate entities for +# writing, such as the path to each one's file. There is a heading in each +# map table that gives the format of its entries, and what the map is for all +# the code points missing from it. (This allows tables to be more compact.) +# +# The Property data structure contains one or more tables. All properties +# contain a map table (except the $perl property which is a +# pseudo-property containing only match tables), and any properties that +# are usable in regular expression matches also contain various matching +# tables, one for each value the property can have. A binary property can +# have two values, True and False (or Y and N, which are preferred by Unicode +# terminology). Thus each of these properties will have a map table that +# takes every code point and maps it to Y or N (but having ranges cuts the +# number of entries in that table way down), and two match tables, one +# which has a list of all the code points that map to Y, and one for all the +# code points that map to N. (For each of these, a third table is also +# generated for the pseudo Perl property. It contains the identical code +# points as the Y table, but can be written, not in the compound form, but in +# a "single" form like \p{IsUppercase}.) Many properties are binary, but some +# properties have several possible values, some have many, and properties like +# Name have a different value for every named code point. Those will not, +# unless the controlling lists are changed, have their match tables written +# out. But all the ones which can be used in regular expression \p{} and \P{} +# constructs will. Generally a property will have either its map table or its +# match tables written but not both. Again, what gets written is controlled +# by lists which can easily be changed. +# +# For information about the Unicode properties, see Unicode's UAX44 document: + +my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; + +# As stated earlier, this program will work on any release of Unicode so far. +# Most obvious problems in earlier data have NOT been corrected except when +# necessary to make Perl or this program work reasonably. For example, no +# folding information was given in early releases, so this program uses the +# substitute of lower case, just so that a regular expression with the /i +# option will do something that actually gives the right results in many +# cases. There are also a couple other corrections for version 1.1.5, +# commented at the point they are made. As an example of corrections that +# weren't made (but could be) is this statement from DerivedAge.txt: "The +# supplementary private use code points and the non-character code points were +# assigned in version 2.0, but not specifically listed in the UCD until +# versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) +# More information on Unicode version glitches is further down in these +# introductory comments. +# +# This program works on all properties as of 5.2, though the files for some +# are suppressed from apparent lack of demand for them. You can change which +# are output by changing lists in this program. +# +# The old version of mktables emphasized the term "Fuzzy" to mean Unocde's +# loose matchings rules (from Unicode TR18): +# +# The recommended names for UCD properties and property values are in +# PropertyAliases.txt [Prop] and PropertyValueAliases.txt +# [PropValue]. There are both abbreviated names and longer, more +# descriptive names. It is strongly recommended that both names be +# recognized, and that loose matching of property names be used, +# whereby the case distinctions, whitespace, hyphens, and underbar +# are ignored. +# The program still allows Fuzzy to override its determination of if loose +# matching should be used, but it isn't currently used, as it is no longer +# needed; the calculations it makes are good enough. +# +# SUMMARY OF HOW IT WORKS: +# +# Process arguments +# +# A list is constructed containing each input file that is to be processed +# +# Each file on the list is processed in a loop, using the associated handler +# code for each: +# The PropertyAliases.txt and PropValueAliases.txt files are processed +# first. These files name the properties and property values. +# Objects are created of all the property and property value names +# that the rest of the input should expect, including all synonyms. +# The other input files give mappings from properties to property +# values. That is, they list code points and say what the mapping +# is under the given property. Some files give the mappings for +# just one property; and some for many. This program goes through +# each file and populates the properties from them. Some properties +# are listed in more than one file, and Unicode has set up a +# precedence as to which has priority if there is a conflict. Thus +# the order of processing matters, and this program handles the +# conflict possibility by processing the overriding input files +# last, so that if necessary they replace earlier values. +# After this is all done, the program creates the property mappings not +# furnished by Unicode, but derivable from what it does give. +# The tables of code points that match each property value in each +# property that is accessible by regular expressions are created. +# The Perl-defined properties are created and populated. Many of these +# require data determined from the earlier steps +# Any Perl-defined synonyms are created, and name clashes between Perl +# and Unicode are reconciled and warned about. +# All the properties are written to files +# Any other files are written, and final warnings issued. +# +# For clarity, a number of operators have been overloaded to work on tables: +# ~ means invert (take all characters not in the set). The more +# conventional '!' is not used because of the possibility of confusing +# it with the actual boolean operation. +# + means union +# - means subtraction +# & means intersection +# The precedence of these is the order listed. Parentheses should be +# copiously used. These are not a general scheme. The operations aren't +# defined for a number of things, deliberately, to avoid getting into trouble. +# Operations are done on references and affect the underlying structures, so +# that the copy constructors for them have been overloaded to not return a new +# clone, but the input object itself. +# +# The bool operator is deliberately not overloaded to avoid confusion with +# "should it mean if the object merely exists, or also is non-empty?". +# +# WHY CERTAIN DESIGN DECISIONS WERE MADE +# +# This program needs to be able to run under miniperl. Therefore, it uses a +# minimum of other modules, and hence implements some things itself that could +# be gotten from CPAN +# +# This program uses inputs published by the Unicode Consortium. These can +# change incompatibly between releases without the Perl maintainers realizing +# it. Therefore this program is now designed to try to flag these. It looks +# at the directories where the inputs are, and flags any unrecognized files. +# It keeps track of all the properties in the files it handles, and flags any +# that it doesn't know how to handle. It also flags any input lines that +# don't match the expected syntax, among other checks. +# +# It is also designed so if a new input file matches one of the known +# templates, one hopefully just needs to add it to a list to have it +# processed. +# +# As mentioned earlier, some properties are given in more than one file. In +# particular, the files in the extracted directory are supposedly just +# reformattings of the others. But they contain information not easily +# derivable from the other files, including results for Unihan, which this +# program doesn't ordinarily look at, and for unassigned code points. They +# also have historically had errors or been incomplete. In an attempt to +# create the best possible data, this program thus processes them first to +# glean information missing from the other files; then processes those other +# files to override any errors in the extracted ones. Much of the design was +# driven by this need to store things and then possibly override them. +# +# It tries to keep fatal errors to a minimum, to generate something usable for +# testing purposes. It always looks for files that could be inputs, and will +# warn about any that it doesn't know how to handle (the -q option suppresses +# the warning). +# +# Why have files written out for binary 'N' matches? +# For binary properties, if you know the mapping for either Y or N; the +# other is trivial to construct, so could be done at Perl run-time by just +# complementing the result, instead of having a file for it. That is, if +# someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and +# not need a file. The problem is communicating to Perl that a given +# property is binary. Perl can't figure it out from looking at the N (or +# No), as some non-binary properties have these as property values. So +# rather than inventing a way to communicate this info back to the core, +# which would have required changes there as well, it was simpler just to +# add the extra tables. +# +# Why is there more than one type of range? +# This simplified things. There are some very specialized code points that +# have to be handled specially for output, such as Hangul syllable names. +# By creating a range type (done late in the development process), it +# allowed this to be stored with the range, and overridden by other input. +# Originally these were stored in another data structure, and it became a +# mess trying to decide if a second file that was for the same property was +# overriding the earlier one or not. +# +# Why are there two kinds of tables, match and map? +# (And there is a base class shared by the two as well.) As stated above, +# they actually are for different things. Development proceeded much more +# smoothly when I (khw) realized the distinction. Map tables are used to +# give the property value for every code point (actually every code point +# that doesn't map to a default value). Match tables are used for regular +# expression matches, and are essentially the inverse mapping. Separating +# the two allows more specialized methods, and error checks so that one +# can't just take the intersection of two map tables, for example, as that +# is nonsensical. +# +# There are no match tables generated for matches of the null string. These +# would look like qr/\p{JSN=}/ currently without modifying the regex code. +# Perhaps something like them could be added if necessary. The JSN does have +# a real code point U+110B that maps to the null string, but it is a +# contributory property, and therefore not output by default. And it's easily +# handled so far by making the null string the default where it is a +# possibility. +# +# DEBUGGING +# +# This program is written so it will run under miniperl. Occasionally changes +# will cause an error where the backtrace doesn't work well under miniperl. +# To diagnose the problem, you can instead run it under regular perl, if you +# have one compiled. +# +# There is a good trace facility. To enable it, first sub DEBUG must be set +# to return true. Then a line like +# +# local $to_trace = 1 if main::DEBUG; +# +# can be added to enable tracing in its lexical scope or until you insert +# another line: +# +# local $to_trace = 0 if main::DEBUG; +# +# then use a line like "trace $a, @b, %c, ...; +# +# Some of the more complex subroutines already have trace statements in them. +# Permanent trace statements should be like: +# +# trace ... if main::DEBUG && $to_trace; +# +# If there is just one or a few files that you're debugging, you can easily +# cause most everything else to be skipped. Change the line +# +# my $debug_skip = 0; +# +# to 1, and every file whose object is in @input_file_objects and doesn't have +# a, 'non_skip => 1,' in its constructor will be skipped. +# +# FUTURE ISSUES +# +# The program would break if Unicode were to change its names so that +# interior white space, underscores, or dashes differences were significant +# within property and property value names. +# +# It might be easier to use the xml versions of the UCD if this program ever +# would need heavy revision, and the ability to handle old versions was not +# required. +# +# There is the potential for name collisions, in that Perl has chosen names +# that Unicode could decide it also likes. There have been such collisions in +# the past, with mostly Perl deciding to adopt the Unicode definition of the +# name. However in the 5.2 Unicode beta testing, there were a number of such +# collisions, which were withdrawn before the final release, because of Perl's +# and other's protests. These all involved new properties which began with +# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also, +# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a +# Unicode document, so they are unlikely to be used by Unicode for another +# purpose. However, they might try something beginning with 'In', or use any +# of the other Perl-defined properties. This program will warn you of name +# collisions, and refuse to generate tables with them, but manual intervention +# will be required in this event. One scheme that could be implemented, if +# necessary, would be to have this program generate another file, or add a +# field to mktables.lst that gives the date of first definition of a property. +# Each new release of Unicode would use that file as a basis for the next +# iteration. And the Perl synonym addition code could sort based on the age +# of the property, so older properties get priority, and newer ones that clash +# would be refused; hence existing code would not be impacted, and some other +# synonym would have to be used for the new property. This is ugly, and +# manual intervention would certainly be easier to do in the short run; lets +# hope it never comes to this. +# +# A NOTE ON UNIHAN +# +# This program can generate tables from the Unihan database. But it doesn't +# by default, letting the CPAN module Unicode::Unihan handle them. Prior to +# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the +# database was split into 8 different files, all beginning with the letters +# 'Unihan'. This program will read those file(s) if present, but it needs to +# know which of the many properties in the file(s) should have tables created +# for them. It will create tables for any properties listed in +# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the +# @cjk_properties array and the @cjk_property_values array. Thus, if a +# property you want is not in those files of the release you are building +# against, you must add it to those two arrays. Starting in 4.0, the +# Unicode_Radical_Stroke was listed in those files, so if the Unihan database +# is present in the directory, a table will be generated for that property. +# In 5.2, several more properties were added. For your convenience, the two +# arrays are initialized with all the 5.2 listed properties that are also in +# earlier releases. But these are commented out. You can just uncomment the +# ones you want, or use them as a template for adding entries for other +# properties. +# +# You may need to adjust the entries to suit your purposes. setup_unihan(), +# and filter_unihan_line() are the functions where this is done. This program +# already does some adjusting to make the lines look more like the rest of the +# Unicode DB; You can see what that is in filter_unihan_line() +# +# There is a bug in the 3.2 data file in which some values for the +# kPrimaryNumeric property have commas and an unexpected comment. A filter +# could be added for these; or for a particular installation, the Unihan.txt +# file could be edited to fix them. +# +# HOW TO ADD A FILE TO BE PROCESSED +# +# A new file from Unicode needs to have an object constructed for it in +# @input_file_objects, probably at the end or at the end of the extracted +# ones. The program should warn you if its name will clash with others on +# restrictive file systems, like DOS. If so, figure out a better name, and +# add lines to the README.perl file giving that. If the file is a character +# property, it should be in the format that Unicode has by default +# standardized for such files for the more recently introduced ones. +# If so, the Input_file constructor for @input_file_objects can just be the +# file name and release it first appeared in. If not, then it should be +# possible to construct an each_line_handler() to massage the line into the +# standardized form. +# +# For non-character properties, more code will be needed. You can look at +# the existing entries for clues. +# +# UNICODE VERSIONS NOTES +# +# The Unicode UCD has had a number of errors in it over the versions. And +# these remain, by policy, in the standard for that version. Therefore it is +# risky to correct them, because code may be expecting the error. So this +# program doesn't generally make changes, unless the error breaks the Perl +# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value +# for U+1105, which causes real problems for the algorithms for Jamo +# calculations, so it is changed here. +# +# But it isn't so clear cut as to what to do about concepts that are +# introduced in a later release; should they extend back to earlier releases +# where the concept just didn't exist? It was easier to do this than to not, +# so that's what was done. For example, the default value for code points not +# in the files for various properties was probably undefined until changed by +# some version. No_Block for blocks is such an example. This program will +# assign No_Block even in Unicode versions that didn't have it. This has the +# benefit that code being written doesn't have to special case earlier +# versions; and the detriment that it doesn't match the Standard precisely for +# the affected versions. +# +# Here are some observations about some of the issues in early versions: +# +# The number of code points in \p{alpha} halve in 2.1.9. It turns out that +# the reason is that the CJK block starting at 4E00 was removed from PropList, +# and was not put back in until 3.1.0 +# +# Unicode introduced the synonym Space for White_Space in 4.1. Perl has +# always had a \p{Space}. In release 3.2 only, they are not synonymous. The +# reason is that 3.2 introduced U+205F=medium math space, which was not +# classed as white space, but Perl figured out that it should have been. 4.0 +# reclassified it correctly. +# +# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2 +# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL +# was left with no code points, as all the ones that mapped to 202 stayed +# mapped to 202. Thus if your program used the numeric name for the class, +# it would not have been affected, but if it used the mnemonic, it would have +# been. +# +# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code +# points which eventually came to have this script property value, instead +# mapped to "Unknown". But in the next release all these code points were +# moved to \p{sc=common} instead. +# +# The default for missing code points for BidiClass is complicated. Starting +# in 3.1.1, the derived file DBidiClass.txt handles this, but this program +# tries to do the best it can for earlier releases. It is done in +# process_PropertyAliases() +# +############################################################################## + +my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing + # and errors +my $MAX_LINE_WIDTH = 78; + +# Debugging aid to skip most files so as to not be distracted by them when +# concentrating on the ones being debugged. Add +# non_skip => 1, +# to the constructor for those files you want processed when you set this. +# Files with a first version number of 0 are special: they are always +# processed regardless of the state of this flag. +my $debug_skip = 0; + +# Set to 1 to enable tracing. +our $to_trace = 0; + +{ # Closure for trace: debugging aid + my $print_caller = 1; # ? Include calling subroutine name + my $main_with_colon = 'main::'; + my $main_colon_length = length($main_with_colon); + + sub trace { + return unless $to_trace; # Do nothing if global flag not set + + my @input = @_; + + local $DB::trace = 0; + $DB::trace = 0; # Quiet 'used only once' message + + my $line_number; + + # Loop looking up the stack to get the first non-trace caller + my $caller_line; + my $caller_name; + my $i = 0; + do { + $line_number = $caller_line; + (my $pkg, my $file, $caller_line, my $caller) = caller $i++; + $caller = $main_with_colon unless defined $caller; + + $caller_name = $caller; + + # get rid of pkg + $caller_name =~ s/.*:://; + if (substr($caller_name, 0, $main_colon_length) + eq $main_with_colon) + { + $caller_name = substr($caller_name, $main_colon_length); + } + + } until ($caller_name ne 'trace'); + + # If the stack was empty, we were called from the top level + $caller_name = 'main' if ($caller_name eq "" + || $caller_name eq 'trace'); + + my $output = ""; + foreach my $string (@input) { + #print STDERR __LINE__, ": ", join ", ", @input, "\n"; + if (ref $string eq 'ARRAY' || ref $string eq 'HASH') { + $output .= simple_dumper($string); + } + else { + $string = "$string" if ref $string; + $string = $UNDEF unless defined $string; + chomp $string; + $string = '""' if $string eq ""; + $output .= " " if $output ne "" + && $string ne "" + && substr($output, -1, 1) ne " " + && substr($string, 0, 1) ne " "; + $output .= $string; + } + } + + print STDERR sprintf "%4d: ", $line_number if defined $line_number; + print STDERR "$caller_name: " if $print_caller; + print STDERR $output, "\n"; + return; + } +} -my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1. +# This is for a rarely used development feature that allows you to compare two +# versions of the Unicode standard without having to deal with changes caused +# by the code points introduced in the later verson. Change the 0 to a SINGLE +# dotted Unicode release number (e.g. 2.1). Only code points introduced in +# that release and earlier will be used; later ones are thrown away. You use +# the version number of the earliest one you want to compare; then run this +# program on directory structures containing each release, and compare the +# outputs. These outputs will therefore include only the code points common +# to both releases, and you can see the changes caused just by the underlying +# release semantic changes. For versions earlier than 3.2, you must copy a +# version of DAge.txt into the directory. +my $string_compare_versions = DEBUG && 0; # e.g., v2.1; +my $compare_versions = DEBUG + && $string_compare_versions + && pack "C*", split /\./, $string_compare_versions; + +sub uniques { + # Returns non-duplicated input values. From "Perl Best Practices: + # Encapsulated Cleverness". p. 455 in first edition. + + my %seen; + # Arguably this breaks encapsulation, if the goal is to permit multiple + # distinct objects to stringify to the same value, and be interchangeable. + # However, for this program, no two objects stringify identically, and all + # lists passed to this function are either objects or strings. So this + # doesn't affect correctness, but it does give a couple of percent speedup. + no overloading; + return grep { ! $seen{$_}++ } @_; +} + +$0 = File::Spec->canonpath($0); + +my $make_test_script = 0; # ? Should we output a test script +my $write_unchanged_files = 0; # ? Should we update the output files even if + # we don't think they have changed +my $use_directory = ""; # ? Should we chdir somewhere. +my $pod_directory; # input directory to store the pod file. +my $pod_file = 'perluniprops'; +my $t_path; # Path to the .t test file +my $file_list = 'mktables.lst'; # File to store input and output file names. + # This is used to speed up the build, by not + # executing the main body of the program if + # nothing on the list has changed since the + # previous build +my $make_list = 1; # ? Should we write $file_list. Set to always + # make a list so that when the pumpking is + # preparing a release, s/he won't have to do + # special things +my $glob_list = 0; # ? Should we try to include unknown .txt files + # in the input. +my $output_range_counts = 1; # ? Should we include the number of code points + # in ranges in the output +my $output_names = 0; # ? Should character names be in the output +my @viacode; # Contains the 1 million character names, if + # $output_names is true + +# Verbosity levels; 0 is quiet +my $NORMAL_VERBOSITY = 1; +my $PROGRESS = 2; +my $VERBOSE = 3; + +my $verbosity = $NORMAL_VERBOSITY; + +# Process arguments +while (@ARGV) { + my $arg = shift @ARGV; + if ($arg eq '-v') { + $verbosity = $VERBOSE; + } + elsif ($arg eq '-p') { + $verbosity = $PROGRESS; + $| = 1; # Flush buffers as we go. + } + elsif ($arg eq '-q') { + $verbosity = 0; + } + elsif ($arg eq '-w') { + $write_unchanged_files = 1; # update the files even if havent changed + } + elsif ($arg eq '-check') { + my $this = shift @ARGV; + my $ok = shift @ARGV; + if ($this ne $ok) { + print "Skipping as check params are not the same.\n"; + exit(0); + } + } + elsif ($arg eq '-P' && defined ($pod_directory = shift)) { + -d $pod_directory or croak "Directory '$pod_directory' doesn't exist"; + } + elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift))) + { + $make_test_script = 1; + } + elsif ($arg eq '-makelist') { + $make_list = 1; + } + elsif ($arg eq '-C' && defined ($use_directory = shift)) { + -d $use_directory or croak "Unknown directory '$use_directory'"; + } + elsif ($arg eq '-L') { -mkdir("In", 0755); -mkdir("Is", 0755); -mkdir("To", 0755); + # Existence not tested until have chdir'd + $file_list = shift; + } + elsif ($arg eq '-globlist') { + $glob_list = 1; + } + elsif ($arg eq '-c') { + $output_range_counts = ! $output_range_counts + } + elsif ($arg eq '-output_names') { + $output_names = 1; + } + else { + my $with_c = 'with'; + $with_c .= 'out' if $output_range_counts; # Complements the state + croak <[-1]->[1] = $last; +# Change directories now, because need to read 'version' early. +if ($use_directory) { + if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) { + $pod_directory = File::Spec->rel2abs($pod_directory); + } + if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) { + $t_path = File::Spec->rel2abs($t_path); + } + chdir $use_directory or croak "Failed to chdir to '$use_directory':$!"; + if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) { + $pod_directory = File::Spec->abs2rel($pod_directory); + } + if ($t_path && File::Spec->file_name_is_absolute($t_path)) { + $t_path = File::Spec->abs2rel($t_path); + } } -sub append { - my ($table, $code, $name) = @_; - if (@$table && - hex($table->[-1]->[1]) == hex($code) - 1 && - (!defined $name || $table->[-1]->[2] eq $name)) { - extend($table, $code); - } else { - push @$table, [$code, $code, $name]; +# Get Unicode version into regular and v-string. This is done now because +# various tables below get populated based on it. These tables are populated +# here to be near the top of the file, and so easily seeable by those needing +# to modify things. +open my $VERSION, "<", "version" + or croak "$0: can't open required file 'version': $!\n"; +my $string_version = <$VERSION>; +close $VERSION; +chomp $string_version; +my $v_version = pack "C*", split /\./, $string_version; # v string + +# The following are the complete names of properties with property values that +# are known to not match any code points in some versions of Unicode, but that +# may change in the future so they should be matchable, hence an empty file is +# generated for them. +my @tables_that_may_be_empty = ( + 'Joining_Type=Left_Joining', + ); +push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; +push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; +push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' + if $v_version ge v4.1.0; + +# The lists below are hashes, so the key is the item in the list, and the +# value is the reason why it is in the list. This makes generation of +# documentation easier. + +my %why_suppressed; # No file generated for these. + +# Files aren't generated for empty extraneous properties. This is arguable. +# Extraneous properties generally come about because a property is no longer +# used in a newer version of Unicode. If we generated a file without code +# points, programs that used to work on that property will still execute +# without errors. It just won't ever match (or will always match, with \P{}). +# This means that the logic is now likely wrong. I (khw) think its better to +# find this out by getting an error message. Just move them to the table +# above to change this behavior +my %why_suppress_if_empty_warn_if_not = ( + + # It is the only property that has ever officially been removed from the + # Standard. The database never contained any code points for it. + 'Special_Case_Condition' => 'Obsolete', + + # Apparently never official, but there were code points in some versions of + # old-style PropList.txt + 'Non_Break' => 'Obsolete', +); + +# These would normally go in the warn table just above, but they were changed +# a long time before this program was written, so warnings about them are +# moot. +if ($v_version gt v3.2.0) { + push @tables_that_may_be_empty, + 'Canonical_Combining_Class=Attached_Below_Left' +} + +# These are listed in the Property aliases file in 5.2, but Unihan is ignored +# unless explicitly added. +if ($v_version ge v5.2.0) { + my $unihan = 'Unihan; remove from list if using Unihan'; + foreach my $table (qw ( + kAccountingNumeric + kOtherNumeric + kPrimaryNumeric + kCompatibilityVariant + kIICore + kIRG_GSource + kIRG_HSource + kIRG_JSource + kIRG_KPSource + kIRG_MSource + kIRG_KSource + kIRG_TSource + kIRG_USource + kIRG_VSource + kRSUnicode + )) + { + $why_suppress_if_empty_warn_if_not{$table} = $unihan; } } -sub inverse { - my ($table) = @_; - my $inverse = []; - my ($first, $last); - if ($table->[0]->[0]) { - $last = hex($table->[0]->[0]); - push @$inverse, [ "0000", - sprintf("%04X", $last - 1) ]; +# Properties that this program ignores. +my @unimplemented_properties = ( +'Unicode_Radical_Stroke' # Remove if changing to handle this one. +); + +# There are several types of obsolete properties defined by Unicode. These +# must be hand-edited for every new Unicode release. +my %why_deprecated; # Generates a deprecated warning message if used. +my %why_stabilized; # Documentation only +my %why_obsolete; # Documentation only + +{ # Closure + my $simple = 'Perl uses the more complete version of this property'; + my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan'; + + my $other_properties = 'other properties'; + my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; + my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)", + + %why_deprecated = ( + 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead', + 'Jamo_Short_Name' => $contributory, + 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', + 'Other_Alphabetic' => $contributory, + 'Other_Default_Ignorable_Code_Point' => $contributory, + 'Other_Grapheme_Extend' => $contributory, + 'Other_ID_Continue' => $contributory, + 'Other_ID_Start' => $contributory, + 'Other_Lowercase' => $contributory, + 'Other_Math' => $contributory, + 'Other_Uppercase' => $contributory, + ); + + %why_suppressed = ( + # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which + # contains the same information, but without the algorithmically + # determinable Hangul syllables'. This file is not published, so it's + # existence is not noted in the comment. + 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize', + + 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2', + 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames", + + 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold", + 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", + 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", + 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", + + 'Name' => "Accessible via 'use charnames;'", + 'Name_Alias' => "Accessible via 'use charnames;'", + + # These are sort of jumping the gun; deprecation is proposed for + # Unicode version 6.0, but they have never been exposed by Perl, and + # likely are soon to be deprecated, so best not to expose them. + FC_NFKC_Closure => 'Use NFKC_Casefold instead', + Expands_On_NFC => $why_no_expand, + Expands_On_NFD => $why_no_expand, + Expands_On_NFKC => $why_no_expand, + Expands_On_NFKD => $why_no_expand, + ); + + # The following are suppressed because they were made contributory or + # deprecated by Unicode before Perl ever thought about supporting them. + foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') { + $why_suppressed{$property} = $why_deprecated{$property}; } - for my $i (0..$#$table-1) { - $first = defined $table->[$i ]->[1] ? - hex($table->[$i ]->[1]) : 0; - $last = defined $table->[$i + 1]->[0] ? - hex($table->[$i + 1]->[0]) : $first; - push @$inverse, [ sprintf("%04X", $first + 1), - sprintf("%04X", $last - 1) ] - unless $first + 1 == $last; + + # Customize the message for all the 'Other_' properties + foreach my $property (keys %why_deprecated) { + next if (my $main_property = $property) !~ s/^Other_//; + $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/; } - return $inverse; } -sub header { - my $fh = shift; +if ($v_version ge 4.0.0) { + $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14'; +} +if ($v_version ge 5.2.0) { + $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; +} - print $fh < +## @missing: 0000..10FFFF; cjkIICore; +## @missing: 0000..10FFFF; cjkIRG_GSource; +## @missing: 0000..10FFFF; cjkIRG_HSource; +## @missing: 0000..10FFFF; cjkIRG_JSource; +## @missing: 0000..10FFFF; cjkIRG_KPSource; +## @missing: 0000..10FFFF; cjkIRG_KSource; +## @missing: 0000..10FFFF; cjkIRG_TSource; +## @missing: 0000..10FFFF; cjkIRG_USource; +## @missing: 0000..10FFFF; cjkIRG_VSource; +## @missing: 0000..10FFFF; cjkOtherNumeric; NaN +## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN +## @missing: 0000..10FFFF; cjkRSUnicode; END -EOT + +# The input files don't list every code point. Those not listed are to be +# defaulted to some value. Below are hard-coded what those values are for +# non-binary properties as of 5.1. Starting in 5.0, there are +# machine-parsable comment lines in the files the give the defaults; so this +# list shouldn't have to be extended. The claim is that all missing entries +# for binary properties will default to 'N'. Unicode tried to change that in +# 5.2, but the beta period produced enough protest that they backed off. +# +# The defaults for the fields that appear in UnicodeData.txt in this hash must +# be in the form that it expects. The others may be synonyms. +my $CODE_POINT = ''; +my %default_mapping = ( + Age => "Unassigned", + # Bidi_Class => Complicated; set in code + Bidi_Mirroring_Glyph => "", + Block => 'No_Block', + Canonical_Combining_Class => 0, + Case_Folding => $CODE_POINT, + Decomposition_Mapping => $CODE_POINT, + Decomposition_Type => 'None', + East_Asian_Width => "Neutral", + FC_NFKC_Closure => $CODE_POINT, + General_Category => 'Cn', + Grapheme_Cluster_Break => 'Other', + Hangul_Syllable_Type => 'NA', + ISO_Comment => "", + Jamo_Short_Name => "", + Joining_Group => "No_Joining_Group", + # Joining_Type => Complicated; set in code + kIICore => 'N', # Is converted to binary + #Line_Break => Complicated; set in code + Lowercase_Mapping => $CODE_POINT, + Name => "", + Name_Alias => "", + NFC_QC => 'Yes', + NFD_QC => 'Yes', + NFKC_QC => 'Yes', + NFKD_QC => 'Yes', + Numeric_Type => 'None', + Numeric_Value => 'NaN', + Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown', + Sentence_Break => 'Other', + Simple_Case_Folding => $CODE_POINT, + Simple_Lowercase_Mapping => $CODE_POINT, + Simple_Titlecase_Mapping => $CODE_POINT, + Simple_Uppercase_Mapping => $CODE_POINT, + Titlecase_Mapping => $CODE_POINT, + Unicode_1_Name => "", + Unicode_Radical_Stroke => "", + Uppercase_Mapping => $CODE_POINT, + Word_Break => 'Other', +); + +# Below are files that Unicode furnishes, but this program ignores, and why +my %ignored_files = ( + 'CJKRadicals.txt' => 'Unihan data', + 'Index.txt' => 'An index, not actual data', + 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.', + 'NamesList.txt' => 'Just adds commentary', + 'NormalizationCorrections.txt' => 'Data is already in other files.', + 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases', + 'ReadMe.txt' => 'Just comments', + 'README.TXT' => 'Just comments', + 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped', +); + +### End of externally interesting definitions, except for @input_file_objects + +my $HEADER=<<"EOF"; +# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! +# This file is machine-generated by $0 from the Unicode +# database, Version $string_version. Any changes made here will be lost! +EOF + +my $INTERNAL_ONLY=<<"EOF"; + +# !!!!!!! INTERNAL PERL USE ONLY !!!!!!! +# This file is for internal use by the Perl program only. The format and even +# the name or existence of this file are subject to change without notice. +# Don't use it directly. +EOF + +my $DEVELOPMENT_ONLY=<<"EOF"; +# !!!!!!! DEVELOPMENT USE ONLY !!!!!!! +# This file contains information artificially constrained to code points +# present in Unicode release $string_compare_versions. +# IT CANNOT BE RELIED ON. It is for use during development only and should +# not be used for production. + +EOF + +my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF"; +my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING; +my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1; + +# Matches legal code point. 4-6 hex numbers, If there are 6, the first +# two must be 10; if there are 5, the first must not be a 0. Written this way +# to decrease backtracking +my $code_point_re = + qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; + +# This matches the beginning of the line in the Unicode db files that give the +# defaults for code points not listed (i.e., missing) in the file. The code +# depends on this ending with a semi-colon, so it can assume it is a valid +# field when the line is split() by semi-colons +my $missing_defaults_prefix = + qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/; + +# Property types. Unicode has more types, but these are sufficient for our +# purposes. +my $UNKNOWN = -1; # initialized to illegal value +my $NON_STRING = 1; # Either binary or enum +my $BINARY = 2; +my $ENUM = 3; # Include catalog +my $STRING = 4; # Anything else: string or misc + +# Some input files have lines that give default values for code points not +# contained in the file. Sometimes these should be ignored. +my $NO_DEFAULTS = 0; # Must evaluate to false +my $NOT_IGNORED = 1; +my $IGNORED = 2; + +# Range types. Each range has a type. Most ranges are type 0, for normal, +# and will appear in the main body of the tables in the output files, but +# there are other types of ranges as well, listed below, that are specially +# handled. There are pseudo-types as well that will never be stored as a +# type, but will affect the calculation of the type. + +# 0 is for normal, non-specials +my $MULTI_CP = 1; # Sequence of more than code point +my $HANGUL_SYLLABLE = 2; +my $CP_IN_NAME = 3; # The NAME contains the code point appended to it. +my $NULL = 4; # The map is to the null string; utf8.c can't + # handle these, nor is there an accepted syntax + # for them in \p{} constructs +my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would + # otherwise be $MULTI_CP type are instead type 0 + +# process_generic_property_file() can accept certain overrides in its input. +# Each of these must begin AND end with $CMD_DELIM. +my $CMD_DELIM = "\a"; +my $REPLACE_CMD = 'replace'; # Override the Replace +my $MAP_TYPE_CMD = 'map_type'; # Override the Type + +my $NO = 0; +my $YES = 1; + +# Values for the Replace argument to add_range. +# $NO # Don't replace; add only the code points not + # already present. +my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in + # the comments at the subroutine definition. +my $UNCONDITIONALLY = 2; # Replace without conditions. +my $MULTIPLE = 4; # Don't replace, but add a duplicate record if + # already there + +# Flags to give property statuses. The phrases are to remind maintainers that +# if the flag is changed, the indefinite article referring to it in the +# documentation may need to be as well. +my $NORMAL = ""; +my $SUPPRESSED = 'z'; # The character should never actually be seen, since + # it is suppressed +my $PLACEHOLDER = 'P'; # Implies no pod entry generated +my $DEPRECATED = 'D'; +my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; +my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; +my $DISCOURAGED = 'X'; +my $a_bold_discouraged = "an 'B<$DISCOURAGED>'"; +my $A_bold_discouraged = "An 'B<$DISCOURAGED>'"; +my $STRICTER = 'T'; +my $a_bold_stricter = "a 'B<$STRICTER>'"; +my $A_bold_stricter = "A 'B<$STRICTER>'"; +my $STABILIZED = 'S'; +my $a_bold_stabilized = "an 'B<$STABILIZED>'"; +my $A_bold_stabilized = "An 'B<$STABILIZED>'"; +my $OBSOLETE = 'O'; +my $a_bold_obsolete = "an 'B<$OBSOLETE>'"; +my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; + +my %status_past_participles = ( + $DISCOURAGED => 'discouraged', + $SUPPRESSED => 'should never be generated', + $STABILIZED => 'stabilized', + $OBSOLETE => 'obsolete', + $DEPRECATED => 'deprecated', +); + +# The format of the values of the map tables: +my $BINARY_FORMAT = 'b'; +my $DECIMAL_FORMAT = 'd'; +my $FLOAT_FORMAT = 'f'; +my $INTEGER_FORMAT = 'i'; +my $HEX_FORMAT = 'x'; +my $RATIONAL_FORMAT = 'r'; +my $STRING_FORMAT = 's'; + +my %map_table_formats = ( + $BINARY_FORMAT => 'binary', + $DECIMAL_FORMAT => 'single decimal digit', + $FLOAT_FORMAT => 'floating point number', + $INTEGER_FORMAT => 'integer', + $HEX_FORMAT => 'positive hex whole number; a code point', + $RATIONAL_FORMAT => 'rational: an integer or a fraction', + $STRING_FORMAT => 'arbitrary string', +); + +# Unicode didn't put such derived files in a separate directory at first. +my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; +my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; +my $AUXILIARY = 'auxiliary'; + +# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl +my %loose_to_file_of; # loosely maps table names to their respective + # files +my %stricter_to_file_of; # same; but for stricter mapping. +my %nv_floating_to_rational; # maps numeric values floating point numbers to + # their rational equivalent +my %loose_property_name_of; # Loosely maps property names to standard form + +# These constants names and values were taken from the Unicode standard, +# version 5.1, section 3.12. They are used in conjunction with Hangul +# syllables. The '_string' versions are so generated tables can retain the +# hex format, which is the more familiar value +my $SBase_string = "0xAC00"; +my $SBase = CORE::hex $SBase_string; +my $LBase_string = "0x1100"; +my $LBase = CORE::hex $LBase_string; +my $VBase_string = "0x1161"; +my $VBase = CORE::hex $VBase_string; +my $TBase_string = "0x11A7"; +my $TBase = CORE::hex $TBase_string; +my $SCount = 11172; +my $LCount = 19; +my $VCount = 21; +my $TCount = 28; +my $NCount = $VCount * $TCount; + +# For Hangul syllables; These store the numbers from Jamo.txt in conjunction +# with the above published constants. +my %Jamo; +my %Jamo_L; # Leading consonants +my %Jamo_V; # Vowels +my %Jamo_T; # Trailing consonants + +my @backslash_X_tests; # List of tests read in for testing \X +my @unhandled_properties; # Will contain a list of properties found in + # the input that we didn't process. +my @match_properties; # Properties that have match tables, to be + # listed in the pod +my @map_properties; # Properties that get map files written +my @named_sequences; # NamedSequences.txt contents. +my %potential_files; # Generated list of all .txt files in the directory + # structure so we can warn if something is being + # ignored. +my @files_actually_output; # List of files we generated. +my @more_Names; # Some code point names are compound; this is used + # to store the extra components of them. +my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at + # the minimum before we consider it equivalent to a + # candidate rational +my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms + +# These store references to certain commonly used property objects +my $gc; +my $perl; +my $block; +my $perl_charname; +my $print; + +# Are there conflicting names because of beginning with 'In_', or 'Is_' +my $has_In_conflicts = 0; +my $has_Is_conflicts = 0; + +sub internal_file_to_platform ($) { + # Convert our file paths which have '/' separators to those of the + # platform. + + my $file = shift; + return undef unless defined $file; + + return File::Spec->join(split '/', $file); } -sub flush { - my ($table, $file) = @_; - print "$file\n"; - if (open(my $fh, ">$file")) { - header($fh); - begin($fh); - for my $i (@$table) { - print $fh $i->[0], "\t", - $i->[1] ne $i->[0] ? $i->[1] : "", "\t", - defined $i->[2] ? $i->[2] : "", "\n"; - } - end($fh); - close($fh); - } else { - die "$0: $file: $!\n"; - } -} - -# -# The %In contains the mapping of the script/block name into a number. -# - -my %In; -my $InId = 0; -my %InIn; - -my %InScript; -my %InBlock; - -# -# Read in the Unicode.txt, the main Unicode database. -# - -my %Cat; -my %General; -my @General; - -sub gencat { - my ($Name, $GeneralH, $GeneralA, $Cat, - $name, $cat, $code, $op) = @_; - - $op->($Name, $code, $name); - $op->($GeneralA, $code, $cat); - - $op->($GeneralH->{$name} ||= [], $code, $name); - - $op->($Cat->{$cat} ||= [], $code); - $op->($Cat->{substr($cat, 0, 1)} - ||= [], $code); - # 005F: SPACING UNDERSCORE - $op->($Cat->{Word} ||= [], $code) - if $cat =~ /^[LMN]/ or $code eq "005F"; - $op->($Cat->{Alnum} ||= [], $code) - if $cat =~ /^[LMN]/; - $op->($Cat->{Alpha} ||= [], $code) - if $cat =~ /^[LM]/; - # 0009: HORIZONTAL TABULATION - # 000A: LINE FEED - # 000B: VERTICAL TABULATION - # 000C: FORM FEED - # 000D: CARRIAGE RETURN - # 0020: SPACE - $op->($Cat->{Space} ||= [], $code) - if $cat =~ /^Z/ || - $code =~ /^(0009|000A|000B|000C|000D)$/; - $op->($Cat->{SpacePerl} ||= [], $code) - if $cat =~ /^Z/ || - $code =~ /^(0009|000A|000C|000D)$/; - $op->($Cat->{Blank} ||= [], $code) - if $code =~ /^(0020|0009)$/ || - $cat =~ /^Z[^lp]$/; - $op->($Cat->{Digit} ||= [], $code) if $cat eq "Nd"; - $op->($Cat->{Upper} ||= [], $code) if $cat eq "Lu"; - $op->($Cat->{Lower} ||= [], $code) if $cat eq "Ll"; - $op->($Cat->{Title} ||= [], $code) if $cat eq "Lt"; - $op->($Cat->{ASCII} ||= [], $code) if $code le "007F"; - $op->($Cat->{Cntrl} ||= [], $code) if $cat =~ /^C/; - $op->($Cat->{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/; - $op->($Cat->{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/; - $op->($Cat->{Punct} ||= [], $code) if $cat =~ /^P/; - # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f - $op->($Cat->{XDigit} ||= [], $code) - if $code =~ /^00(3[0-9]|[46][1-6])$/; - +sub file_exists ($) { # platform independent '-e'. This program internally + # uses slash as a path separator. + my $file = shift; + return 0 if ! defined $file; + return -e internal_file_to_platform($file); } -if (open(my $Unicode, "Unicode.txt")) { - my @Name; - my @Bidi; - my %Bidi; - my @Comb; - my @Deco; - my %Deco; - my %DC; - my @Number; - my @Mirrored; - my %To; - while (<$Unicode>) { - next unless /^[0-9A-Fa-f]+;/; - s/\s+$//; +sub objaddr($) { + # Returns the address of the blessed input object. + # It doesn't check for blessedness because that would do a string eval + # every call, and the program is structured so that this is never called + # for a non-blessed object. - my ($code, $name, $cat, $comb, $bidi, $deco, - $decimal, $digit, $number, - $mirrored, $unicode10, $comment, - $upper, $lower, $title) = split(/\s*;\s*/); + no overloading; # If overloaded, numifying below won't work. + + # Numifying a ref gives its address. + return pack 'J', $_[0]; +} - if ($name =~ /^<(.+), (First|Last)>$/) { - $name = $1; - gencat(\@Name, \%General, \@General, \%Cat, - $name, $cat, $code, - $2 eq 'First' ? \&append : \&extend); - unless (defined $In{$name}) { - $In{$name} = $InId++; - $InIn{$name} = $General{$name}; - } - } else { +# Commented code below should work on Perl 5.8. +## This 'require' doesn't necessarily work in miniperl, and even if it does, +## the native perl version of it (which is what would operate under miniperl) +## is extremely slow, as it does a string eval every call. +#my $has_fast_scalar_util = $ !~ /miniperl/ +# && defined eval "require Scalar::Util"; +# +#sub objaddr($) { +# # Returns the address of the blessed input object. Uses the XS version if +# # available. It doesn't check for blessedness because that would do a +# # string eval every call, and the program is structured so that this is +# # never called for a non-blessed object. +# +# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util; +# +# # Check at least that is a ref. +# my $pkg = ref($_[0]) or return undef; +# +# # Change to a fake package to defeat any overloaded stringify +# bless $_[0], 'main::Fake'; +# +# # Numifying a ref gives its address. +# my $addr = pack 'J', $_[0]; +# +# # Return to original class +# bless $_[0], $pkg; +# return $addr; +#} + +sub max ($$) { + my $a = shift; + my $b = shift; + return $a if $a >= $b; + return $b; +} - gencat(\@Name, \%General, \@General, \%Cat, - $name, $cat, $code, \&append); +sub min ($$) { + my $a = shift; + my $b = shift; + return $a if $a <= $b; + return $b; +} - append($To{Upper} ||= [], $code, $upper) if $upper; - append($To{Lower} ||= [], $code, $lower) if $lower; - append($To{Title} ||= [], $code, $title) if $title; - append($To{Digit} ||= [], $code, $decimal) if $decimal; - - append(\@Bidi, $code, $bidi); - append($Bidi{$bidi} ||= [], $code); - - append(\@Comb, $code, $comb) if $comb; - - if ($deco) { - append(\@Deco, $code, $deco); - if ($deco =~/^<(\w+)>/) { - append($Deco{Compat} ||= [], $code); - append($DC{$1} ||= [], $code); - } else { - append($Deco{Canon} ||= [], $code); - } - } - - append(\@Number, $code, $number) if $number; - - append(\@Mirrored, $code) if $mirrored eq "Y"; - } +sub clarify_number ($) { + # This returns the input number with underscores inserted every 3 digits + # in large (5 digits or more) numbers. Input must be entirely digits, not + # checked. + + my $number = shift; + my $pos = length($number) - 3; + return $number if $pos <= 1; + while ($pos > 0) { + substr($number, $pos, 0) = '_'; + $pos -= 3; } + return $number; +} - flush(\@Name, "Name.pl"); - foreach my $cat (sort keys %Cat) { - flush($Cat{$cat}, "Is/$cat.pl"); - } +package Carp; - foreach my $to (sort keys %To) { - flush($To{$to}, "To/$to.pl"); - } +# These routines give a uniform treatment of messages in this program. They +# are placed in the Carp package to cause the stack trace to not include them, +# although an alternative would be to use another package and set @CARP_NOT +# for it. - flush(\@Bidi, "Bidirectional.pl"); - foreach my $bidi (sort keys %Bidi) { - flush($Bidi{$bidi}, "Is/Bidi$bidi.pl"); - } +our $Verbose = 1 if main::DEBUG; # Useful info when debugging - flush(\@Comb, "CombiningClass.pl"); +# This is a work-around suggested by Nicholas Clark to fix a problem with Carp +# and overload trying to load Scalar:Util under miniperl. See +# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html +undef $overload::VERSION; - flush(\@Deco, "Decomposition.pl"); - foreach my $deco (sort keys %Deco) { - flush($Deco{$deco}, "Is/Deco$deco.pl"); - } - foreach my $dc (sort keys %DC) { - flush($DC{$dc}, "Is/DC$dc.pl"); +sub my_carp { + my $message = shift || ""; + my $nofold = shift || 0; + + if ($message) { + $message = main::join_lines($message); + $message =~ s/^$0: *//; # Remove initial program name + $message =~ s/[.;,]+$//; # Remove certain ending punctuation + $message = "\n$0: $message;"; + + # Fold the message with program name, semi-colon end punctuation + # (which looks good with the message that carp appends to it), and a + # hanging indent for continuation lines. + $message = main::simple_fold($message, "", 4) unless $nofold; + $message =~ s/\n$//; # Remove the trailing nl so what carp + # appends is to the same line } - flush(\@Number, "Number.pl"); + return $message if defined wantarray; # If a caller just wants the msg + + carp $message; + return; +} + +sub my_carp_bug { + # This is called when it is clear that the problem is caused by a bug in + # this program. - flush(\@Mirrored, "Is/Mirrored.pl"); -} else { - die "$0: Unicode.txt: $!\n"; + my $message = shift; + $message =~ s/^$0: *//; + $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message"); + carp $message; + return; } -# The general cateory can be written out already now. +sub carp_too_few_args { + if (@_ != 2) { + my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken."); + return; + } -flush(\@General, "Category.pl"); + my $args_ref = shift; + my $count = shift; -# -# Read in the LineBrk.txt. -# + my_carp_bug("Need at least $count arguments to " + . (caller 1)[3] + . ". Instead got: '" + . join ', ', @$args_ref + . "'. No action taken."); + return; +} -if (open(my $LineBrk, "LineBrk.txt")) { - my @Lbrk; - my %Lbrk; +sub carp_extra_args { + my $args_ref = shift; + my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_; - while (<$LineBrk>) { - next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; + unless (ref $args_ref) { + my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments."); + return; + } + my ($package, $file, $line) = caller; + my $subroutine = (caller 1)[3]; + + my $list; + if (ref $args_ref eq 'HASH') { + foreach my $key (keys %$args_ref) { + $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key}; + } + $list = join ', ', each %{$args_ref}; + } + elsif (ref $args_ref eq 'ARRAY') { + foreach my $arg (@$args_ref) { + $arg = $UNDEF unless defined $arg; + } + $list = join ', ', @$args_ref; + } + else { + my_carp_bug("Can't cope with ref " + . ref($args_ref) + . " . argument to 'carp_extra_args'. Not checking arguments."); + return; + } - my ($first, $last, $lbrk) = ($1, $2, $3); + my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped."); + return; +} - append(\@Lbrk, $first, $lbrk); - append($Lbrk{$lbrk} ||= [], $first); - if (defined $last) { - extend(\@Lbrk, $last); - extend($Lbrk{$lbrk}, $last); - } +package main; + +{ # Closure + + # This program uses the inside-out method for objects, as recommended in + # "Perl Best Practices". This closure aids in generating those. There + # are two routines. setup_package() is called once per package to set + # things up, and then set_access() is called for each hash representing a + # field in the object. These routines arrange for the object to be + # properly destroyed when no longer used, and for standard accessor + # functions to be generated. If you need more complex accessors, just + # write your own and leave those accesses out of the call to set_access(). + # More details below. + + my %constructor_fields; # fields that are to be used in constructors; see + # below + + # The values of this hash will be the package names as keys to other + # hashes containing the name of each field in the package as keys, and + # references to their respective hashes as values. + my %package_fields; + + sub setup_package { + # Sets up the package, creating standard DESTROY and dump methods + # (unless already defined). The dump method is used in debugging by + # simple_dumper(). + # The optional parameters are: + # a) a reference to a hash, that gets populated by later + # set_access() calls with one of the accesses being + # 'constructor'. The caller can then refer to this, but it is + # not otherwise used by these two routines. + # b) a reference to a callback routine to call during destruction + # of the object, before any fields are actually destroyed + + my %args = @_; + my $constructor_ref = delete $args{'Constructor_Fields'}; + my $destroy_callback = delete $args{'Destroy_Callback'}; + Carp::carp_extra_args(\@_) if main::DEBUG && %args; + + my %fields; + my $package = (caller)[0]; + + $package_fields{$package} = \%fields; + $constructor_fields{$package} = $constructor_ref; + + unless ($package->can('DESTROY')) { + my $destroy_name = "${package}::DESTROY"; + no strict "refs"; + + # Use typeglob to give the anonymous subroutine the name we want + *$destroy_name = sub { + my $self = shift; + my $addr = do { no overloading; pack 'J', $self; }; + + $self->$destroy_callback if $destroy_callback; + foreach my $field (keys %{$package_fields{$package}}) { + #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n"; + delete $package_fields{$package}{$field}{$addr}; + } + return; + } + } + + unless ($package->can('dump')) { + my $dump_name = "${package}::dump"; + no strict "refs"; + *$dump_name = sub { + my $self = shift; + return dump_inside_out($self, $package_fields{$package}, @_); + } + } + return; } - flush(\@Lbrk, "Lbrk.pl"); - foreach my $lbrk (sort keys %Lbrk) { - flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl"); + sub set_access { + # Arrange for the input field to be garbage collected when no longer + # needed. Also, creates standard accessor functions for the field + # based on the optional parameters-- none if none of these parameters: + # 'addable' creates an 'add_NAME()' accessor function. + # 'readable' or 'readable_array' creates a 'NAME()' accessor + # function. + # 'settable' creates a 'set_NAME()' accessor function. + # 'constructor' doesn't create an accessor function, but adds the + # field to the hash that was previously passed to + # setup_package(); + # Any of the accesses can be abbreviated down, so that 'a', 'ad', + # 'add' etc. all mean 'addable'. + # The read accessor function will work on both array and scalar + # values. If another accessor in the parameter list is 'a', the read + # access assumes an array. You can also force it to be array access + # by specifying 'readable_array' instead of 'readable' + # + # A sort-of 'protected' access can be set-up by preceding the addable, + # readable or settable with some initial portion of 'protected_' (but, + # the underscore is required), like 'p_a', 'pro_set', etc. The + # "protection" is only by convention. All that happens is that the + # accessor functions' names begin with an underscore. So instead of + # calling set_foo, the call is _set_foo. (Real protection could be + # accomplished by having a new subroutine, end_package, called at the + # end of each package, and then storing the __LINE__ ranges and + # checking them on every accessor. But that is way overkill.) + + # We create anonymous subroutines as the accessors and then use + # typeglobs to assign them to the proper package and name + + my $name = shift; # Name of the field + my $field = shift; # Reference to the inside-out hash containing the + # field + + my $package = (caller)[0]; + + if (! exists $package_fields{$package}) { + croak "$0: Must call 'setup_package' before 'set_access'"; + } + + # Stash the field so DESTROY can get it. + $package_fields{$package}{$name} = $field; + + # Remaining arguments are the accessors. For each... + foreach my $access (@_) { + my $access = lc $access; + + my $protected = ""; + + # Match the input as far as it goes. + if ($access =~ /^(p[^_]*)_/) { + $protected = $1; + if (substr('protected_', 0, length $protected) + eq $protected) + { + + # Add 1 for the underscore not included in $protected + $access = substr($access, length($protected) + 1); + $protected = '_'; + } + else { + $protected = ""; + } + } + + if (substr('addable', 0, length $access) eq $access) { + my $subname = "${package}::${protected}add_$name"; + no strict "refs"; + + # add_ accessor. Don't add if already there, which we + # determine using 'eq' for scalars and '==' otherwise. + *$subname = sub { + use strict "refs"; + return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; + my $self = shift; + my $value = shift; + my $addr = do { no overloading; pack 'J', $self; }; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + if (ref $value) { + return if grep { $value == $_ } @{$field->{$addr}}; + } + else { + return if grep { $value eq $_ } @{$field->{$addr}}; + } + push @{$field->{$addr}}, $value; + return; + } + } + elsif (substr('constructor', 0, length $access) eq $access) { + if ($protected) { + Carp::my_carp_bug("Can't set-up 'protected' constructors") + } + else { + $constructor_fields{$package}{$name} = $field; + } + } + elsif (substr('readable_array', 0, length $access) eq $access) { + + # Here has read access. If one of the other parameters for + # access is array, or this one specifies array (by being more + # than just 'readable_'), then create a subroutine that + # assumes the data is an array. Otherwise just a scalar + my $subname = "${package}::${protected}$name"; + if (grep { /^a/i } @_ + or length($access) > length('readable_')) + { + no strict "refs"; + *$subname = sub { + use strict "refs"; + Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; + my $addr = do { no overloading; pack 'J', $_[0]; }; + if (ref $field->{$addr} ne 'ARRAY') { + my $type = ref $field->{$addr}; + $type = 'scalar' unless $type; + Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems."); + return; + } + return scalar @{$field->{$addr}} unless wantarray; + + # Make a copy; had problems with caller modifying the + # original otherwise + my @return = @{$field->{$addr}}; + return @return; + } + } + else { + + # Here not an array value, a simpler function. + no strict "refs"; + *$subname = sub { + use strict "refs"; + Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1; + no overloading; + return $field->{pack 'J', $_[0]}; + } + } + } + elsif (substr('settable', 0, length $access) eq $access) { + my $subname = "${package}::${protected}set_$name"; + no strict "refs"; + *$subname = sub { + use strict "refs"; + if (main::DEBUG) { + return Carp::carp_too_few_args(\@_, 2) if @_ < 2; + Carp::carp_extra_args(\@_) if @_ > 2; + } + # $self is $_[0]; $value is $_[1] + no overloading; + $field->{pack 'J', $_[0]} = $_[1]; + return; + } + } + else { + Carp::my_carp_bug("Unknown accessor type $access. No accessor set."); + } + } + return; } -} else { - die "$0: LineBrk.txt: $!\n"; } +package Input_file; + +# All input files use this object, which stores various attributes about them, +# and provides for convenient, uniform handling. The run method wraps the +# processing. It handles all the bookkeeping of opening, reading, and closing +# the file, returning only significant input lines. +# +# Each object gets a handler which processes the body of the file, and is +# called by run(). Most should use the generic, default handler, which has +# code scrubbed to handle things you might not expect. A handler should +# basically be a while(next_line()) {...} loop. +# +# You can also set up handlers to +# 1) call before the first line is read for pre processing +# 2) call to adjust each line of the input before the main handler gets them +# 3) call upon EOF before the main handler exits its loop +# 4) call at the end for post processing +# +# $_ is used to store the input line, and is to be filtered by the +# each_line_handler()s. So, if the format of the line is not in the desired +# format for the main handler, these are used to do that adjusting. They can +# be stacked (by enclosing them in an [ anonymous array ] in the constructor, +# so the $_ output of one is used as the input to the next. None of the other +# handlers are stackable, but could easily be changed to be so. +# +# Most of the handlers can call insert_lines() or insert_adjusted_lines() +# which insert the parameters as lines to be processed before the next input +# file line is read. This allows the EOF handler to flush buffers, for +# example. The difference between the two routines is that the lines inserted +# by insert_lines() are subjected to the each_line_handler()s. (So if you +# called it from such a handler, you would get infinite recursion.) Lines +# inserted by insert_adjusted_lines() go directly to the main handler without +# any adjustments. If the post-processing handler calls any of these, there +# will be no effect. Some error checking for these conditions could be added, +# but it hasn't been done. # -# Read in the ArabShap.txt. +# carp_bad_line() should be called to warn of bad input lines, which clears $_ +# to prevent further processing of the line. This routine will output the +# message as a warning once, and then keep a count of the lines that have the +# same message, and output that count at the end of the file's processing. +# This keeps the number of messages down to a manageable amount. # +# get_missings() should be called to retrieve any @missing input lines. +# Messages will be raised if this isn't done if the options aren't to ignore +# missings. + +sub trace { return main::trace(@_); } + +{ # Closure + # Keep track of fields that are to be put into the constructor. + my %constructor_fields; + + main::setup_package(Constructor_Fields => \%constructor_fields); + + my %file; # Input file name, required + main::set_access('file', \%file, qw{ c r }); + + my %first_released; # Unicode version file was first released in, required + main::set_access('first_released', \%first_released, qw{ c r }); + + my %handler; # Subroutine to process the input file, defaults to + # 'process_generic_property_file' + main::set_access('handler', \%handler, qw{ c }); + + my %property; + # name of property this file is for. defaults to none, meaning not + # applicable, or is otherwise determinable, for example, from each line. + main::set_access('property', \%property, qw{ c }); + + my %optional; + # If this is true, the file is optional. If not present, no warning is + # output. If it is present, the string given by this parameter is + # evaluated, and if false the file is not processed. + main::set_access('optional', \%optional, 'c', 'r'); + + my %non_skip; + # This is used for debugging, to skip processing of all but a few input + # files. Add 'non_skip => 1' to the constructor for those files you want + # processed when you set the $debug_skip global. + main::set_access('non_skip', \%non_skip, 'c'); + + my %skip; + # This is used to skip processing of this input file semi-permanently. + # It is used for files that we aren't planning to process anytime soon, + # but want to allow to be in the directory and not raise a message that we + # are not handling. Mostly for test files. This is in contrast to the + # non_skip element, which is supposed to be used very temporarily for + # debugging. Sets 'optional' to 1 + main::set_access('skip', \%skip, 'c'); + + my %each_line_handler; + # list of subroutines to look at and filter each non-comment line in the + # file. defaults to none. The subroutines are called in order, each is + # to adjust $_ for the next one, and the final one adjusts it for + # 'handler' + main::set_access('each_line_handler', \%each_line_handler, 'c'); + + my %has_missings_defaults; + # ? Are there lines in the file giving default values for code points + # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is + # the norm, but IGNORED means it has such lines, but the handler doesn't + # use them. Having these three states allows us to catch changes to the + # UCD that this program should track + main::set_access('has_missings_defaults', + \%has_missings_defaults, qw{ c r }); + + my %pre_handler; + # Subroutine to call before doing anything else in the file. If undef, no + # such handler is called. + main::set_access('pre_handler', \%pre_handler, qw{ c }); + + my %eof_handler; + # Subroutine to call upon getting an EOF on the input file, but before + # that is returned to the main handler. This is to allow buffers to be + # flushed. The handler is expected to call insert_lines() or + # insert_adjusted() with the buffered material + main::set_access('eof_handler', \%eof_handler, qw{ c r }); + + my %post_handler; + # Subroutine to call after all the lines of the file are read in and + # processed. If undef, no such handler is called. + main::set_access('post_handler', \%post_handler, qw{ c }); + + my %progress_message; + # Message to print to display progress in lieu of the standard one + main::set_access('progress_message', \%progress_message, qw{ c }); + + my %handle; + # cache open file handle, internal. Is undef if file hasn't been + # processed at all, empty if has; + main::set_access('handle', \%handle); + + my %added_lines; + # cache of lines added virtually to the file, internal + main::set_access('added_lines', \%added_lines); + + my %errors; + # cache of errors found, internal + main::set_access('errors', \%errors); + + my %missings; + # storage of '@missing' defaults lines + main::set_access('missings', \%missings); + + sub new { + my $class = shift; + + my $self = bless \do{ my $anonymous_scalar }, $class; + my $addr = do { no overloading; pack 'J', $self; }; + + # Set defaults + $handler{$addr} = \&main::process_generic_property_file; + $non_skip{$addr} = 0; + $skip{$addr} = 0; + $has_missings_defaults{$addr} = $NO_DEFAULTS; + $handle{$addr} = undef; + $added_lines{$addr} = [ ]; + $each_line_handler{$addr} = [ ]; + $errors{$addr} = { }; + $missings{$addr} = [ ]; + + # Two positional parameters. + return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; + $file{$addr} = main::internal_file_to_platform(shift); + $first_released{$addr} = shift; + + # The rest of the arguments are key => value pairs + # %constructor_fields has been set up earlier to list all possible + # ones. Either set or push, depending on how the default has been set + # up just above. + my %args = @_; + foreach my $key (keys %args) { + my $argument = $args{$key}; + + # Note that the fields are the lower case of the constructor keys + my $hash = $constructor_fields{lc $key}; + if (! defined $hash) { + Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped"); + next; + } + if (ref $hash->{$addr} eq 'ARRAY') { + if (ref $argument eq 'ARRAY') { + foreach my $argument (@{$argument}) { + next if ! defined $argument; + push @{$hash->{$addr}}, $argument; + } + } + else { + push @{$hash->{$addr}}, $argument if defined $argument; + } + } + else { + $hash->{$addr} = $argument; + } + delete $args{$key}; + }; + + # If the file has a property for it, it means that the property is not + # listed in the file's entries. So add a handler to the list of line + # handlers to insert the property name into the lines, to provide a + # uniform interface to the final processing subroutine. + # the final code doesn't have to worry about that. + if ($property{$addr}) { + push @{$each_line_handler{$addr}}, \&_insert_property_into_line; + } + + if ($non_skip{$addr} && ! $debug_skip && $verbosity) { + print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; + } + + $optional{$addr} = 1 if $skip{$addr}; + + return $self; + } -if (open(my $ArabShap, "ArabShap.txt")) { - my @ArabLink; - my @ArabLinkGroup; - while (<$ArabShap>) { - next unless /^[0-9A-Fa-f]+;/; - s/\s+$//; + use overload + fallback => 0, + qw("") => "_operator_stringify", + "." => \&main::_operator_dot, + ; - my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/); + sub _operator_stringify { + my $self = shift; - append(\@ArabLink, $code, $link); - append(\@ArabLinkGroup, $code, $linkgroup); + return __PACKAGE__ . " object for " . $self->file; } - flush(\@ArabLink, "ArabLink.pl"); - flush(\@ArabLinkGroup, "ArabLnkGrp.pl"); -} else { - die "$0: ArabShap.txt: $!\n"; -} + # flag to make sure extracted files are processed early + my $seen_non_extracted_non_age = 0; + + sub run { + # Process the input object $self. This opens and closes the file and + # calls all the handlers for it. Currently, this can only be called + # once per file, as it destroy's the EOF handler + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + my $file = $file{$addr}; + + # Don't process if not expecting this file (because released later + # than this Unicode version), and isn't there. This means if someone + # copies it into an earlier version's directory, we will go ahead and + # process it. + return if $first_released{$addr} gt $v_version && ! -e $file; + + # If in debugging mode and this file doesn't have the non-skip + # flag set, and isn't one of the critical files, skip it. + if ($debug_skip + && $first_released{$addr} ne v0 + && ! $non_skip{$addr}) + { + print "Skipping $file in debugging\n" if $verbosity; + return; + } + + # File could be optional + if ($optional{$addr}) { + return unless -e $file; + my $result = eval $optional{$addr}; + if (! defined $result) { + Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped."); + return; + } + if (! $result) { + if ($verbosity) { + print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n"; + } + return; + } + } + + if (! defined $file || ! -e $file) { + + # If the file doesn't exist, see if have internal data for it + # (based on first_released being 0). + if ($first_released{$addr} eq v0) { + $handle{$addr} = 'pretend_is_open'; + } + else { + if (! $optional{$addr} # File could be optional + && $v_version ge $first_released{$addr}) + { + print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr}; + } + return; + } + } + else { + + # Here, the file exists. Some platforms may change the case of + # its name + if ($seen_non_extracted_non_age) { + if ($file =~ /$EXTRACTED/i) { + Carp::my_carp_bug(join_lines(<rel2abs($file); + my $expecting = delete $potential_files{$fkey}; + $expecting = delete $potential_files{lc($fkey)} unless defined $expecting; + Carp::my_carp("Was not expecting '$file'.") if + ! $expecting + && ! defined $handle{$addr}; + + # Having deleted from expected files, we can quit if not to do + # anything. Don't print progress unless really want verbosity + if ($skip{$addr}) { + print "Skipping $file.\n" if $verbosity >= $VERBOSE; + return; + } + + # Open the file, converting the slashes used in this program + # into the proper form for the OS + my $file_handle; + if (not open $file_handle, "<", $file) { + Carp::my_carp("Can't open $file. Skipping: $!"); + return 0; + } + $handle{$addr} = $file_handle; # Cache the open file handle + } + + if ($verbosity >= $PROGRESS) { + if ($progress_message{$addr}) { + print "$progress_message{$addr}\n"; + } + else { + # If using a virtual file, say so. + print "Processing ", (-e $file) + ? $file + : "substitute $file", + "\n"; + } + } + + + # Call any special handler for before the file. + &{$pre_handler{$addr}}($self) if $pre_handler{$addr}; + + # Then the main handler + &{$handler{$addr}}($self); + + # Then any special post-file handler. + &{$post_handler{$addr}}($self) if $post_handler{$addr}; + + # If any errors have been accumulated, output the counts (as the first + # error message in each class was output when it was encountered). + if ($errors{$addr}) { + my $total = 0; + my $types = 0; + foreach my $error (keys %{$errors{$addr}}) { + $total += $errors{$addr}->{$error}; + delete $errors{$addr}->{$error}; + $types++; + } + if ($total > 1) { + my $message + = "A total of $total lines had errors in $file. "; + + $message .= ($types == 1) + ? '(Only the first one was displayed.)' + : '(Only the first of each type was displayed.)'; + Carp::my_carp($message); + } + } + + if (@{$missings{$addr}}) { + Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong"); + } + + # If a real file handle, close it. + close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if + ref $handle{$addr}; + $handle{$addr} = ""; # Uses empty to indicate that has already seen + # the file, as opposed to undef + return; + } + sub next_line { + # Sets $_ to be the next logical input line, if any. Returns non-zero + # if such a line exists. 'logical' means that any lines that have + # been added via insert_lines() will be returned in $_ before the file + # is read again. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # Here the file is open (or if the handle is not a ref, is an open + # 'virtual' file). Get the next line; any inserted lines get priority + # over the file itself. + my $adjusted; + + LINE: + while (1) { # Loop until find non-comment, non-empty line + #local $to_trace = 1 if main::DEBUG; + my $inserted_ref = shift @{$added_lines{$addr}}; + if (defined $inserted_ref) { + ($adjusted, $_) = @{$inserted_ref}; + trace $adjusted, $_ if main::DEBUG && $to_trace; + return 1 if $adjusted; + } + else { + last if ! ref $handle{$addr}; # Don't read unless is real file + last if ! defined ($_ = readline $handle{$addr}); + } + chomp; + trace $_ if main::DEBUG && $to_trace; + + # See if this line is the comment line that defines what property + # value that code points that are not listed in the file should + # have. The format or existence of these lines is not guaranteed + # by Unicode since they are comments, but the documentation says + # that this was added for machine-readability, so probably won't + # change. This works starting in Unicode Version 5.0. They look + # like: + # + # @missing: 0000..10FFFF; Not_Reordered + # @missing: 0000..10FFFF; Decomposition_Mapping; + # @missing: 0000..10FFFF; ; NaN + # + # Save the line for a later get_missings() call. + if (/$missing_defaults_prefix/) { + if ($has_missings_defaults{$addr} == $NO_DEFAULTS) { + $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries"); + } + elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) { + my @defaults = split /\s* ; \s*/x, $_; + + # The first field is the @missing, which ends in a + # semi-colon, so can safely shift. + shift @defaults; + + # Some of these lines may have empty field placeholders + # which get in the way. An example is: + # @missing: 0000..10FFFF; ; NaN + # Remove them. Process starting from the top so the + # splice doesn't affect things still to be looked at. + for (my $i = @defaults - 1; $i >= 0; $i--) { + next if $defaults[$i] ne ""; + splice @defaults, $i, 1; + } + + # What's left should be just the property (maybe) and the + # default. Having only one element means it doesn't have + # the property. + my $default; + my $property; + if (@defaults >= 1) { + if (@defaults == 1) { + $default = $defaults[0]; + } + else { + $property = $defaults[0]; + $default = $defaults[1]; + } + } + + if (@defaults < 1 + || @defaults > 2 + || ($default =~ /^$/i + && $default !~ /^$/i)) + { + $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries"); + } + else { + + # If the property is missing from the line, it should + # be the one for the whole file + $property = $property{$addr} if ! defined $property; + + # Change to the null string, which is what it + # really means. If the default is the code point + # itself, set it to , which is what + # Unicode uses (but sometimes they've forgotten the + # space) + if ($default =~ /^$/i) { + $default = ""; + } + elsif ($default =~ /^$/i) { + $default = $CODE_POINT; + } + + # Store them as a sub-arrays with both components. + push @{$missings{$addr}}, [ $default, $property ]; + } + } + + # There is nothing for the caller to process on this comment + # line. + next; + } + + # Remove comments and trailing space, and skip this line if the + # result is empty + s/#.*//; + s/\s+$//; + next if /^$/; + + # Call any handlers for this line, and skip further processing of + # the line if the handler sets the line to null. + foreach my $sub_ref (@{$each_line_handler{$addr}}) { + &{$sub_ref}($self); + next LINE if /^$/; + } + + # Here the line is ok. return success. + return 1; + } # End of looping through lines. + + # If there is an EOF handler, call it (only once) and if it generates + # more lines to process go back in the loop to handle them. + if ($eof_handler{$addr}) { + &{$eof_handler{$addr}}($self); + $eof_handler{$addr} = ""; # Currently only get one shot at it. + goto LINE if $added_lines{$addr}; + } + + # Return failure -- no more lines. + return 0; + + } + +# Not currently used, not fully tested. +# sub peek { +# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank +# # record. Not callable from an each_line_handler(), nor does it call +# # an each_line_handler() on the line. +# +# my $self = shift; +# my $addr = do { no overloading; pack 'J', $self; }; # -# Read in the Jamo.txt. +# foreach my $inserted_ref (@{$added_lines{$addr}}) { +# my ($adjusted, $line) = @{$inserted_ref}; +# next if $adjusted; # +# # Remove comments and trailing space, and return a non-empty +# # resulting line +# $line =~ s/#.*//; +# $line =~ s/\s+$//; +# return $line if $line ne ""; +# } +# +# return if ! ref $handle{$addr}; # Don't read unless is real file +# while (1) { # Loop until find non-comment, non-empty line +# local $to_trace = 1 if main::DEBUG; +# trace $_ if main::DEBUG && $to_trace; +# return if ! defined (my $line = readline $handle{$addr}); +# chomp $line; +# push @{$added_lines{$addr}}, [ 0, $line ]; +# +# $line =~ s/#.*//; +# $line =~ s/\s+$//; +# return $line if $line ne ""; +# } +# +# return; +# } -if (open(my $Jamo, "Jamo.txt")) { - my @Short; - while (<$Jamo>) { - next unless /^([0-9A-Fa-f]+)\s*;\s*(\w*)/; + sub insert_lines { + # Lines can be inserted so that it looks like they were in the input + # file at the place it was when this routine is called. See also + # insert_adjusted_lines(). Lines inserted via this routine go through + # any each_line_handler() - my ($code, $short) = ($1, $2); + my $self = shift; - append(\@Short, $code, $short); + # Each inserted line is an array, with the first element being 0 to + # indicate that this line hasn't been adjusted, and needs to be + # processed. + no overloading; + push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_; + return; } - flush(\@Short, "JamoShort.pl"); -} else { - die "$0: Jamo.txt: $!\n"; -} + sub insert_adjusted_lines { + # Lines can be inserted so that it looks like they were in the input + # file at the place it was when this routine is called. See also + # insert_lines(). Lines inserted via this routine are already fully + # adjusted, ready to be processed; each_line_handler()s handlers will + # not be called. This means this is not a completely general + # facility, as only the last each_line_handler on the stack should + # call this. It could be made more general, by passing to each of the + # line_handlers their position on the stack, which they would pass on + # to this routine, and that would replace the boolean first element in + # the anonymous array pushed here, so that the next_line routine could + # use that to call only those handlers whose index is after it on the + # stack. But this is overkill for what is needed now. + + my $self = shift; + trace $_[0] if main::DEBUG && $to_trace; + + # Each inserted line is an array, with the first element being 1 to + # indicate that this line has been adjusted + no overloading; + push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_; + return; + } -# -# Read in the Scripts.txt. -# + sub get_missings { + # Returns the stored up @missings lines' values, and clears the list. + # The values are in an array, consisting of the default in the first + # element, and the property in the 2nd. However, since these lines + # can be stacked up, the return is an array of all these arrays. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; -my @Scripts; + my $addr = do { no overloading; pack 'J', $self; }; -if (open(my $Scripts, "Scripts.txt")) { - while (<$Scripts>) { - next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; + # If not accepting a list return, just return the first one. + return shift @{$missings{$addr}} unless wantarray; - # Wait until all the scripts have been read since - # they are not listed in numeric order. - push @Scripts, [ hex($1), $1, $2, $3 ]; + my @return = @{$missings{$addr}}; + undef @{$missings{$addr}}; + return @return; } -} else { - die "$0: Scripts.txt: $!\n"; -} -# Now append the scripts properties in their code point order. + sub _insert_property_into_line { + # Add a property field to $_, if this file requires it. -my %Script; -my $Scripts = []; + my $self = shift; + my $addr = do { no overloading; pack 'J', $self; }; + my $property = $property{$addr}; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; -for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) { - my ($code, $first, $last, $name) = @$script; - append($Scripts, $first, $name); - append($Script{$name} ||= [], $first, $name); - if (defined $last) { - extend($Scripts, $last); - extend($Script{$name}, $last); - } - unless (defined $In{$name}) { - $InScript{$InId} = $name; - $In{$name} = $InId++; - $InIn{$name} = $Script{$name}; + $_ =~ s/(;|$)/; $property$1/; + return; } -} - -# Scripts.pl can be written out already now. -flush(\@Scripts, "Scripts.pl"); + sub carp_bad_line { + # Output consistent error messages, using either a generic one, or the + # one given by the optional parameter. To avoid gazillions of the + # same message in case the syntax of a file is way off, this routine + # only outputs the first instance of each message, incrementing a + # count so the totals can be output at the end of the file. -# Common is everything not explicitly assigned to a Script + my $self = shift; + my $message = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; -$In{Common} = $InId++; -my $Common = inverse($Scripts); -$InIn{Common} = $Common; + my $addr = do { no overloading; pack 'J', $self; }; -# -# Read in the Blocks.txt. -# + $message = 'Unexpected line' unless $message; -my @Blocks; -my %Blocks; + # No trailing punctuation so as to fit with our addenda. + $message =~ s/[.:;,]$//; -if (open(my $Blocks, "Blocks.txt")) { - while (<$Blocks>) { - next unless /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; - - my ($first, $last, $name) = ($1, $2, $3); - my $origname = $name; + # If haven't seen this exact message before, output it now. Otherwise + # increment the count of how many times it has occurred + unless ($errors{$addr}->{$message}) { + Carp::my_carp("$message in '$_' in " + . $file{$addr} + . " at line $.. Skipping this line;"); + $errors{$addr}->{$message} = 1; + } + else { + $errors{$addr}->{$message}++; + } - # If there's a naming conflict (the script names are - # in uppercase), the name of the block has " Block" - # appended to it. - my $pat = $name; - $pat =~ s/([- _])/(?:[-_]|\\s+)?/g; - for my $i (values %InScript) { - if ($i =~ /^$pat$/i) { - $name .= " Block"; - last; - } - } + # Clear the line to prevent any further (meaningful) processing of it. + $_ = ""; - append(\@Blocks, $first, $name); - append($Blocks{$name} ||= [], $first, $name); - if (defined $last) { - extend(\@Blocks, $last); - extend($Blocks{$name}, $last); - } - unless (defined $In{$name}) { - $InBlock{$InId} = $origname; - $In{$name} = $InId++; - $InIn{$name} = $Blocks{$name}; - } + return; } -} else { - die "$0: Blocks.txt: $!\n"; -} +} # End closure -# Blocks.pl can be written out already now. +package Multi_Default; -flush(\@Blocks, "Blocks.pl"); +# Certain properties in early versions of Unicode had more than one possible +# default for code points missing from the files. In these cases, one +# default applies to everything left over after all the others are applied, +# and for each of the others, there is a description of which class of code +# points applies to it. This object helps implement this by storing the +# defaults, and for all but that final default, an eval string that generates +# the class that it applies to. -# -# Read in the PropList.txt. It contains extended properties not -# listed in the Unicode.txt, such as 'Other_Alphabetic': -# alphabetic but not of the general category L; many modifiers -# belong to this extended property category: while they are not -# alphabets, they are alphabetic in nature. -# -my @Props; +{ # Closure -if (open(my $Props, "PropList.txt")) { - while (<$Props>) { - next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; + main::setup_package(); - # Wait until all the extended properties have been read since - # they are not listed in numeric order. - push @Props, [ hex($1), $1, $2, $3 ]; - } -} else { - die "$0: PropList.txt: $!\n"; -} + my %class_defaults; + # The defaults structure for the classes + main::set_access('class_defaults', \%class_defaults); + + my %other_default; + # The default that applies to everything left over. + main::set_access('other_default', \%other_default, 'r'); + + + sub new { + # The constructor is called with default => eval pairs, terminated by + # the left-over default. e.g. + # Multi_Default->new( + # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C + # - 0x200D', + # 'R' => 'some other expression that evaluates to code points', + # . + # . + # . + # 'U')); + + my $class = shift; + + my $self = bless \do{my $anonymous_scalar}, $class; + my $addr = do { no overloading; pack 'J', $self; }; -# Now append the extended properties in their code point order. + while (@_ > 1) { + my $default = shift; + my $eval = shift; + $class_defaults{$addr}->{$default} = $eval; + } -my %Prop; -my $Props = []; + $other_default{$addr} = shift; -for my $prop (sort { $a->[0] <=> $b->[0] } @Props) { - my ($code, $first, $last, $name) = @$prop; - append($Props, $first, $name); - append($Prop{$name} ||= [], $first, $name); - if (defined $last) { - extend($Props, $last); - extend($Prop{$name}, $last); + return $self; } - unless (defined $In{$name}) { - $In{$name} = $InId++; - $InIn{$name} = $Prop{$name}; + + sub get_next_defaults { + # Iterates and returns the next class of defaults. + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return each %{$class_defaults{$addr}}; } } -# Assigned is everything not Cn aka Noncharacter_Code_Point +package Alias; -$In{Assigned} = $InId++; -my $Assigned = inverse($Prop{Noncharacter_Code_Point}); -$InIn{Assigned} = $Assigned; +# An alias is one of the names that a table goes by. This class defines them +# including some attributes. Everything is currently setup in the +# constructor. -sub merge_general_and_extended { - my ($name, $general, $extended) = @_; - my $merged; - push @$merged, - map { pop @{$_}; $_ } - sort { $a->[2] <=> $b->[2] } - map { [ $_->[0], $_->[1], hex($_->[0]) ] } - ($general ? - map { ref $_ ? @$_ : $_ } - @Cat {ref $general ? @$general : $general } : - (), - $extended ? - map { ref $_ ? @$_ : $_ } - @Prop{ref $extended ? @$extended : $extended} : - ()); +{ # Closure - $In{$name} = $InId++; - $InIn{$name} = $merged; - - return $merged; -} + main::setup_package(); -# Alphabetic is L and Other_Alphabetic. + my %name; + main::set_access('name', \%name, 'r'); -my $Alphabetic = - merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic'); + my %loose_match; + # Determined by the constructor code if this name should match loosely or + # not. The constructor parameters can override this, but it isn't fully + # implemented, as should have ability to override Unicode one's via + # something like a set_loose_match() + main::set_access('loose_match', \%loose_match, 'r'); -# Lowercase is Ll and Other_Lowercase. + my %make_pod_entry; + # Some aliases should not get their own entries because they are covered + # by a wild-card, and some we want to discourage use of. Binary + main::set_access('make_pod_entry', \%make_pod_entry, 'r'); -my $Lowercase = - merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase'); + my %status; + # Aliases have a status, like deprecated, or even suppressed (which means + # they don't appear in documentation). Enum + main::set_access('status', \%status, 'r'); -# Uppercase is Lu and Other_Uppercase. + my %externally_ok; + # Similarly, some aliases should not be considered as usable ones for + # external use, such as file names, or we don't want documentation to + # recommend them. Boolean + main::set_access('externally_ok', \%externally_ok, 'r'); -my $Uppercase = - merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase'); + sub new { + my $class = shift; -# Math is Sm and Other_Math. + my $self = bless \do { my $anonymous_scalar }, $class; + my $addr = do { no overloading; pack 'J', $self; }; -my $Math = - merge_general_and_extended('Math', 'Sm', 'Other_Math'); + $name{$addr} = shift; + $loose_match{$addr} = shift; + $make_pod_entry{$addr} = shift; + $externally_ok{$addr} = shift; + $status{$addr} = shift; -# Lampersand is Ll, Lu, and Lt. + Carp::carp_extra_args(\@_) if main::DEBUG && @_; -my $Lampersand = - merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]); + # Null names are never ok externally + $externally_ok{$addr} = 0 if $name{$addr} eq ""; -# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. + return $self; + } +} -my $ID_Start = - merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]); +package Range; -# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. +# A range is the basic unit for storing code points, and is described in the +# comments at the beginning of the program. Each range has a starting code +# point; an ending code point (not less than the starting one); a value +# that applies to every code point in between the two end-points, inclusive; +# and an enum type that applies to the value. The type is for the user's +# convenience, and has no meaning here, except that a non-zero type is +# considered to not obey the normal Unicode rules for having standard forms. +# +# The same structure is used for both map and match tables, even though in the +# latter, the value (and hence type) is irrelevant and could be used as a +# comment. In map tables, the value is what all the code points in the range +# map to. Type 0 values have the standardized version of the value stored as +# well, so as to not have to recalculate it a lot. -my $ID_Continue = - merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl - Mn Mc Nd Pc) ]); +sub trace { return main::trace(@_); } -# -# Any is any. -# +{ # Closure -$In{Any} = $InId++; -my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ]; -$InIn{Any} = $Any; + main::setup_package(); -# -# mapping() will be used to write out the In and Is virtual mappings. -# + my %start; + main::set_access('start', \%start, 'r', 's'); -sub mapping { - my ($map, $name) = @_; + my %end; + main::set_access('end', \%end, 'r', 's'); - if (open(my $fh, ">$name.pl")) { - print "$name.pl\n"; - header($fh); + my %value; + main::set_access('value', \%value, 'r'); - # The %pat will hold a hash that maps the first two - # lowercased letters of a class to a 'fuzzified' regular - # expression that points to the real mapping. + my %type; + main::set_access('type', \%type, 'r'); - my %pat; + my %standard_form; + # The value in internal standard form. Defined only if the type is 0. + main::set_access('standard_form', \%standard_form); - # But first write out the offical name to real name - # (the filename) mapping. + # Note that if these fields change, the dump() method should as well - print $fh < '$map->{$i}',\n", "'$i'"; - } - print $fh < {\n"; - foreach my $ipat (@{$pat{$prefix}}) { - my ($i, $pat) = @$ipat; - print $fh "\t'$pat' => '$map->{$i}',\n"; - } - print $fh "},\n"; - } - print $fh <>In.pl")) { - print $In < $b } keys %InScript) { - printf $In "%4d => '$InScript{$i}',\n", $i; + Carp::carp_extra_args(\%args) if main::DEBUG && %args; + + if (! $type{$addr}) { + $standard_form{$addr} = main::standardize($value); + } + + return $self; } - print $In < 0, + qw("") => "_operator_stringify", + "." => \&main::_operator_dot, + ; + + sub _operator_stringify { + my $self = shift; + my $addr = do { no overloading; pack 'J', $self; }; + + # Output it like '0041..0065 (value)' + my $return = sprintf("%04X", $start{$addr}) + . '..' + . sprintf("%04X", $end{$addr}); + my $value = $value{$addr}; + my $type = $type{$addr}; + $return .= ' ('; + $return .= "$value"; + $return .= ", Type=$type" if $type != 0; + $return .= ')'; + + return $return; + } + + sub standard_form { + # The standard form is the value itself if the standard form is + # undefined (that is if the value is special) + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; -%utf8::InBlock = -( -EOT - for my $i (sort { $a <=> $b } keys %InBlock) { - printf $In "%4d => '$InBlock{$i}',\n", $i; + my $addr = do { no overloading; pack 'J', $self; }; + + return $standard_form{$addr} if defined $standard_form{$addr}; + return $value{$addr}; } - print $In < $In{$b} } keys %In) { - flush($InIn{$in}, "In/$In{$in}.pl"); -} +# There are a number of methods to manipulate range lists, and some operators +# are overloaded to handle them. -# -# The mapping from General Category long forms to short forms is -# currently hardwired here since no simple data file in the UCD -# seems to do that. Unicode 3.2 will assumedly correct this. -# +sub trace { return main::trace(@_); } -my %Is = ( - 'Letter' => 'L', - 'Uppercase_Letter' => 'Lu', - 'Lowercase_Letter' => 'Ll', - 'Titlecase_Letter' => 'Lt', - 'Modifier_Letter' => 'Lm', - 'Other_Letter' => 'Lo', +{ # Closure - 'Mark' => 'M', - 'Non_Spacing_Mark' => 'Mn', - 'Spacing_Mark' => 'Mc', - 'Enclosing_Mark' => 'Me', + our $addr; - 'Separator' => 'Z', - 'Space_Separator' => 'Zs', - 'Line_Separator' => 'Zl', - 'Paragraph_Separator' => 'Zp', + main::setup_package(); - 'Number' => 'N', - 'Decimal_Number' => 'Nd', - 'Letter_Number' => 'Nl', - 'Other_Number' => 'No', + my %ranges; + # The list of ranges + main::set_access('ranges', \%ranges, 'readable_array'); - 'Punctuation' => 'P', - 'Connector_Punctuation' => 'Pc', - 'Dash_Punctuation' => 'Pd', - 'Open_Punctuation' => 'Ps', - 'Close_Punctuation' => 'Pe', - 'Initial_Punctuation' => 'Pi', - 'Final_Punctuation' => 'Pf', - 'Other_Punctuation' => 'Po', + my %max; + # The highest code point in the list. This was originally a method, but + # actual measurements said it was used a lot. + main::set_access('max', \%max, 'r'); - 'Symbol' => 'S', - 'Math_Symbol' => 'Sm', - 'Currency_Symbol' => 'Sc', - 'Modifier_Symbol' => 'Sk', - 'Other_Symbol' => 'So', + my %each_range_iterator; + # Iterator position for each_range() + main::set_access('each_range_iterator', \%each_range_iterator); - 'Other' => 'C', - 'Control' => 'Cc', - 'Format' => 'Cf', - 'Surrogate' => 'Cs', - 'Private Use' => 'Co', - 'Unassigned' => 'Cn', -); + my %owner_name_of; + # Name of parent this is attached to, if any. Solely for better error + # messages. + main::set_access('owner_name_of', \%owner_name_of, 'p_r'); -# -# Write out the virtual Is mappings. -# + my %_search_ranges_cache; + # A cache of the previous result from _search_ranges(), for better + # performance + main::set_access('_search_ranges_cache', \%_search_ranges_cache); -mapping(\%Is, "Is"); + sub new { + my $class = shift; + my %args = @_; -# -# Read in the special cases. -# + # Optional initialization data for the range list. + my $initialize = delete $args{'Initialize'}; -my %Case; + my $self; -if (open(my $SpecCase, "SpecCase.txt")) { - while (<$SpecCase>) { - next unless /^[0-9A-Fa-f]+;/; - s/\#.*//; - s/\s+$//; + # Use _union() to initialize. _union() returns an object of this + # class, which means that it will call this constructor recursively. + # But it won't have this $initialize parameter so that it won't + # infinitely loop on this. + return _union($class, $initialize, %args) if defined $initialize; - my ($code, $lower, $title, $upper, $condition) = split(/\s*;\s*/); + $self = bless \do { my $anonymous_scalar }, $class; + my $addr = do { no overloading; pack 'J', $self; }; - if ($condition) { # not implemented yet - print "# SKIPPING $_\n"; - next; - } + # Optional parent object, only for debug info. + $owner_name_of{$addr} = delete $args{'Owner'}; + $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr}; - # Wait until all the special cases have been read since - # they are not listed in numeric order. - my $ix = hex($code); - push @{$Case{Lower}}, [ $ix, $code, $lower ]; - push @{$Case{Title}}, [ $ix, $code, $title ]; - push @{$Case{Upper}}, [ $ix, $code, $upper ]; + # Stringify, in case it is an object. + $owner_name_of{$addr} = "$owner_name_of{$addr}"; + + # This is used only for error messages, and so a colon is added + $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne ""; + + Carp::carp_extra_args(\%args) if main::DEBUG && %args; + + # Max is initialized to a negative value that isn't adjacent to 0, + # for simpler tests + $max{$addr} = -2; + + $_search_ranges_cache{$addr} = 0; + $ranges{$addr} = []; + + return $self; } -} else { - die "$0: SpecCase.txt: $!\n"; -} -# Now write out the special cases properties in their code point order. -# Prepend them to the To/{Upper,Lower,Title}.pl. + use overload + fallback => 0, + qw("") => "_operator_stringify", + "." => \&main::_operator_dot, + ; -for my $case (qw(Lower Title Upper)) { - my $NormalCase = do "To/$case.pl" || die "$0: To/$case.pl: $!\n"; - if (open(my $Case, ">To/$case.pl")) { - header($Case); - print $Case <[0] <=> $b->[0] } @{$Case{$case}}) { - my ($ix, $code, $to) = @$prop; - my $tostr = - join "", map { sprintf "\\x{%s}", $_ } split ' ', $to; - printf $Case qq['%04X' => "$tostr",\n], $ix; - } - print $Case <new($element, $element); + } + } + elsif ($arg->isa('Range')) { + push @records, $arg; + } + elsif ($arg->can('ranges')) { + push @records, $arg->ranges; + } + else { + my $message = ""; + if (defined $self) { + no overloading; + $message .= $owner_name_of{pack 'J', $self}; + } + Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done."); + return; + } + } + + # Sort with the range containing the lowest ordinal first, but if + # two ranges start at the same code point, sort with the bigger range + # of the two first, because it takes fewer cycles. + @records = sort { ($a->start <=> $b->start) + or + # if b is shorter than a, b->end will be + # less than a->end, and we want to select + # a, so want to return -1 + ($b->end <=> $a->end) + } @records; + + my $new = $class->new(@_); + + # Fold in records so long as they add new information. + for my $set (@records) { + my $start = $set->start; + my $end = $set->end; + my $value = $set->value; + if ($start > $new->max) { + $new->_add_delete('+', $start, $end, $value); + } + elsif ($end > $new->max) { + $new->_add_delete('+', $new->max +1, $end, $value); + } + } + + return $new; } -} -# -# Read in the case foldings. -# -# We will do full case folding, C + F + I (see CaseFold.txt). -# + sub range_count { # Return the number of ranges in the range list + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + no overloading; + return scalar @{$ranges{pack 'J', $self}}; + } + + sub min { + # Returns the minimum code point currently in the range list, or if + # the range list is empty, 2 beyond the max possible. This is a + # method because used so rarely, that not worth saving between calls, + # and having to worry about changing it as ranges are added and + # deleted. -if (open(my $CaseFold, "CaseFold.txt")) { - my @Fold; - my %Fold; + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; - while (<$CaseFold>) { - next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/; + my $addr = do { no overloading; pack 'J', $self; }; - my ($code, $status, $fold) = ($1, $2, $3); + # If the range list is empty, return a large value that isn't adjacent + # to any that could be in the range list, for simpler tests + return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; + return $ranges{$addr}->[0]->start; + } - if ($status eq 'C') { # Common: one-to-one folding - append(\@Fold, $code, $fold); - } else { # F: full, or I: dotted uppercase I -> dotless lowercase I - $Fold{hex($code)} = $fold; - } + sub contains { + # Boolean: Is argument in the range list? If so returns $i such that: + # range[$i]->end < $codepoint <= range[$i+1]->end + # which is one beyond what you want; this is so that the 0th range + # doesn't return false + my $self = shift; + my $codepoint = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $i = $self->_search_ranges($codepoint); + return 0 unless defined $i; + + # The search returns $i, such that + # range[$i-1]->end < $codepoint <= range[$i]->end + # So is in the table if and only iff it is at least the start position + # of range $i. + no overloading; + return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint; + return $i + 1; } - flush(\@Fold, "To/Fold.pl"); + sub value_of { + # Returns the value associated with the code point, undef if none - # - # Prepend the special foldings to the common foldings. - # + my $self = shift; + my $codepoint = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my $CommonFold = do "To/Fold.pl" || die "$0: To/Fold.pl: $!\n"; - if (open(my $Fold, ">To/Fold.pl")) { - header($Fold); - print $Fold < $b } keys %Fold) { - my $foldstr = - join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code}; - printf $Fold qq['%04X' => "$foldstr",\n], $code; - } - print $Fold <contains($codepoint); + return unless $i; -EOT - begin($Fold); - print $Fold $CommonFold; - end($Fold); - } else { - die "$0: To/Fold.pl: $!\n"; + # contains() returns 1 beyond where we should look + no overloading; + return $ranges{pack 'J', $self}->[$i-1]->value; } -} else { - die "$0: CaseFold.txt: $!\n"; -} -# That's all, folks! + sub _search_ranges { + # Find the range in the list which contains a code point, or where it + # should go if were to add it. That is, it returns $i, such that: + # range[$i-1]->end < $codepoint <= range[$i]->end + # Returns undef if no such $i is possible (e.g. at end of table), or + # if there is an error. + + my $self = shift; + my $code_point = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return if $code_point > $max{$addr}; + my $r = $ranges{$addr}; # The current list of ranges + my $range_list_size = scalar @$r; + my $i; + + use integer; # want integer division + + # Use the cached result as the starting guess for this one, because, + # an experiment on 5.1 showed that 90% of the time the cache was the + # same as the result on the next call (and 7% it was one less). + $i = $_search_ranges_cache{$addr}; + $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob. + # from an intervening deletion + #local $to_trace = 1 if main::DEBUG; + trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point); + return $i if $code_point <= $r->[$i]->end + && ($i == 0 || $r->[$i-1]->end < $code_point); + + # Here the cache doesn't yield the correct $i. Try adding 1. + if ($i < $range_list_size - 1 + && $r->[$i]->end < $code_point && + $code_point <= $r->[$i+1]->end) + { + $i++; + trace "next \$i is correct: $i" if main::DEBUG && $to_trace; + $_search_ranges_cache{$addr} = $i; + return $i; + } + + # Here, adding 1 also didn't work. We do a binary search to + # find the correct position, starting with current $i + my $lower = 0; + my $upper = $range_list_size - 1; + while (1) { + trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace; + + if ($code_point <= $r->[$i]->end) { + + # Here we have met the upper constraint. We can quit if we + # also meet the lower one. + last if $i == 0 || $r->[$i-1]->end < $code_point; + + $upper = $i; # Still too high. + + } + else { + + # Here, $r[$i]->end < $code_point, so look higher up. + $lower = $i; + } + + # Split search domain in half to try again. + my $temp = ($upper + $lower) / 2; + + # No point in continuing unless $i changes for next time + # in the loop. + if ($temp == $i) { + + # We can't reach the highest element because of the averaging. + # So if one below the upper edge, force it there and try one + # more time. + if ($i == $range_list_size - 2) { + + trace "Forcing to upper edge" if main::DEBUG && $to_trace; + $i = $range_list_size - 1; + + # Change $lower as well so if fails next time through, + # taking the average will yield the same $i, and we will + # quit with the error message just below. + $lower = $i; + next; + } + Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken."); + return; + } + $i = $temp; + } # End of while loop + + if (main::DEBUG && $to_trace) { + trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i; + trace "i= [ $i ]", $r->[$i]; + trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1; + } + + # Here we have found the offset. Cache it as a starting point for the + # next call. + $_search_ranges_cache{$addr} = $i; + return $i; + } + + sub _add_delete { + # Add, replace or delete ranges to or from a list. The $type + # parameter gives which: + # '+' => insert or replace a range, returning a list of any changed + # ranges. + # '-' => delete a range, returning a list of any deleted ranges. + # + # The next three parameters give respectively the start, end, and + # value associated with the range. 'value' should be null unless the + # operation is '+'; + # + # The range list is kept sorted so that the range with the lowest + # starting position is first in the list, and generally, adjacent + # ranges with the same values are merged into a single larger one (see + # exceptions below). + # + # There are more parameters; all are key => value pairs: + # Type gives the type of the value. It is only valid for '+'. + # All ranges have types; if this parameter is omitted, 0 is + # assumed. Ranges with type 0 are assumed to obey the + # Unicode rules for casing, etc; ranges with other types are + # not. Otherwise, the type is arbitrary, for the caller's + # convenience, and looked at only by this routine to keep + # adjacent ranges of different types from being merged into + # a single larger range, and when Replace => + # $IF_NOT_EQUIVALENT is specified (see just below). + # Replace determines what to do if the range list already contains + # ranges which coincide with all or portions of the input + # range. It is only valid for '+': + # => $NO means that the new value is not to replace + # any existing ones, but any empty gaps of the + # range list coinciding with the input range + # will be filled in with the new value. + # => $UNCONDITIONALLY means to replace the existing values with + # this one unconditionally. However, if the + # new and old values are identical, the + # replacement is skipped to save cycles + # => $IF_NOT_EQUIVALENT means to replace the existing values + # with this one if they are not equivalent. + # Ranges are equivalent if their types are the + # same, and they are the same string; or if + # both are type 0 ranges, if their Unicode + # standard forms are identical. In this last + # case, the routine chooses the more "modern" + # one to use. This is because some of the + # older files are formatted with values that + # are, for example, ALL CAPs, whereas the + # derived files have a more modern style, + # which looks better. By looking for this + # style when the pre-existing and replacement + # standard forms are the same, we can move to + # the modern style + # => $MULTIPLE means that if this range duplicates an + # existing one, but has a different value, + # don't replace the existing one, but insert + # this, one so that the same range can occur + # multiple times. + # => anything else is the same as => $IF_NOT_EQUIVALENT + # + # "same value" means identical for non-type-0 ranges, and it means + # having the same standard forms for type-0 ranges. + + return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5; + + my $self = shift; + my $operation = shift; # '+' for add/replace; '-' for delete; + my $start = shift; + my $end = shift; + my $value = shift; + + my %args = @_; + + $value = "" if not defined $value; # warning: $value can be "0" + + my $replace = delete $args{'Replace'}; + $replace = $IF_NOT_EQUIVALENT unless defined $replace; + + my $type = delete $args{'Type'}; + $type = 0 unless defined $type; + + Carp::carp_extra_args(\%args) if main::DEBUG && %args; + + my $addr = do { no overloading; pack 'J', $self; }; + + if ($operation ne '+' && $operation ne '-') { + Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken."); + return; + } + unless (defined $start && defined $end) { + Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken."); + return; + } + unless ($end >= $start) { + Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); + return; + } + #local $to_trace = 1 if main::DEBUG; + + if ($operation eq '-') { + if ($replace != $IF_NOT_EQUIVALENT) { + Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT."); + $replace = $IF_NOT_EQUIVALENT; + } + if ($type) { + Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0."); + $type = 0; + } + if ($value ne "") { + Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\"."); + $value = ""; + } + } + + my $r = $ranges{$addr}; # The current list of ranges + my $range_list_size = scalar @$r; # And its size + my $max = $max{$addr}; # The current high code point in + # the list of ranges + + # Do a special case requiring fewer machine cycles when the new range + # starts after the current highest point. The Unicode input data is + # structured so this is common. + if ($start > $max) { + + trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace; + return if $operation eq '-'; # Deleting a non-existing range is a + # no-op + + # If the new range doesn't logically extend the current final one + # in the range list, create a new range at the end of the range + # list. (max cleverly is initialized to a negative number not + # adjacent to 0 if the range list is empty, so even adding a range + # to an empty range list starting at 0 will have this 'if' + # succeed.) + if ($start > $max + 1 # non-adjacent means can't extend. + || @{$r}[-1]->value ne $value # values differ, can't extend. + || @{$r}[-1]->type != $type # types differ, can't extend. + ) { + push @$r, Range->new($start, $end, + Value => $value, + Type => $type); + } + else { + + # Here, the new range starts just after the current highest in + # the range list, and they have the same type and value. + # Extend the current range to incorporate the new one. + @{$r}[-1]->set_end($end); + } + + # This becomes the new maximum. + $max{$addr} = $end; + + return; + } + #local $to_trace = 0 if main::DEBUG; + + trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace; + + # Here, the input range isn't after the whole rest of the range list. + # Most likely 'splice' will be needed. The rest of the routine finds + # the needed splice parameters, and if necessary, does the splice. + # First, find the offset parameter needed by the splice function for + # the input range. Note that the input range may span multiple + # existing ones, but we'll worry about that later. For now, just find + # the beginning. If the input range is to be inserted starting in a + # position not currently in the range list, it must (obviously) come + # just after the range below it, and just before the range above it. + # Slightly less obviously, it will occupy the position currently + # occupied by the range that is to come after it. More formally, we + # are looking for the position, $i, in the array of ranges, such that: + # + # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end + # + # (The ordered relationships within existing ranges are also shown in + # the equation above). However, if the start of the input range is + # within an existing range, the splice offset should point to that + # existing range's position in the list; that is $i satisfies a + # somewhat different equation, namely: + # + #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end + # + # More briefly, $start can come before or after r[$i]->start, and at + # this point, we don't know which it will be. However, these + # two equations share these constraints: + # + # r[$i-1]->end < $start <= r[$i]->end + # + # And that is good enough to find $i. + + my $i = $self->_search_ranges($start); + if (! defined $i) { + Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed"); + return; + } + + # The search function returns $i such that: + # + # r[$i-1]->end < $start <= r[$i]->end + # + # That means that $i points to the first range in the range list + # that could possibly be affected by this operation. We still don't + # know if the start of the input range is within r[$i], or if it + # points to empty space between r[$i-1] and r[$i]. + trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace; + + # Special case the insertion of data that is not to replace any + # existing data. + if ($replace == $NO) { # If $NO, has to be operation '+' + #local $to_trace = 1 if main::DEBUG; + trace "Doesn't replace" if main::DEBUG && $to_trace; + + # Here, the new range is to take effect only on those code points + # that aren't already in an existing range. This can be done by + # looking through the existing range list and finding the gaps in + # the ranges that this new range affects, and then calling this + # function recursively on each of those gaps, leaving untouched + # anything already in the list. Gather up a list of the changed + # gaps first so that changes to the internal state as new ranges + # are added won't be a problem. + my @gap_list; + + # First, if the starting point of the input range is outside an + # existing one, there is a gap from there to the beginning of the + # existing range -- add a span to fill the part that this new + # range occupies + if ($start < $r->[$i]->start) { + push @gap_list, Range->new($start, + main::min($end, + $r->[$i]->start - 1), + Type => $type); + trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace; + } + + # Then look through the range list for other gaps until we reach + # the highest range affected by the input one. + my $j; + for ($j = $i+1; $j < $range_list_size; $j++) { + trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace; + last if $end < $r->[$j]->start; + + # If there is a gap between when this range starts and the + # previous one ends, add a span to fill it. Note that just + # because there are two ranges doesn't mean there is a + # non-zero gap between them. It could be that they have + # different values or types + if ($r->[$j-1]->end + 1 != $r->[$j]->start) { + push @gap_list, + Range->new($r->[$j-1]->end + 1, + $r->[$j]->start - 1, + Type => $type); + trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace; + } + } + + # Here, we have either found an existing range in the range list, + # beyond the area affected by the input one, or we fell off the + # end of the loop because the input range affects the whole rest + # of the range list. In either case, $j is 1 higher than the + # highest affected range. If $j == $i, it means that there are no + # affected ranges, that the entire insertion is in the gap between + # r[$i-1], and r[$i], which we already have taken care of before + # the loop. + # On the other hand, if there are affected ranges, it might be + # that there is a gap that needs filling after the final such + # range to the end of the input range + if ($r->[$j-1]->end < $end) { + push @gap_list, Range->new(main::max($start, + $r->[$j-1]->end + 1), + $end, + Type => $type); + trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace; + } + + # Call recursively to fill in all the gaps. + foreach my $gap (@gap_list) { + $self->_add_delete($operation, + $gap->start, + $gap->end, + $value, + Type => $type); + } + + return; + } + + # Here, we have taken care of the case where $replace is $NO, which + # means that whatever action we now take is done unconditionally. It + # still could be that this call will result in a no-op, if duplicates + # aren't allowed, and we are inserting a range that merely duplicates + # data already in the range list; or also if deleting a non-existent + # range. + # $i still points to the first potential affected range. Now find the + # highest range affected, which will determine the length parameter to + # splice. (The input range can span multiple existing ones.) While + # we are looking through the range list, see also if this is an + # insertion that will change the values of at least one of the + # affected ranges. We don't need to do this check unless this is an + # insertion of non-multiples, and also since this is a boolean, we + # don't need to do it if have already determined that it will make a + # change; just unconditionally change them. $cdm is created to be 1 + # if either of these is true. (The 'c' in the name comes from below) + my $cdm = ($operation eq '-' || $replace == $MULTIPLE); + my $j; # This will point to the highest affected range + + # For non-zero types, the standard form is the value itself; + my $standard_form = ($type) ? $value : main::standardize($value); + + for ($j = $i; $j < $range_list_size; $j++) { + trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace; + + # If find a range that it doesn't overlap into, we can stop + # searching + last if $end < $r->[$j]->start; + + # Here, overlaps the range at $j. If the value's don't match, + # and this is supposedly an insertion, it becomes a change + # instead. This is what the 'c' stands for in $cdm. + if (! $cdm) { + if ($r->[$j]->standard_form ne $standard_form) { + $cdm = 1; + } + else { + + # Here, the two values are essentially the same. If the + # two are actually identical, replacing wouldn't change + # anything so skip it. + my $pre_existing = $r->[$j]->value; + if ($pre_existing ne $value) { + + # Here the new and old standardized values are the + # same, but the non-standardized values aren't. If + # replacing unconditionally, then replace + if( $replace == $UNCONDITIONALLY) { + $cdm = 1; + } + else { + + # Here, are replacing conditionally. Decide to + # replace or not based on which appears to look + # the "nicest". If one is mixed case and the + # other isn't, choose the mixed case one. + my $new_mixed = $value =~ /[A-Z]/ + && $value =~ /[a-z]/; + my $old_mixed = $pre_existing =~ /[A-Z]/ + && $pre_existing =~ /[a-z]/; + + if ($old_mixed != $new_mixed) { + $cdm = 1 if $new_mixed; + if (main::DEBUG && $to_trace) { + if ($cdm) { + trace "Replacing $pre_existing with $value"; + } + else { + trace "Retaining $pre_existing over $value"; + } + } + } + else { + + # Here casing wasn't different between the two. + # If one has hyphens or underscores and the + # other doesn't, choose the one with the + # punctuation. + my $new_punct = $value =~ /[-_]/; + my $old_punct = $pre_existing =~ /[-_]/; + + if ($old_punct != $new_punct) { + $cdm = 1 if $new_punct; + if (main::DEBUG && $to_trace) { + if ($cdm) { + trace "Replacing $pre_existing with $value"; + } + else { + trace "Retaining $pre_existing over $value"; + } + } + } # else existing one is just as "good"; + # retain it to save cycles. + } + } + } + } + } + } # End of loop looking for highest affected range. + + # Here, $j points to one beyond the highest range that this insertion + # affects (hence to beyond the range list if that range is the final + # one in the range list). + + # The splice length is all the affected ranges. Get it before + # subtracting, for efficiency, so we don't have to later add 1. + my $length = $j - $i; + + $j--; # $j now points to the highest affected range. + trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace; + + # If inserting a multiple record, this is where it goes, after all the + # existing ones for this range. This implies an insertion, and no + # change to any existing ranges. Note that $j can be -1 if this new + # range doesn't actually duplicate any existing, and comes at the + # beginning of the list, in which case we can handle it like any other + # insertion, and is easier to do so. + if ($replace == $MULTIPLE && $j >= 0) { + + # This restriction could be remedied with a little extra work, but + # it won't hopefully ever be necessary + if ($r->[$j]->start != $r->[$j]->end) { + Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken."); + return; + } + + # Don't add an exact duplicate, as it isn't really a multiple + return if $value eq $r->[$j]->value && $type eq $r->[$j]->type; + + trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace; + my @return = splice @$r, + $j+1, + 0, + Range->new($start, + $end, + Value => $value, + Type => $type); + if (main::DEBUG && $to_trace) { + trace "After splice:"; + trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2; + trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1; + trace "j =[", $j, "]", $r->[$j] if $j >= 0; + trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1; + trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2; + trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3; + } + return @return; + } + + # Here, have taken care of $NO and $MULTIPLE replaces. + # $j points to the highest affected range. But it can be < $i or even + # -1. These happen only if the insertion is entirely in the gap + # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop + # above exited first time through with $end < $r->[$i]->start. (And + # then we subtracted one from j) This implies also that $start < + # $r->[$i]->start, but we know from above that $r->[$i-1]->end < + # $start, so the entire input range is in the gap. + if ($j < $i) { + + # Here the entire input range is in the gap before $i. + + if (main::DEBUG && $to_trace) { + if ($i) { + trace "Entire range is between $r->[$i-1] and $r->[$i]"; + } + else { + trace "Entire range is before $r->[$i]"; + } + } + return if $operation ne '+'; # Deletion of a non-existent range is + # a no-op + } + else { + + # Here the entire input range is not in the gap before $i. There + # is an affected one, and $j points to the highest such one. + + # At this point, here is the situation: + # This is not an insertion of a multiple, nor of tentative ($NO) + # data. + # $i points to the first element in the current range list that + # may be affected by this operation. In fact, we know + # that the range at $i is affected because we are in + # the else branch of this 'if' + # $j points to the highest affected range. + # In other words, + # r[$i-1]->end < $start <= r[$i]->end + # And: + # r[$i-1]->end < $start <= $end <= r[$j]->end + # + # Also: + # $cdm is a boolean which is set true if and only if this is a + # change or deletion (multiple was handled above). In + # other words, it could be renamed to be just $cd. + + # We now have enough information to decide if this call is a no-op + # or not. It is a no-op if it is a deletion of a non-existent + # range, or an insertion of already existing data. + + if (main::DEBUG && $to_trace && ! $cdm + && $i == $j + && $start >= $r->[$i]->start) + { + trace "no-op"; + } + return if ! $cdm # change or delete => not no-op + && $i == $j # more than one affected range => not no-op + + # Here, r[$i-1]->end < $start <= $end <= r[$i]->end + # Further, $start and/or $end is >= r[$i]->start + # The test below hence guarantees that + # r[$i]->start < $start <= $end <= r[$i]->end + # This means the input range is contained entirely in + # the one at $i, so is a no-op + && $start >= $r->[$i]->start; + } + + # Here, we know that some action will have to be taken. We have + # calculated the offset and length (though adjustments may be needed) + # for the splice. Now start constructing the replacement list. + my @replacement; + my $splice_start = $i; + + my $extends_below; + my $extends_above; + + # See if should extend any adjacent ranges. + if ($operation eq '-') { # Don't extend deletions + $extends_below = $extends_above = 0; + } + else { # Here, should extend any adjacent ranges. See if there are + # any. + $extends_below = ($i > 0 + # can't extend unless adjacent + && $r->[$i-1]->end == $start -1 + # can't extend unless are same standard value + && $r->[$i-1]->standard_form eq $standard_form + # can't extend unless share type + && $r->[$i-1]->type == $type); + $extends_above = ($j+1 < $range_list_size + && $r->[$j+1]->start == $end +1 + && $r->[$j+1]->standard_form eq $standard_form + && $r->[$j-1]->type == $type); + } + if ($extends_below && $extends_above) { # Adds to both + $splice_start--; # start replace at element below + $length += 2; # will replace on both sides + trace "Extends both below and above ranges" if main::DEBUG && $to_trace; + + # The result will fill in any gap, replacing both sides, and + # create one large range. + @replacement = Range->new($r->[$i-1]->start, + $r->[$j+1]->end, + Value => $value, + Type => $type); + } + else { + + # Here we know that the result won't just be the conglomeration of + # a new range with both its adjacent neighbors. But it could + # extend one of them. + + if ($extends_below) { + + # Here the new element adds to the one below, but not to the + # one above. If inserting, and only to that one range, can + # just change its ending to include the new one. + if ($length == 0 && ! $cdm) { + $r->[$i-1]->set_end($end); + trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace; + return; + } + else { + trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace; + $splice_start--; # start replace at element below + $length++; # will replace the element below + $start = $r->[$i-1]->start; + } + } + elsif ($extends_above) { + + # Here the new element adds to the one above, but not below. + # Mirror the code above + if ($length == 0 && ! $cdm) { + $r->[$j+1]->set_start($start); + trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace; + return; + } + else { + trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace; + $length++; # will replace the element above + $end = $r->[$j+1]->end; + } + } + + trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace; + + # Finally, here we know there will have to be a splice. + # If the change or delete affects only the highest portion of the + # first affected range, the range will have to be split. The + # splice will remove the whole range, but will replace it by a new + # range containing just the unaffected part. So, in this case, + # add to the replacement list just this unaffected portion. + if (! $extends_below + && $start > $r->[$i]->start && $start <= $r->[$i]->end) + { + push @replacement, + Range->new($r->[$i]->start, + $start - 1, + Value => $r->[$i]->value, + Type => $r->[$i]->type); + } + + # In the case of an insert or change, but not a delete, we have to + # put in the new stuff; this comes next. + if ($operation eq '+') { + push @replacement, Range->new($start, + $end, + Value => $value, + Type => $type); + } + + trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i; + #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace; + + # And finally, if we're changing or deleting only a portion of the + # highest affected range, it must be split, as the lowest one was. + if (! $extends_above + && $j >= 0 # Remember that j can be -1 if before first + # current element + && $end >= $r->[$j]->start + && $end < $r->[$j]->end) + { + push @replacement, + Range->new($end + 1, + $r->[$j]->end, + Value => $r->[$j]->value, + Type => $r->[$j]->type); + } + } + + # And do the splice, as calculated above + if (main::DEBUG && $to_trace) { + trace "replacing $length element(s) at $i with "; + foreach my $replacement (@replacement) { + trace " $replacement"; + } + trace "Before splice:"; + trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; + trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; + trace "i =[", $i, "]", $r->[$i]; + trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; + trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; + } + + my @return = splice @$r, $splice_start, $length, @replacement; + + if (main::DEBUG && $to_trace) { + trace "After splice:"; + trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2; + trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1; + trace "i =[", $i, "]", $r->[$i]; + trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; + trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; + trace "removed @return"; + } + + # An actual deletion could have changed the maximum in the list. + # There was no deletion if the splice didn't return something, but + # otherwise recalculate it. This is done too rarely to worry about + # performance. + if ($operation eq '-' && @return) { + $max{$addr} = $r->[-1]->end; + } + return @return; + } + + sub reset_each_range { # reset the iterator for each_range(); + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + no overloading; + undef $each_range_iterator{pack 'J', $self}; + return; + } + + sub each_range { + # Iterate over each range in a range list. Results are undefined if + # the range list is changed during the iteration. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return if $self->is_empty; + + $each_range_iterator{$addr} = -1 + if ! defined $each_range_iterator{$addr}; + $each_range_iterator{$addr}++; + return $ranges{$addr}->[$each_range_iterator{$addr}] + if $each_range_iterator{$addr} < @{$ranges{$addr}}; + undef $each_range_iterator{$addr}; + return; + } + + sub count { # Returns count of code points in range list + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + my $count = 0; + foreach my $range (@{$ranges{$addr}}) { + $count += $range->end - $range->start + 1; + } + return $count; + } + + sub delete_range { # Delete a range + my $self = shift; + my $start = shift; + my $end = shift; + + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return $self->_add_delete('-', $start, $end, ""); + } + + sub is_empty { # Returns boolean as to if a range list is empty + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + no overloading; + return scalar @{$ranges{pack 'J', $self}} == 0; + } + + sub hash { + # Quickly returns a scalar suitable for separating tables into + # buckets, i.e. it is a hash function of the contents of a table, so + # there are relatively few conflicts. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # These are quickly computable. Return looks like 'min..max;count' + return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}}; + } +} # End closure for _Range_List_Base + +package Range_List; +use base '_Range_List_Base'; + +# A Range_List is a range list for match tables; i.e. the range values are +# not significant. Thus a number of operations can be safely added to it, +# such as inversion, intersection. Note that union is also an unsafe +# operation when range values are cared about, and that method is in the base +# class, not here. But things are set up so that that method is callable only +# during initialization. Only in this derived class, is there an operation +# that combines two tables. A Range_Map can thus be used to initialize a +# Range_List, and its mappings will be in the list, but are not significant to +# this class. + +sub trace { return main::trace(@_); } + +{ # Closure + + use overload + fallback => 0, + '+' => sub { my $self = shift; + my $other = shift; + + return $self->_union($other) + }, + '&' => sub { my $self = shift; + my $other = shift; + + return $self->_intersect($other, 0); + }, + '~' => "_invert", + '-' => "_subtract", + ; + + sub _invert { + # Returns a new Range_List that gives all code points not in $self. + + my $self = shift; + + my $new = Range_List->new; + + # Go through each range in the table, finding the gaps between them + my $max = -1; # Set so no gap before range beginning at 0 + for my $range ($self->ranges) { + my $start = $range->start; + my $end = $range->end; + + # If there is a gap before this range, the inverse will contain + # that gap. + if ($start > $max + 1) { + $new->add_range($max + 1, $start - 1); + } + $max = $end; + } + + # And finally, add the gap from the end of the table to the max + # possible code point + if ($max < $LAST_UNICODE_CODEPOINT) { + $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT); + } + return $new; + } + + sub _subtract { + # Returns a new Range_List with the argument deleted from it. The + # argument can be a single code point, a range, or something that has + # a range, with the _range_list() method on it returning them + + my $self = shift; + my $other = shift; + my $reversed = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if ($reversed) { + Carp::my_carp_bug("Can't cope with a " + . __PACKAGE__ + . " being the second parameter in a '-'. Subtraction ignored."); + return $self; + } + + my $new = Range_List->new(Initialize => $self); + + if (! ref $other) { # Single code point + $new->delete_range($other, $other); + } + elsif ($other->isa('Range')) { + $new->delete_range($other->start, $other->end); + } + elsif ($other->can('_range_list')) { + foreach my $range ($other->_range_list->ranges) { + $new->delete_range($range->start, $range->end); + } + } + else { + Carp::my_carp_bug("Can't cope with a " + . ref($other) + . " argument to '-'. Subtraction ignored." + ); + return $self; + } + + return $new; + } + + sub _intersect { + # Returns either a boolean giving whether the two inputs' range lists + # intersect (overlap), or a new Range_List containing the intersection + # of the two lists. The optional final parameter being true indicates + # to do the check instead of the intersection. + + my $a_object = shift; + my $b_object = shift; + my $check_if_overlapping = shift; + $check_if_overlapping = 0 unless defined $check_if_overlapping; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if (! defined $b_object) { + my $message = ""; + $message .= $a_object->_owner_name_of if defined $a_object; + Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done."); + return; + } + + # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b ) + # Thus the intersection could be much more simply be written: + # return ~(~$a_object + ~$b_object); + # But, this is slower, and when taking the inverse of a large + # range_size_1 table, back when such tables were always stored that + # way, it became prohibitively slow, hence the code was changed to the + # below + + if ($b_object->isa('Range')) { + $b_object = Range_List->new(Initialize => $b_object, + Owner => $a_object->_owner_name_of); + } + $b_object = $b_object->_range_list if $b_object->can('_range_list'); + + my @a_ranges = $a_object->ranges; + my @b_ranges = $b_object->ranges; + + #local $to_trace = 1 if main::DEBUG; + trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace; + + # Start with the first range in each list + my $a_i = 0; + my $range_a = $a_ranges[$a_i]; + my $b_i = 0; + my $range_b = $b_ranges[$b_i]; + + my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of) + if ! $check_if_overlapping; + + # If either list is empty, there is no intersection and no overlap + if (! defined $range_a || ! defined $range_b) { + return $check_if_overlapping ? 0 : $new; + } + trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; + + # Otherwise, must calculate the intersection/overlap. Start with the + # very first code point in each list + my $a = $range_a->start; + my $b = $range_b->start; + + # Loop through all the ranges of each list; in each iteration, $a and + # $b are the current code points in their respective lists + while (1) { + + # If $a and $b are the same code point, ... + if ($a == $b) { + + # it means the lists overlap. If just checking for overlap + # know the answer now, + return 1 if $check_if_overlapping; + + # The intersection includes this code point plus anything else + # common to both current ranges. + my $start = $a; + my $end = main::min($range_a->end, $range_b->end); + if (! $check_if_overlapping) { + trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace; + $new->add_range($start, $end); + } + + # Skip ahead to the end of the current intersect + $a = $b = $end; + + # If the current intersect ends at the end of either range (as + # it must for at least one of them), the next possible one + # will be the beginning code point in it's list's next range. + if ($a == $range_a->end) { + $range_a = $a_ranges[++$a_i]; + last unless defined $range_a; + $a = $range_a->start; + } + if ($b == $range_b->end) { + $range_b = $b_ranges[++$b_i]; + last unless defined $range_b; + $b = $range_b->start; + } + + trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace; + } + elsif ($a < $b) { + + # Not equal, but if the range containing $a encompasses $b, + # change $a to be the middle of the range where it does equal + # $b, so the next iteration will get the intersection + if ($range_a->end >= $b) { + $a = $b; + } + else { + + # Here, the current range containing $a is entirely below + # $b. Go try to find a range that could contain $b. + $a_i = $a_object->_search_ranges($b); + + # If no range found, quit. + last unless defined $a_i; + + # The search returns $a_i, such that + # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end + # Set $a to the beginning of this new range, and repeat. + $range_a = $a_ranges[$a_i]; + $a = $range_a->start; + } + } + else { # Here, $b < $a. + + # Mirror image code to the leg just above + if ($range_b->end >= $a) { + $b = $a; + } + else { + $b_i = $b_object->_search_ranges($a); + last unless defined $b_i; + $range_b = $b_ranges[$b_i]; + $b = $range_b->start; + } + } + } # End of looping through ranges. + + # Intersection fully computed, or now know that there is no overlap + return $check_if_overlapping ? 0 : $new; + } + + sub overlaps { + # Returns boolean giving whether the two arguments overlap somewhere + + my $self = shift; + my $other = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return $self->_intersect($other, 1); + } + + sub add_range { + # Add a range to the list. + + my $self = shift; + my $start = shift; + my $end = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return $self->_add_delete('+', $start, $end, ""); + } + + sub is_code_point_usable { + # This used only for making the test script. See if the input + # proposed trial code point is one that Perl will handle. If second + # parameter is 0, it won't select some code points for various + # reasons, noted below. + + my $code = shift; + my $try_hard = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return 0 if $code < 0; # Never use a negative + + # shun null. I'm (khw) not sure why this was done, but NULL would be + # the character very frequently used. + return $try_hard if $code == 0x0000; + + return 0 if $try_hard; # XXX Temporary until fix utf8.c + + # shun non-character code points. + return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; + return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF + + return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range + return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate + + return 1; + } + + sub get_valid_code_point { + # Return a code point that's part of the range list. Returns nothing + # if the table is empty or we can't find a suitable code point. This + # used only for making the test script. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # On first pass, don't choose less desirable code points; if no good + # one is found, repeat, allowing a less desirable one to be selected. + for my $try_hard (0, 1) { + + # Look through all the ranges for a usable code point. + for my $set ($self->ranges) { + + # Try the edge cases first, starting with the end point of the + # range. + my $end = $set->end; + return $end if is_code_point_usable($end, $try_hard); + + # End point didn't, work. Start at the beginning and try + # every one until find one that does work. + for my $trial ($set->start .. $end - 1) { + return $trial if is_code_point_usable($trial, $try_hard); + } + } + } + return (); # If none found, give up. + } + + sub get_invalid_code_point { + # Return a code point that's not part of the table. Returns nothing + # if the table covers all code points or a suitable code point can't + # be found. This used only for making the test script. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Just find a valid code point of the inverse, if any. + return Range_List->new(Initialize => ~ $self)->get_valid_code_point; + } +} # end closure for Range_List + +package Range_Map; +use base '_Range_List_Base'; + +# A Range_Map is a range list in which the range values (called maps) are +# significant, and hence shouldn't be manipulated by our other code, which +# could be ambiguous or lose things. For example, in taking the union of two +# lists, which share code points, but which have differing values, which one +# has precedence in the union? +# It turns out that these operations aren't really necessary for map tables, +# and so this class was created to make sure they aren't accidentally +# applied to them. + +{ # Closure + + sub add_map { + # Add a range containing a mapping value to the list + + my $self = shift; + # Rest of parameters passed on + + return $self->_add_delete('+', @_); + } + + sub add_duplicate { + # Adds entry to a range list which can duplicate an existing entry + + my $self = shift; + my $code_point = shift; + my $value = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return $self->add_map($code_point, $code_point, + $value, Replace => $MULTIPLE); + } +} # End of closure for package Range_Map + +package _Base_Table; + +# A table is the basic data structure that gets written out into a file for +# use by the Perl core. This is the abstract base class implementing the +# common elements from the derived ones. A list of the methods to be +# furnished by an implementing class is just after the constructor. + +sub standardize { return main::standardize($_[0]); } +sub trace { return main::trace(@_); } + +{ # Closure + + main::setup_package(); + + my %range_list; + # Object containing the ranges of the table. + main::set_access('range_list', \%range_list, 'p_r', 'p_s'); + + my %full_name; + # The full table name. + main::set_access('full_name', \%full_name, 'r'); + + my %name; + # The table name, almost always shorter + main::set_access('name', \%name, 'r'); + + my %short_name; + # The shortest of all the aliases for this table, with underscores removed + main::set_access('short_name', \%short_name); + + my %nominal_short_name_length; + # The length of short_name before removing underscores + main::set_access('nominal_short_name_length', + \%nominal_short_name_length); + + my %complete_name; + # The complete name, including property. + main::set_access('complete_name', \%complete_name, 'r'); + + my %property; + # Parent property this table is attached to. + main::set_access('property', \%property, 'r'); + + my %aliases; + # Ordered list of aliases of the table's name. The first ones in the list + # are output first in comments + main::set_access('aliases', \%aliases, 'readable_array'); + + my %comment; + # A comment associated with the table for human readers of the files + main::set_access('comment', \%comment, 's'); + + my %description; + # A comment giving a short description of the table's meaning for human + # readers of the files. + main::set_access('description', \%description, 'readable_array'); + + my %note; + # A comment giving a short note about the table for human readers of the + # files. + main::set_access('note', \%note, 'readable_array'); + + my %internal_only; + # Boolean; if set means any file that contains this table is marked as for + # internal-only use. + main::set_access('internal_only', \%internal_only); + + my %find_table_from_alias; + # The parent property passes this pointer to a hash which this class adds + # all its aliases to, so that the parent can quickly take an alias and + # find this table. + main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r'); + + my %locked; + # After this table is made equivalent to another one; we shouldn't go + # changing the contents because that could mean it's no longer equivalent + main::set_access('locked', \%locked, 'r'); + + my %file_path; + # This gives the final path to the file containing the table. Each + # directory in the path is an element in the array + main::set_access('file_path', \%file_path, 'readable_array'); + + my %status; + # What is the table's status, normal, $OBSOLETE, etc. Enum + main::set_access('status', \%status, 'r'); + + my %status_info; + # A comment about its being obsolete, or whatever non normal status it has + main::set_access('status_info', \%status_info, 'r'); + + my %range_size_1; + # Is the table to be output with each range only a single code point? + # This is done to avoid breaking existing code that may have come to rely + # on this behavior in previous versions of this program.) + main::set_access('range_size_1', \%range_size_1, 'r', 's'); + + my %perl_extension; + # A boolean set iff this table is a Perl extension to the Unicode + # standard. + main::set_access('perl_extension', \%perl_extension, 'r'); + + my %output_range_counts; + # A boolean set iff this table is to have comments written in the + # output file that contain the number of code points in the range. + # The constructor can override the global flag of the same name. + main::set_access('output_range_counts', \%output_range_counts, 'r'); + + sub new { + # All arguments are key => value pairs, which you can see below, most + # of which match fields documented above. Otherwise: Pod_Entry, + # Externally_Ok, and Fuzzy apply to the names of the table, and are + # documented in the Alias package + + return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; + + my $class = shift; + + my $self = bless \do { my $anonymous_scalar }, $class; + my $addr = do { no overloading; pack 'J', $self; }; + + my %args = @_; + + $name{$addr} = delete $args{'Name'}; + $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'}; + $full_name{$addr} = delete $args{'Full_Name'}; + my $complete_name = $complete_name{$addr} + = delete $args{'Complete_Name'}; + $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0; + $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; + $property{$addr} = delete $args{'_Property'}; + $range_list{$addr} = delete $args{'_Range_List'}; + $status{$addr} = delete $args{'Status'} || $NORMAL; + $status_info{$addr} = delete $args{'_Status_Info'} || ""; + $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; + $range_size_1{$addr} = 1 if $output_names; # Make sure 1 name per line + + my $description = delete $args{'Description'}; + my $externally_ok = delete $args{'Externally_Ok'}; + my $loose_match = delete $args{'Fuzzy'}; + my $note = delete $args{'Note'}; + my $make_pod_entry = delete $args{'Pod_Entry'}; + my $perl_extension = delete $args{'Perl_Extension'}; + + # Shouldn't have any left over + Carp::carp_extra_args(\%args) if main::DEBUG && %args; + + # Can't use || above because conceivably the name could be 0, and + # can't use // operator in case this program gets used in Perl 5.8 + $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr}; + $output_range_counts{$addr} = $output_range_counts if + ! defined $output_range_counts{$addr}; + + $aliases{$addr} = [ ]; + $comment{$addr} = [ ]; + $description{$addr} = [ ]; + $note{$addr} = [ ]; + $file_path{$addr} = [ ]; + $locked{$addr} = ""; + + push @{$description{$addr}}, $description if $description; + push @{$note{$addr}}, $note if $note; + + if ($status{$addr} eq $PLACEHOLDER) { + + # A placeholder table doesn't get documented, is a perl extension, + # and quite likely will be empty + $make_pod_entry = 0 if ! defined $make_pod_entry; + $perl_extension = 1 if ! defined $perl_extension; + push @tables_that_may_be_empty, $complete_name{$addr}; + } + elsif (! $status{$addr}) { + + # If hasn't set its status already, see if it is on one of the + # lists of properties or tables that have particular statuses; if + # not, is normal. The lists are prioritized so the most serious + # ones are checked first + if (exists $why_suppressed{$complete_name} + # Don't suppress if overriden + && ! grep { $_ eq $complete_name{$addr} } + @output_mapped_properties) + { + $status{$addr} = $SUPPRESSED; + } + elsif (exists $why_deprecated{$complete_name}) { + $status{$addr} = $DEPRECATED; + } + elsif (exists $why_stabilized{$complete_name}) { + $status{$addr} = $STABILIZED; + } + elsif (exists $why_obsolete{$complete_name}) { + $status{$addr} = $OBSOLETE; + } + + # Existence above doesn't necessarily mean there is a message + # associated with it. Use the most serious message. + if ($status{$addr}) { + if ($why_suppressed{$complete_name}) { + $status_info{$addr} + = $why_suppressed{$complete_name}; + } + elsif ($why_deprecated{$complete_name}) { + $status_info{$addr} + = $why_deprecated{$complete_name}; + } + elsif ($why_stabilized{$complete_name}) { + $status_info{$addr} + = $why_stabilized{$complete_name}; + } + elsif ($why_obsolete{$complete_name}) { + $status_info{$addr} + = $why_obsolete{$complete_name}; + } + } + } + + $perl_extension{$addr} = $perl_extension || 0; + + # By convention what typically gets printed only or first is what's + # first in the list, so put the full name there for good output + # clarity. Other routines rely on the full name being first on the + # list + $self->add_alias($full_name{$addr}, + Externally_Ok => $externally_ok, + Fuzzy => $loose_match, + Pod_Entry => $make_pod_entry, + Status => $status{$addr}, + ); + + # Then comes the other name, if meaningfully different. + if (standardize($full_name{$addr}) ne standardize($name{$addr})) { + $self->add_alias($name{$addr}, + Externally_Ok => $externally_ok, + Fuzzy => $loose_match, + Pod_Entry => $make_pod_entry, + Status => $status{$addr}, + ); + } + + return $self; + } + + # Here are the methods that are required to be defined by any derived + # class + for my $sub (qw( + append_to_body + pre_body + )) + # append_to_body and pre_body are called in the write() method + # to add stuff after the main body of the table, but before + # its close; and to prepend stuff before the beginning of the + # table. + { + no strict "refs"; + *$sub = sub { + Carp::my_carp_bug( __LINE__ + . ": Must create method '$sub()' for " + . ref shift); + return; + } + } + + use overload + fallback => 0, + "." => \&main::_operator_dot, + '!=' => \&main::_operator_not_equal, + '==' => \&main::_operator_equal, + ; + + sub ranges { + # Returns the array of ranges associated with this table. + + no overloading; + return $range_list{pack 'J', shift}->ranges; + } + + sub add_alias { + # Add a synonym for this table. + + return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3; + + my $self = shift; + my $name = shift; # The name to add. + my $pointer = shift; # What the alias hash should point to. For + # map tables, this is the parent property; + # for match tables, it is the table itself. + + my %args = @_; + my $loose_match = delete $args{'Fuzzy'}; + + my $make_pod_entry = delete $args{'Pod_Entry'}; + $make_pod_entry = $YES unless defined $make_pod_entry; + + my $externally_ok = delete $args{'Externally_Ok'}; + $externally_ok = 1 unless defined $externally_ok; + + my $status = delete $args{'Status'}; + $status = $NORMAL unless defined $status; + + Carp::carp_extra_args(\%args) if main::DEBUG && %args; + + # Capitalize the first letter of the alias unless it is one of the CJK + # ones which specifically begins with a lower 'k'. Do this because + # Unicode has varied whether they capitalize first letters or not, and + # have later changed their minds and capitalized them, but not the + # other way around. So do it always and avoid changes from release to + # release + $name = ucfirst($name) unless $name =~ /^k[A-Z]/; + + my $addr = do { no overloading; pack 'J', $self; }; + + # Figure out if should be loosely matched if not already specified. + if (! defined $loose_match) { + + # Is a loose_match if isn't null, and doesn't begin with an + # underscore and isn't just a number + if ($name ne "" + && substr($name, 0, 1) ne '_' + && $name !~ qr{^[0-9_.+-/]+$}) + { + $loose_match = 1; + } + else { + $loose_match = 0; + } + } + + # If this alias has already been defined, do nothing. + return if defined $find_table_from_alias{$addr}->{$name}; + + # That includes if it is standardly equivalent to an existing alias, + # in which case, add this name to the list, so won't have to search + # for it again. + my $standard_name = main::standardize($name); + if (defined $find_table_from_alias{$addr}->{$standard_name}) { + $find_table_from_alias{$addr}->{$name} + = $find_table_from_alias{$addr}->{$standard_name}; + return; + } + + # Set the index hash for this alias for future quick reference. + $find_table_from_alias{$addr}->{$name} = $pointer; + $find_table_from_alias{$addr}->{$standard_name} = $pointer; + local $to_trace = 0 if main::DEBUG; + trace "adding alias $name to $pointer" if main::DEBUG && $to_trace; + trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace; + + + # Put the new alias at the end of the list of aliases unless the final + # element begins with an underscore (meaning it is for internal perl + # use) or is all numeric, in which case, put the new one before that + # one. This floats any all-numeric or underscore-beginning aliases to + # the end. This is done so that they are listed last in output lists, + # to encourage the user to use a better name (either more descriptive + # or not an internal-only one) instead. This ordering is relied on + # implicitly elsewhere in this program, like in short_name() + my $list = $aliases{$addr}; + my $insert_position = (@$list == 0 + || (substr($list->[-1]->name, 0, 1) ne '_' + && $list->[-1]->name =~ /\D/)) + ? @$list + : @$list - 1; + splice @$list, + $insert_position, + 0, + Alias->new($name, $loose_match, $make_pod_entry, + $externally_ok, $status); + + # This name may be shorter than any existing ones, so clear the cache + # of the shortest, so will have to be recalculated. + no overloading; + undef $short_name{pack 'J', $self}; + return; + } + + sub short_name { + # Returns a name suitable for use as the base part of a file name. + # That is, shorter wins. It can return undef if there is no suitable + # name. The name has all non-essential underscores removed. + + # The optional second parameter is a reference to a scalar in which + # this routine will store the length the returned name had before the + # underscores were removed, or undef if the return is undef. + + # The shortest name can change if new aliases are added. So using + # this should be deferred until after all these are added. The code + # that does that should clear this one's cache. + # Any name with alphabetics is preferred over an all numeric one, even + # if longer. + + my $self = shift; + my $nominal_length_ptr = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # For efficiency, don't recalculate, but this means that adding new + # aliases could change what the shortest is, so the code that does + # that needs to undef this. + if (defined $short_name{$addr}) { + if ($nominal_length_ptr) { + $$nominal_length_ptr = $nominal_short_name_length{$addr}; + } + return $short_name{$addr}; + } + + # Look at each alias + foreach my $alias ($self->aliases()) { + + # Don't use an alias that isn't ok to use for an external name. + next if ! $alias->externally_ok; + + my $name = main::Standardize($alias->name); + trace $self, $name if main::DEBUG && $to_trace; + + # Take the first one, or a shorter one that isn't numeric. This + # relies on numeric aliases always being last in the array + # returned by aliases(). Any alpha one will have precedence. + if (! defined $short_name{$addr} + || ($name =~ /\D/ + && length($name) < length($short_name{$addr}))) + { + # Remove interior underscores. + ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; + + $nominal_short_name_length{$addr} = length $name; + } + } + + # If no suitable external name return undef + if (! defined $short_name{$addr}) { + $$nominal_length_ptr = undef if $nominal_length_ptr; + return; + } + + # Don't allow a null external name. + if ($short_name{$addr} eq "") { + $short_name{$addr} = '_'; + $nominal_short_name_length{$addr} = 1; + } + + trace $self, $short_name{$addr} if main::DEBUG && $to_trace; + + if ($nominal_length_ptr) { + $$nominal_length_ptr = $nominal_short_name_length{$addr}; + } + return $short_name{$addr}; + } + + sub external_name { + # Returns the external name that this table should be known by. This + # is usually the short_name, but not if the short_name is undefined. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $short = $self->short_name; + return $short if defined $short; + + return '_'; + } + + sub add_description { # Adds the parameter as a short description. + + my $self = shift; + my $description = shift; + chomp $description; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + no overloading; + push @{$description{pack 'J', $self}}, $description; + + return; + } + + sub add_note { # Adds the parameter as a short note. + + my $self = shift; + my $note = shift; + chomp $note; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + no overloading; + push @{$note{pack 'J', $self}}, $note; + + return; + } + + sub add_comment { # Adds the parameter as a comment. + + my $self = shift; + my $comment = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + chomp $comment; + + no overloading; + push @{$comment{pack 'J', $self}}, $comment; + + return; + } + + sub comment { + # Return the current comment for this table. If called in list + # context, returns the array of comments. In scalar, returns a string + # of each element joined together with a period ending each. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + my @list = @{$comment{$addr}}; + return @list if wantarray; + my $return = ""; + foreach my $sentence (@list) { + $return .= '. ' if $return; + $return .= $sentence; + $return =~ s/\.$//; + } + $return .= '.' if $return; + return $return; + } + + sub initialize { + # Initialize the table with the argument which is any valid + # initialization for range lists. + + my $self = shift; + my $addr = do { no overloading; pack 'J', $self; }; + my $initialization = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Replace the current range list with a new one of the same exact + # type. + my $class = ref $range_list{$addr}; + $range_list{$addr} = $class->new(Owner => $self, + Initialize => $initialization); + return; + + } + + sub header { + # The header that is output for the table in the file it is written + # in. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $return = ""; + $return .= $DEVELOPMENT_ONLY if $compare_versions; + $return .= $HEADER; + no overloading; + $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self}; + return $return; + } + + sub write { + # Write a representation of the table to its file. + + my $self = shift; + my $tab_stops = shift; # The number of tab stops over to put any + # comment. + my $suppress_value = shift; # Optional, if the value associated with + # a range equals this one, don't write + # the range + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # Start with the header + my @OUT = $self->header; + + # Then the comments + push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n" + if $comment{$addr}; + + # Then any pre-body stuff. + my $pre_body = $self->pre_body; + push @OUT, $pre_body, "\n" if $pre_body; + + # The main body looks like a 'here' document + push @OUT, "return <<'END';\n"; + + if ($range_list{$addr}->is_empty) { + + # This is a kludge for empty tables to silence a warning in + # utf8.c, which can't really deal with empty tables, but it can + # deal with a table that matches nothing, as the inverse of 'Any' + # does. + push @OUT, "!utf8::IsAny\n"; + } + else { + my $range_size_1 = $range_size_1{$addr}; + + # Output each range as part of the here document. + for my $set ($range_list{$addr}->ranges) { + my $start = $set->start; + my $end = $set->end; + my $value = $set->value; + + # Don't output ranges whose value is the one to suppress + next if defined $suppress_value && $value eq $suppress_value; + + # If has or wants a single point range output + if ($start == $end || $range_size_1) { + if (ref $range_size_1 eq 'CODE') { + for my $i ($start .. $end) { + push @OUT, &$range_size_1($i, $value); + } + } + else { + for my $i ($start .. $end) { + push @OUT, sprintf "%04X\t\t%s\n", $i, $value; + if ($output_names) { + if (! defined $viacode[$i]) { + $viacode[$i] = + Property::property_ref('Perl_Charnames') + ->value_of($i) + || ""; + } + $OUT[-1] =~ s/\n/\t# $viacode[$i]\n/; + } + } + } + } + else { + push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value; + + # Add a comment with the size of the range, if requested. + # Expand Tabs to make sure they all start in the same + # column, and then unexpand to use mostly tabs. + if (! $output_range_counts{$addr}) { + $OUT[-1] .= "\n"; + } + else { + $OUT[-1] = Text::Tabs::expand($OUT[-1]); + my $count = main::clarify_number($end - $start + 1); + use integer; + + my $width = $tab_stops * 8 - 1; + $OUT[-1] = sprintf("%-*s # [%s]\n", + $width, + $OUT[-1], + $count); + $OUT[-1] = Text::Tabs::unexpand($OUT[-1]); + } + } + } # End of loop through all the table's ranges + } + + # Add anything that goes after the main body, but within the here + # document, + my $append_to_body = $self->append_to_body; + push @OUT, $append_to_body if $append_to_body; + + # And finish the here document. + push @OUT, "END\n"; + + # All these files have a .pl suffix + $file_path{$addr}->[-1] .= '.pl'; + + main::write($file_path{$addr}, \@OUT); + return; + } + + sub set_status { # Set the table's status + my $self = shift; + my $status = shift; # The status enum value + my $info = shift; # Any message associated with it. + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + $status{$addr} = $status; + $status_info{$addr} = $info; + return; + } + + sub lock { + # Don't allow changes to the table from now on. This stores a stack + # trace of where it was called, so that later attempts to modify it + # can immediately show where it got locked. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + $locked{$addr} = ""; + + my $line = (caller(0))[2]; + my $i = 1; + + # Accumulate the stack trace + while (1) { + my ($pkg, $file, $caller_line, $caller) = caller $i++; + + last unless defined $caller; + + $locked{$addr} .= " called from $caller() at line $line\n"; + $line = $caller_line; + } + $locked{$addr} .= " called from main at line $line\n"; + + return; + } + + sub carp_if_locked { + # Return whether a table is locked or not, and, by the way, complain + # if is locked + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return 0 if ! $locked{$addr}; + Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n"); + return 1; + } + + sub set_file_path { # Set the final directory path for this table + my $self = shift; + # Rest of parameters passed on + + no overloading; + @{$file_path{pack 'J', $self}} = @_; + return + } + + # Accessors for the range list stored in this table. First for + # unconditional + for my $sub (qw( + contains + count + each_range + hash + is_empty + max + min + range_count + reset_each_range + value_of + )) + { + no strict "refs"; + *$sub = sub { + use strict "refs"; + my $self = shift; + no overloading; + return $range_list{pack 'J', $self}->$sub(@_); + } + } + + # Then for ones that should fail if locked + for my $sub (qw( + delete_range + )) + { + no strict "refs"; + *$sub = sub { + use strict "refs"; + my $self = shift; + + return if $self->carp_if_locked; + no overloading; + return $range_list{pack 'J', $self}->$sub(@_); + } + } + +} # End closure + +package Map_Table; +use base '_Base_Table'; + +# A Map Table is a table that contains the mappings from code points to +# values. There are two weird cases: +# 1) Anomalous entries are ones that aren't maps of ranges of code points, but +# are written in the table's file at the end of the table nonetheless. It +# requires specially constructed code to handle these; utf8.c can not read +# these in, so they should not go in $map_directory. As of this writing, +# the only case that these happen is for named sequences used in +# charnames.pm. But this code doesn't enforce any syntax on these, so +# something else could come along that uses it. +# 2) Specials are anything that doesn't fit syntactically into the body of the +# table. The ranges for these have a map type of non-zero. The code below +# knows about and handles each possible type. In most cases, these are +# written as part of the header. +# +# A map table deliberately can't be manipulated at will unlike match tables. +# This is because of the ambiguities having to do with what to do with +# overlapping code points. And there just isn't a need for those things; +# what one wants to do is just query, add, replace, or delete mappings, plus +# write the final result. +# However, there is a method to get the list of possible ranges that aren't in +# this table to use for defaulting missing code point mappings. And, +# map_add_or_replace_non_nulls() does allow one to add another table to this +# one, but it is clearly very specialized, and defined that the other's +# non-null values replace this one's if there is any overlap. + +sub trace { return main::trace(@_); } + +{ # Closure + + main::setup_package(); + + my %default_map; + # Many input files omit some entries; this gives what the mapping for the + # missing entries should be + main::set_access('default_map', \%default_map, 'r'); + + my %anomalous_entries; + # Things that go in the body of the table which don't fit the normal + # scheme of things, like having a range. Not much can be done with these + # once there except to output them. This was created to handle named + # sequences. + main::set_access('anomalous_entry', \%anomalous_entries, 'a'); + main::set_access('anomalous_entries', # Append singular, read plural + \%anomalous_entries, + 'readable_array'); + + my %format; + # The format of the entries of the table. This is calculated from the + # data in the table (or passed in the constructor). This is an enum e.g., + # $STRING_FORMAT + main::set_access('format', \%format); + + my %core_access; + # This is a string, solely for documentation, indicating how one can get + # access to this property via the Perl core. + main::set_access('core_access', \%core_access, 'r', 's'); + + my %has_specials; + # Boolean set when non-zero map-type ranges are added to this table, + # which happens in only a few tables. This is purely for performance, to + # avoid having to search through every table upon output, so if all the + # non-zero maps got deleted before output, this would remain set, and the + # only penalty would be performance. Currently, most map tables that get + # output have specials in them, so this doesn't help that much anyway. + main::set_access('has_specials', \%has_specials); + + my %to_output_map; + # Boolean as to whether or not to write out this map table + main::set_access('to_output_map', \%to_output_map, 's'); + + + sub new { + my $class = shift; + my $name = shift; + + my %args = @_; + + # Optional initialization data for the table. + my $initialize = delete $args{'Initialize'}; + + my $core_access = delete $args{'Core_Access'}; + my $default_map = delete $args{'Default_Map'}; + my $format = delete $args{'Format'}; + my $property = delete $args{'_Property'}; + my $full_name = delete $args{'Full_Name'}; + # Rest of parameters passed on + + my $range_list = Range_Map->new(Owner => $property); + + my $self = $class->SUPER::new( + Name => $name, + Complete_Name => $full_name, + Full_Name => $full_name, + _Property => $property, + _Range_List => $range_list, + %args); + + my $addr = do { no overloading; pack 'J', $self; }; + + $anomalous_entries{$addr} = []; + $core_access{$addr} = $core_access; + $default_map{$addr} = $default_map; + $format{$addr} = $format; + + $self->initialize($initialize) if defined $initialize; + + return $self; + } + + use overload + fallback => 0, + qw("") => "_operator_stringify", + ; + + sub _operator_stringify { + my $self = shift; + + my $name = $self->property->full_name; + $name = '""' if $name eq ""; + return "Map table for Property '$name'"; + } + + sub add_alias { + # Add a synonym for this table (which means the property itself) + my $self = shift; + my $name = shift; + # Rest of parameters passed on. + + $self->SUPER::add_alias($name, $self->property, @_); + return; + } + + sub add_map { + # Add a range of code points to the list of specially-handled code + # points. $MULTI_CP is assumed if the type of special is not passed + # in. + + my $self = shift; + my $lower = shift; + my $upper = shift; + my $string = shift; + my %args = @_; + + my $type = delete $args{'Type'} || 0; + # Rest of parameters passed on + + # Can't change the table if locked. + return if $self->carp_if_locked; + + my $addr = do { no overloading; pack 'J', $self; }; + + $has_specials{$addr} = 1 if $type; + + $self->_range_list->add_map($lower, $upper, + $string, + @_, + Type => $type); + return; + } + + sub append_to_body { + # Adds to the written HERE document of the table's body any anomalous + # entries in the table.. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return "" unless @{$anomalous_entries{$addr}}; + return join("\n", @{$anomalous_entries{$addr}}) . "\n"; + } + + sub map_add_or_replace_non_nulls { + # This adds the mappings in the table $other to $self. Non-null + # mappings from $other override those in $self. It essentially merges + # the two tables, with the second having priority except for null + # mappings. + + my $self = shift; + my $other = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return if $self->carp_if_locked; + + if (! $other->isa(__PACKAGE__)) { + Carp::my_carp_bug("$other should be a " + . __PACKAGE__ + . ". Not a '" + . ref($other) + . "'. Not added;"); + return; + } + + my $addr = do { no overloading; pack 'J', $self; }; + my $other_addr = do { no overloading; pack 'J', $other; }; + + local $to_trace = 0 if main::DEBUG; + + my $self_range_list = $self->_range_list; + my $other_range_list = $other->_range_list; + foreach my $range ($other_range_list->ranges) { + my $value = $range->value; + next if $value eq ""; + $self_range_list->_add_delete('+', + $range->start, + $range->end, + $value, + Type => $range->type, + Replace => $UNCONDITIONALLY); + } + + # Copy the specials information from the other table to $self + if ($has_specials{$other_addr}) { + $has_specials{$addr} = 1; + } + + return; + } + + sub set_default_map { + # Define what code points that are missing from the input files should + # map to + + my $self = shift; + my $map = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # Convert the input to the standard equivalent, if any (won't have any + # for $STRING properties) + my $standard = $self->_find_table_from_alias->{$map}; + $map = $standard->name if defined $standard; + + # Warn if there already is a non-equivalent default map for this + # property. Note that a default map can be a ref, which means that + # what it actually means is delayed until later in the program, and it + # IS permissible to override it here without a message. + my $default_map = $default_map{$addr}; + if (defined $default_map + && ! ref($default_map) + && $default_map ne $map + && main::Standardize($map) ne $default_map) + { + my $property = $self->property; + my $map_table = $property->table($map); + my $default_table = $property->table($default_map); + if (defined $map_table + && defined $default_table + && $map_table != $default_table) + { + Carp::my_carp("Changing the default mapping for " + . $property + . " from $default_map to $map'"); + } + } + + $default_map{$addr} = $map; + + # Don't also create any missing table for this map at this point, + # because if we did, it could get done before the main table add is + # done for PropValueAliases.txt; instead the caller will have to make + # sure it exists, if desired. + return; + } + + sub to_output_map { + # Returns boolean: should we write this map table? + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # If overridden, use that + return $to_output_map{$addr} if defined $to_output_map{$addr}; + + my $full_name = $self->full_name; + + # If table says to output, do so; if says to suppress it, do do. + return 1 if grep { $_ eq $full_name } @output_mapped_properties; + return 0 if $self->status eq $SUPPRESSED; + + my $type = $self->property->type; + + # Don't want to output binary map tables even for debugging. + return 0 if $type == $BINARY; + + # But do want to output string ones. + return 1 if $type == $STRING; + + # Otherwise is an $ENUM, don't output it + return 0; + } + + sub inverse_list { + # Returns a Range_List that is gaps of the current table. That is, + # the inversion + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $current = Range_List->new(Initialize => $self->_range_list, + Owner => $self->property); + return ~ $current; + } + + sub set_final_comment { + # Just before output, create the comment that heads the file + # containing this table. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # No sense generating a comment if aren't going to write it out. + return if ! $self->to_output_map; + + my $addr = do { no overloading; pack 'J', $self; }; + + my $property = $self->property; + + # Get all the possible names for this property. Don't use any that + # aren't ok for use in a file name, etc. This is perhaps causing that + # flag to do double duty, and may have to be changed in the future to + # have our own flag for just this purpose; but it works now to exclude + # Perl generated synonyms from the lists for properties, where the + # name is always the proper Unicode one. + my @property_aliases = grep { $_->externally_ok } $self->aliases; + + my $count = $self->count; + my $default_map = $default_map{$addr}; + + # The ranges that map to the default aren't output, so subtract that + # to get those actually output. A property with matching tables + # already has the information calculated. + if ($property->type != $STRING) { + $count -= $property->table($default_map)->count; + } + elsif (defined $default_map) { + + # But for $STRING properties, must calculate now. Subtract the + # count from each range that maps to the default. + foreach my $range ($self->_range_list->ranges) { + if ($range->value eq $default_map) { + $count -= $range->end +1 - $range->start; + } + } + + } + + # Get a string version of $count with underscores in large numbers, + # for clarity. + my $string_count = main::clarify_number($count); + + my $code_points = ($count == 1) + ? 'single code point' + : "$string_count code points"; + + my $mapping; + my $these_mappings; + my $are; + if (@property_aliases <= 1) { + $mapping = 'mapping'; + $these_mappings = 'this mapping'; + $are = 'is' + } + else { + $mapping = 'synonymous mappings'; + $these_mappings = 'these mappings'; + $are = 'are' + } + my $cp; + if ($count >= $MAX_UNICODE_CODEPOINTS) { + $cp = "any code point in Unicode Version $string_version"; + } + else { + my $map_to; + if ($default_map eq "") { + $map_to = 'the null string'; + } + elsif ($default_map eq $CODE_POINT) { + $map_to = "itself"; + } + else { + $map_to = "'$default_map'"; + } + if ($count == 1) { + $cp = "the single code point"; + } + else { + $cp = "one of the $code_points"; + } + $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to"; + } + + my $comment = ""; + + my $status = $self->status; + if ($status) { + my $warn = uc $status_past_participles{$status}; + $comment .= <name . '(cp)' + ); + } + $comment .= + "\nwhere 'cp' is $cp. Note that $these_mappings $are "; + + my $access = $core_access{$addr}; + if ($access) { + $comment .= "accessible through the Perl core via $access."; + } + else { + $comment .= "not accessible through the Perl core directly."; + } + + # And append any commentary already set from the actual property. + $comment .= "\n\n" . $self->comment if $self->comment; + if ($self->description) { + $comment .= "\n\n" . join " ", $self->description; + } + if ($self->note) { + $comment .= "\n\n" . join " ", $self->note; + } + $comment .= "\n"; + + if (! $self->perl_extension) { + $comment .= <range_size_1) { + $comment.= <output_range_counts) { + $comment .= <set_comment(main::join_lines($comment)); + return; + } + + my %swash_keys; # Makes sure don't duplicate swash names. + + sub pre_body { + # Returns the string that should be output in the file before the main + # body of this table. This includes some hash entries identifying the + # format of the body, and what the single value should be for all + # ranges missing from it. It also includes any code points which have + # map_types that don't go in the main table. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + my $name = $self->property->swash_name; + + if (defined $swash_keys{$name}) { + Carp::my_carp(join_lines(<_range_list; + foreach my $range ($range_map->ranges) { + next unless $range->type != 0; + my $low = $range->start; + my $high = $range->end; + my $map = $range->value; + my $type = $range->type; + + # No need to output the range if it maps to the default. And + # the write method won't output it either, so no need to + # delete it to keep it from being output, and is faster to + # skip than to delete anyway. + next if $map eq $default_map; + + # Delete the range to keep write() from trying to output it + $range_map->delete_range($low, $high); + + # Switch based on the map type... + if ($type == $HANGUL_SYLLABLE) { + + # These are entirely algorithmically determinable based on + # some constants furnished by Unicode; for now, just set a + # flag to indicate that have them. Below we will output + # the code that does the algorithm. + $has_hangul_syllables = 1; + } + elsif ($type == $CP_IN_NAME) { + + # If the name ends in the code point it represents, are + # also algorithmically determinable, but need information + # about the map to do so. Both the map and its inverse + # are stored in data structures output in the file. + push @{$names_ending_in_code_point{$map}->{'low'}}, $low; + push @{$names_ending_in_code_point{$map}->{'high'}}, $high; + + push @code_points_ending_in_code_point, { low => $low, + high => $high, + name => $map + }; + } + elsif ($range->type == $MULTI_CP || $range->type == $NULL) { + + # Multi-code point maps and null string maps have an entry + # for each code point in the range. They use the same + # output format. + for my $code_point ($low .. $high) { + + # The pack() below can't cope with surrogates. + if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { + Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created"); + next; + } + + # Generate the hash entries for these in the form that + # utf8.c understands. + my $tostr = ""; + foreach my $to (split " ", $map) { + if ($to !~ /^$code_point_re$/) { + Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created"); + next; + } + $tostr .= sprintf "\\x{%s}", $to; + } + + # I (khw) have never waded through this line to + # understand it well enough to comment it. + my $utf8 = sprintf(qq["%s" => "$tostr",], + join("", map { sprintf "\\x%02X", $_ } + unpack("U0C*", pack("U", $code_point)))); + + # Add a comment so that a human reader can more easily + # see what's going on. + push @multi_code_point_maps, + sprintf("%-45s # U+%04X => %s", $utf8, + $code_point, + $map); + } + } + else { + Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead"); + $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0); + } + } # End of loop through all ranges + + # Here have gone through the whole file. If actually generated + # anything for each map type, add its respective header and + # trailer + if (@multi_code_point_maps) { + $pre_body .= <[$i] > $code_point; + next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point; + + # Here, the code point is in the range. + return $code_point; + } + + # Here, looked like the name had a code point number in it, but + # did not match one of the valid ones. + return; + } + + sub code_point_to_name_special { + my $code_point = shift; + + # Returns the name of a code point if algorithmically determinable; + # undef if not +END + if ($has_hangul_syllables) { + $pre_body .= << 'END'; + + # If in the Hangul range, calculate the name based on Unicode's + # algorithm + if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { + use integer; + my $SIndex = $code_point - $SBase; + my $L = $LBase + $SIndex / $NCount; + my $V = $VBase + ($SIndex % $NCount) / $TCount; + my $T = $TBase + $SIndex % $TCount; + $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; + $name .= $Jamo{$T} if $T != $TBase; + return $name; + } +END + } + $pre_body .= << 'END'; + + # Look through list of these code points for one in range. + foreach my $hash (@code_points_ending_in_code_point) { + return if $code_point < $hash->{'low'}; + if ($code_point <= $hash->{'high'}) { + return sprintf("%s-%04X", $hash->{'name'}, $code_point); + } + } + return; # None found + } +} # End closure + +END + } # End of has hangul or code point in name maps. + } # End of has specials + + # Calculate the format of the table if not already done. + my $format = $format{$addr}; + my $property = $self->property; + my $type = $property->type; + if (! defined $format) { + if ($type == $BINARY) { + + # Don't bother checking the values, because we elsewhere + # verify that a binary table has only 2 values. + $format = $BINARY_FORMAT; + } + else { + my @ranges = $self->_range_list->ranges; + + # default an empty table based on its type and default map + if (! @ranges) { + + # But it turns out that the only one we can say is a + # non-string (besides binary, handled above) is when the + # table is a string and the default map is to a code point + if ($type == $STRING && $default_map eq $CODE_POINT) { + $format = $HEX_FORMAT; + } + else { + $format = $STRING_FORMAT; + } + } + else { + + # Start with the most restrictive format, and as we find + # something that doesn't fit with that, change to the next + # most restrictive, and so on. + $format = $DECIMAL_FORMAT; + foreach my $range (@ranges) { + my $map = $range->value; + if ($map ne $default_map) { + last if $format eq $STRING_FORMAT; # already at + # least + # restrictive + $format = $INTEGER_FORMAT + if $format eq $DECIMAL_FORMAT + && $map !~ / ^ [0-9] $ /x; + $format = $FLOAT_FORMAT + if $format eq $INTEGER_FORMAT + && $map !~ / ^ -? [0-9]+ $ /x; + $format = $RATIONAL_FORMAT + if $format eq $FLOAT_FORMAT + && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x; + $format = $HEX_FORMAT + if $format eq $RATIONAL_FORMAT + && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x; + $format = $STRING_FORMAT if $format eq $HEX_FORMAT + && $map =~ /[^0-9A-F]/; + } + } + } + } + } # end of calculating format + + my $return = <SUPER::write( + ($self->property == $block) + ? 7 # block file needs more tab stops + : 3, + $default_map{$addr}); # don't write defaulteds + } + + # Accessors for the underlying list that should fail if locked. + for my $sub (qw( + add_duplicate + )) + { + no strict "refs"; + *$sub = sub { + use strict "refs"; + my $self = shift; + + return if $self->carp_if_locked; + return $self->_range_list->$sub(@_); + } + } +} # End closure for Map_Table + +package Match_Table; +use base '_Base_Table'; + +# A Match table is one which is a list of all the code points that have +# the same property and property value, for use in \p{property=value} +# constructs in regular expressions. It adds very little data to the base +# structure, but many methods, as these lists can be combined in many ways to +# form new ones. +# There are only a few concepts added: +# 1) Equivalents and Relatedness. +# Two tables can match the identical code points, but have different names. +# This always happens when there is a perl single form extension +# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two +# tables are set to be related, with the Perl extension being a child, and +# the Unicode property being the parent. +# +# It may be that two tables match the identical code points and we don't +# know if they are related or not. This happens most frequently when the +# Block and Script properties have the exact range. But note that a +# revision to Unicode could add new code points to the script, which would +# now have to be in a different block (as the block was filled, or there +# would have been 'Unknown' script code points in it and they wouldn't have +# been identical). So we can't rely on any two properties from Unicode +# always matching the same code points from release to release, and thus +# these tables are considered coincidentally equivalent--not related. When +# two tables are unrelated but equivalent, one is arbitrarily chosen as the +# 'leader', and the others are 'equivalents'. This concept is useful +# to minimize the number of tables written out. Only one file is used for +# any identical set of code points, with entries in Heavy.pl mapping all +# the involved tables to it. +# +# Related tables will always be identical; we set them up to be so. Thus +# if the Unicode one is deprecated, the Perl one will be too. Not so for +# unrelated tables. Relatedness makes generating the documentation easier. +# +# 2) Conflicting. It may be that there will eventually be name clashes, with +# the same name meaning different things. For a while, there actually were +# conflicts, but they have so far been resolved by changing Perl's or +# Unicode's definitions to match the other, but when this code was written, +# it wasn't clear that that was what was going to happen. (Unicode changed +# because of protests during their beta period.) Name clashes are warned +# about during compilation, and the documentation. The generated tables +# are sane, free of name clashes, because the code suppresses the Perl +# version. But manual intervention to decide what the actual behavior +# should be may be required should this happen. The introductory comments +# have more to say about this. + +sub standardize { return main::standardize($_[0]); } +sub trace { return main::trace(@_); } + + +{ # Closure + + main::setup_package(); + + my %leader; + # The leader table of this one; initially $self. + main::set_access('leader', \%leader, 'r'); + + my %equivalents; + # An array of any tables that have this one as their leader + main::set_access('equivalents', \%equivalents, 'readable_array'); + + my %parent; + # The parent table to this one, initially $self. This allows us to + # distinguish between equivalent tables that are related, and those which + # may not be, but share the same output file because they match the exact + # same set of code points in the current Unicode release. + main::set_access('parent', \%parent, 'r'); + + my %children; + # An array of any tables that have this one as their parent + main::set_access('children', \%children, 'readable_array'); + + my %conflicting; + # Array of any tables that would have the same name as this one with + # a different meaning. This is used for the generated documentation. + main::set_access('conflicting', \%conflicting, 'readable_array'); + + my %matches_all; + # Set in the constructor for tables that are expected to match all code + # points. + main::set_access('matches_all', \%matches_all, 'r'); + + sub new { + my $class = shift; + + my %args = @_; + + # The property for which this table is a listing of property values. + my $property = delete $args{'_Property'}; + + my $name = delete $args{'Name'}; + my $full_name = delete $args{'Full_Name'}; + $full_name = $name if ! defined $full_name; + + # Optional + my $initialize = delete $args{'Initialize'}; + my $matches_all = delete $args{'Matches_All'} || 0; + # Rest of parameters passed on. + + my $range_list = Range_List->new(Initialize => $initialize, + Owner => $property); + + my $complete = $full_name; + $complete = '""' if $complete eq ""; # A null name shouldn't happen, + # but this helps debug if it + # does + # The complete name for a match table includes it's property in a + # compound form 'property=table', except if the property is the + # pseudo-property, perl, in which case it is just the single form, + # 'table' (If you change the '=' must also change the ':' in lots of + # places in this program that assume an equal sign) + $complete = $property->full_name . "=$complete" if $property != $perl; + + my $self = $class->SUPER::new(%args, + Name => $name, + Complete_Name => $complete, + Full_Name => $full_name, + _Property => $property, + _Range_List => $range_list, + ); + my $addr = do { no overloading; pack 'J', $self; }; + + $conflicting{$addr} = [ ]; + $equivalents{$addr} = [ ]; + $children{$addr} = [ ]; + $matches_all{$addr} = $matches_all; + $leader{$addr} = $self; + $parent{$addr} = $self; + + return $self; + } + + # See this program's beginning comment block about overloading these. + use overload + fallback => 0, + qw("") => "_operator_stringify", + '=' => sub { + my $self = shift; + + return if $self->carp_if_locked; + return $self; + }, + + '+' => sub { + my $self = shift; + my $other = shift; + + return $self->_range_list + $other; + }, + '&' => sub { + my $self = shift; + my $other = shift; + + return $self->_range_list & $other; + }, + '+=' => sub { + my $self = shift; + my $other = shift; + + return if $self->carp_if_locked; + + my $addr = do { no overloading; pack 'J', $self; }; + + if (ref $other) { + + # Change the range list of this table to be the + # union of the two. + $self->_set_range_list($self->_range_list + + $other); + } + else { # $other is just a simple value + $self->add_range($other, $other); + } + return $self; + }, + '-' => sub { my $self = shift; + my $other = shift; + my $reversed = shift; + + if ($reversed) { + Carp::my_carp_bug("Can't cope with a " + . __PACKAGE__ + . " being the first parameter in a '-'. Subtraction ignored."); + return; + } + + return $self->_range_list - $other; + }, + '~' => sub { my $self = shift; + return ~ $self->_range_list; + }, + ; + + sub _operator_stringify { + my $self = shift; + + my $name = $self->complete_name; + return "Table '$name'"; + } + + sub add_alias { + # Add a synonym for this table. See the comments in the base class + + my $self = shift; + my $name = shift; + # Rest of parameters passed on. + + $self->SUPER::add_alias($name, $self, @_); + return; + } + + sub add_conflicting { + # Add the name of some other object to the list of ones that name + # clash with this match table. + + my $self = shift; + my $conflicting_name = shift; # The name of the conflicting object + my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ? + my $conflicting_object = shift; # Optional, the conflicting object + # itself. This is used to + # disambiguate the text if the input + # name is identical to any of the + # aliases $self is known by. + # Sometimes the conflicting object is + # merely hypothetical, so this has to + # be an optional parameter. + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + # Check if the conflicting name is exactly the same as any existing + # alias in this table (as long as there is a real object there to + # disambiguate with). + if (defined $conflicting_object) { + foreach my $alias ($self->aliases) { + if ($alias->name eq $conflicting_name) { + + # Here, there is an exact match. This results in + # ambiguous comments, so disambiguate by changing the + # conflicting name to its object's complete equivalent. + $conflicting_name = $conflicting_object->complete_name; + last; + } + } + } + + # Convert to the \p{...} final name + $conflicting_name = "\\$p" . "{$conflicting_name}"; + + # Only add once + return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}}; + + push @{$conflicting{$addr}}, $conflicting_name; + + return; + } + + sub is_equivalent_to { + # Return boolean of whether or not the other object is a table of this + # type and has been marked equivalent to this one. + + my $self = shift; + my $other = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return 0 if ! defined $other; # Can happen for incomplete early + # releases + unless ($other->isa(__PACKAGE__)) { + my $ref_other = ref $other; + my $ref_self = ref $self; + Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); + return 0; + } + + # Two tables are equivalent if they have the same leader. + no overloading; + return $leader{pack 'J', $self} == $leader{pack 'J', $other}; + return; + } + + sub matches_identically_to { + # Return a boolean as to whether or not two tables match identical + # sets of code points. + + my $self = shift; + my $other = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + unless ($other->isa(__PACKAGE__)) { + my $ref_other = ref $other; + my $ref_self = ref $self; + Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self."); + return 0; + } + + # These are ordered in increasing real time to figure out (at least + # until a patch changes that and doesn't change this) + return 0 if $self->max != $other->max; + return 0 if $self->min != $other->min; + return 0 if $self->range_count != $other->range_count; + return 0 if $self->count != $other->count; + + # Here they could be identical because all the tests above passed. + # The loop below is somewhat simpler since we know they have the same + # number of elements. Compare range by range, until reach the end or + # find something that differs. + my @a_ranges = $self->_range_list->ranges; + my @b_ranges = $other->_range_list->ranges; + for my $i (0 .. @a_ranges - 1) { + my $a = $a_ranges[$i]; + my $b = $b_ranges[$i]; + trace "self $a; other $b" if main::DEBUG && $to_trace; + return 0 if $a->start != $b->start || $a->end != $b->end; + } + return 1; + } + + sub set_equivalent_to { + # Set $self equivalent to the parameter table. + # The required Related => 'x' parameter is a boolean indicating + # whether these tables are related or not. If related, $other becomes + # the 'parent' of $self; if unrelated it becomes the 'leader' + # + # Related tables share all characteristics except names; equivalents + # not quite so many. + # If they are related, one must be a perl extension. This is because + # we can't guarantee that Unicode won't change one or the other in a + # later release even if they are idential now. + + my $self = shift; + my $other = shift; + + my %args = @_; + my $related = delete $args{'Related'}; + + Carp::carp_extra_args(\%args) if main::DEBUG && %args; + + return if ! defined $other; # Keep on going; happens in some early + # Unicode releases. + + if (! defined $related) { + Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other"); + $related = 0; + } + + # If already are equivalent, no need to re-do it; if subroutine + # returns null, it found an error, also do nothing + my $are_equivalent = $self->is_equivalent_to($other); + return if ! defined $are_equivalent || $are_equivalent; + + my $addr = do { no overloading; pack 'J', $self; }; + my $current_leader = ($related) ? $parent{$addr} : $leader{$addr}; + + if ($related && + ! $other->perl_extension + && ! $current_leader->perl_extension) + { + Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); + $related = 0; + } + + my $leader = do { no overloading; pack 'J', $current_leader; }; + my $other_addr = do { no overloading; pack 'J', $other; }; + + # Any tables that are equivalent to or children of this table must now + # instead be equivalent to or (children) to the new leader (parent), + # still equivalent. The equivalency includes their matches_all info, + # and for related tables, their status + # All related tables are of necessity equivalent, but the converse + # isn't necessarily true + my $status = $other->status; + my $status_info = $other->status_info; + my $matches_all = $matches_all{other_addr}; + foreach my $table ($current_leader, @{$equivalents{$leader}}) { + next if $table == $other; + trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace; + + my $table_addr = do { no overloading; pack 'J', $table; }; + $leader{$table_addr} = $other; + $matches_all{$table_addr} = $matches_all; + $self->_set_range_list($other->_range_list); + push @{$equivalents{$other_addr}}, $table; + if ($related) { + $parent{$table_addr} = $other; + push @{$children{$other_addr}}, $table; + $table->set_status($status, $status_info); + } + } + + # Now that we've declared these to be equivalent, any changes to one + # of the tables would invalidate that equivalency. + $self->lock; + $other->lock; + return; + } + + sub add_range { # Add a range to the list for this table. + my $self = shift; + # Rest of parameters passed on + + return if $self->carp_if_locked; + return $self->_range_list->add_range(@_); + } + + sub pre_body { # Does nothing for match tables. + return + } + + sub append_to_body { # Does nothing for match tables. + return + } + + sub write { + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + return $self->SUPER::write(2); # 2 tab stops + } + + sub set_final_comment { + # This creates a comment for the file that is to hold the match table + # $self. It is somewhat convoluted to make the English read nicely, + # but, heh, it's just a comment. + # This should be called only with the leader match table of all the + # ones that share the same file. It lists all such tables, ordered so + # that related ones are together. + + my $leader = shift; # Should only be called on the leader table of + # an equivalent group + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $leader; }; + + if ($leader{$addr} != $leader) { + Carp::my_carp_bug(<count; + my $string_count = main::clarify_number($count); + + my $loose_count = 0; # how many aliases loosely matched + my $compound_name = ""; # ? Are any names compound?, and if so, an + # example + my $properties_with_compound_names = 0; # count of these + + + my %flags; # The status flags used in the file + my $total_entries = 0; # number of entries written in the comment + my $matches_comment = ""; # The portion of the comment about the + # \p{}'s + my @global_comments; # List of all the tables' comments that are + # there before this routine was called. + + # Get list of all the parent tables that are equivalent to this one + # (including itself). + my @parents = grep { $parent{main::objaddr $_} == $_ } + main::uniques($leader, @{$equivalents{$addr}}); + my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated + # tables + + for my $parent (@parents) { + + my $property = $parent->property; + + # Special case 'N' tables in properties with two match tables when + # the other is a 'Y' one. These are likely to be binary tables, + # but not necessarily. In either case, \P{} will match the + # complement of \p{}, and so if something is a synonym of \p, the + # complement of that something will be the synonym of \P. This + # would be true of any property with just two match tables, not + # just those whose values are Y and N; but that would require a + # little extra work, and there are none such so far in Unicode. + my $perl_p = 'p'; # which is it? \p{} or \P{} + my @yes_perl_synonyms; # list of any synonyms for the 'Y' table + + if (scalar $property->tables == 2 + && $parent == $property->table('N') + && defined (my $yes = $property->table('Y'))) + { + my $yes_addr = do { no overloading; pack 'J', $yes; }; + @yes_perl_synonyms + = grep { $_->property == $perl } + main::uniques($yes, + $parent{$yes_addr}, + $parent{$yes_addr}->children); + + # But these synonyms are \P{} ,not \p{} + $perl_p = 'P'; + } + + my @description; # Will hold the table description + my @note; # Will hold the table notes. + my @conflicting; # Will hold the table conflicts. + + # Look at the parent, any yes synonyms, and all the children + my $parent_addr = do { no overloading; pack 'J', $parent; }; + for my $table ($parent, + @yes_perl_synonyms, + @{$children{$parent_addr}}) + { + my $table_addr = do { no overloading; pack 'J', $table; }; + my $table_property = $table->property; + + # Tables are separated by a blank line to create a grouping. + $matches_comment .= "\n" if $matches_comment; + + # The table is named based on the property and value + # combination it is for, like script=greek. But there may be + # a number of synonyms for each side, like 'sc' for 'script', + # and 'grek' for 'greek'. Any combination of these is a valid + # name for this table. In this case, there are three more, + # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than + # listing all possible combinations in the comment, we make + # sure that each synonym occurs at least once, and add + # commentary that the other combinations are possible. + my @property_aliases = $table_property->aliases; + my @table_aliases = $table->aliases; + + Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases; + + # The alias lists above are already ordered in the order we + # want to output them. To ensure that each synonym is listed, + # we must use the max of the two numbers. + my $listed_combos = main::max(scalar @table_aliases, + scalar @property_aliases); + trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG; + + my $property_had_compound_name = 0; + + for my $i (0 .. $listed_combos - 1) { + $total_entries++; + + # The current alias for the property is the next one on + # the list, or if beyond the end, start over. Similarly + # for the table (\p{prop=table}) + my $property_alias = $property_aliases + [$i % @property_aliases]->name; + my $table_alias_object = $table_aliases + [$i % @table_aliases]; + my $table_alias = $table_alias_object->name; + my $loose_match = $table_alias_object->loose_match; + + if ($table_alias !~ /\D/) { # Clarify large numbers. + $table_alias = main::clarify_number($table_alias) + } + + # Add a comment for this alias combination + my $current_match_comment; + if ($table_property == $perl) { + $current_match_comment = "\\$perl_p" + . "{$table_alias}"; + } + else { + $current_match_comment + = "\\p{$property_alias=$table_alias}"; + $property_had_compound_name = 1; + } + + # Flag any abnormal status for this table. + my $flag = $property->status + || $table->status + || $table_alias_object->status; + if ($flag) { + if ($flag ne $PLACEHOLDER) { + $flags{$flag} = $status_past_participles{$flag}; + } else { + $flags{$flag} = <description; + push @note, $table->note; + push @conflicting, $table->conflicting; + + # And this for output after all the tables. + push @global_comments, $table->comment; + + # Compute an alternate compound name using the final property + # synonym and the first table synonym with a colon instead of + # the equal sign used elsewhere. + if ($property_had_compound_name) { + $properties_with_compound_names ++; + if (! $compound_name || @property_aliases > 1) { + $compound_name = $property_aliases[-1]->name + . ': ' + . $table_aliases[0]->name; + } + } + } # End of looping through all children of this table + + # Here have assembled in $matches_comment all the related tables + # to the current parent (preceded by the same info for all the + # previous parents). Put out information that applies to all of + # the current family. + if (@conflicting) { + + # But output the conflicting information now, as it applies to + # just this table. + my $conflicting = join ", ", @conflicting; + if ($conflicting) { + $matches_comment .= < 1; + $matches_comment .= "$conflicting\n"; + } + } + if (@description) { + $matches_comment .= "\n Meaning: " + . join('; ', @description) + . "\n"; + } + if (@note) { + $matches_comment .= "\n Note: " + . join("\n ", @note) + . "\n"; + } + } # End of looping through all tables + + + my $code_points; + my $match; + my $any_of_these; + if ($count == 1) { + $match = 'matches'; + $code_points = 'single code point'; + } + else { + $match = 'match'; + $code_points = "$string_count code points"; + } + + my $synonyms; + my $entries; + if ($total_entries <= 1) { + $synonyms = ""; + $entries = 'entry'; + $any_of_these = 'this' + } + else { + $synonyms = " any of the following regular expression constructs"; + $entries = 'entries'; + $any_of_these = 'any of these' + } + + my $comment = ""; + if ($has_unrelated) { + $comment .= < 1) { + $comment .= <output_range_counts) { + $comment .= <set_comment(main::join_lines($comment)); + return; + } + + # Accessors for the underlying list + for my $sub (qw( + get_valid_code_point + get_invalid_code_point + )) + { + no strict "refs"; + *$sub = sub { + use strict "refs"; + my $self = shift; + + return $self->_range_list->$sub(@_); + } + } +} # End closure for Match_Table + +package Property; + +# The Property class represents a Unicode property, or the $perl +# pseudo-property. It contains a map table initialized empty at construction +# time, and for properties accessible through regular expressions, various +# match tables, created through the add_match_table() method, and referenced +# by the table('NAME') or tables() methods, the latter returning a list of all +# of the match tables. Otherwise table operations implicitly are for the map +# table. +# +# Most of the data in the property is actually about its map table, so it +# mostly just uses that table's accessors for most methods. The two could +# have been combined into one object, but for clarity because of their +# differing semantics, they have been kept separate. It could be argued that +# the 'file' and 'directory' fields should be kept with the map table. +# +# Each property has a type. This can be set in the constructor, or in the +# set_type accessor, but mostly it is figured out by the data. Every property +# starts with unknown type, overridden by a parameter to the constructor, or +# as match tables are added, or ranges added to the map table, the data is +# inspected, and the type changed. After the table is mostly or entirely +# filled, compute_type() should be called to finalize they analysis. +# +# There are very few operations defined. One can safely remove a range from +# the map table, and property_add_or_replace_non_nulls() adds the maps from another +# table to this one, replacing any in the intersection of the two. + +sub standardize { return main::standardize($_[0]); } +sub trace { return main::trace(@_) if main::DEBUG && $to_trace } + +{ # Closure + + # This hash will contain as keys, all the aliases of all properties, and + # as values, pointers to their respective property objects. This allows + # quick look-up of a property from any of its names. + my %alias_to_property_of; + + sub dump_alias_to_property_of { + # For debugging + + print "\n", main::simple_dumper (\%alias_to_property_of), "\n"; + return; + } + + sub property_ref { + # This is a package subroutine, not called as a method. + # If the single parameter is a literal '*' it returns a list of all + # defined properties. + # Otherwise, the single parameter is a name, and it returns a pointer + # to the corresponding property object, or undef if none. + # + # Properties can have several different names. The 'standard' form of + # each of them is stored in %alias_to_property_of as they are defined. + # But it's possible that this subroutine will be called with some + # variant, so if the initial lookup fails, it is repeated with the + # standarized form of the input name. If found, besides returning the + # result, the input name is added to the list so future calls won't + # have to do the conversion again. + + my $name = shift; + + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if (! defined $name) { + Carp::my_carp_bug("Undefined input property. No action taken."); + return; + } + + return main::uniques(values %alias_to_property_of) if $name eq '*'; + + # Return cached result if have it. + my $result = $alias_to_property_of{$name}; + return $result if defined $result; + + # Convert the input to standard form. + my $standard_name = standardize($name); + + $result = $alias_to_property_of{$standard_name}; + return unless defined $result; # Don't cache undefs + + # Cache the result before returning it. + $alias_to_property_of{$name} = $result; + return $result; + } + + + main::setup_package(); + + my %map; + # A pointer to the map table object for this property + main::set_access('map', \%map); + + my %full_name; + # The property's full name. This is a duplicate of the copy kept in the + # map table, but is needed because stringify needs it during + # construction of the map table, and then would have a chicken before egg + # problem. + main::set_access('full_name', \%full_name, 'r'); + + my %table_ref; + # This hash will contain as keys, all the aliases of any match tables + # attached to this property, and as values, the pointers to their + # respective tables. This allows quick look-up of a table from any of its + # names. + main::set_access('table_ref', \%table_ref); + + my %type; + # The type of the property, $ENUM, $BINARY, etc + main::set_access('type', \%type, 'r'); + + my %file; + # The filename where the map table will go (if actually written). + # Normally defaulted, but can be overridden. + main::set_access('file', \%file, 'r', 's'); + + my %directory; + # The directory where the map table will go (if actually written). + # Normally defaulted, but can be overridden. + main::set_access('directory', \%directory, 's'); + + my %pseudo_map_type; + # This is used to affect the calculation of the map types for all the + # ranges in the table. It should be set to one of the values that signify + # to alter the calculation. + main::set_access('pseudo_map_type', \%pseudo_map_type, 'r'); + + my %has_only_code_point_maps; + # A boolean used to help in computing the type of data in the map table. + main::set_access('has_only_code_point_maps', \%has_only_code_point_maps); + + my %unique_maps; + # A list of the first few distinct mappings this property has. This is + # used to disambiguate between binary and enum property types, so don't + # have to keep more than three. + main::set_access('unique_maps', \%unique_maps); + + sub new { + # The only required parameter is the positionally first, name. All + # other parameters are key => value pairs. See the documentation just + # above for the meanings of the ones not passed directly on to the map + # table constructor. + + my $class = shift; + my $name = shift || ""; + + my $self = property_ref($name); + if (defined $self) { + my $options_string = join ", ", @_; + $options_string = ". Ignoring options $options_string" if $options_string; + Carp::my_carp("$self is already in use. Using existing one$options_string;"); + return $self; + } + + my %args = @_; + + $self = bless \do { my $anonymous_scalar }, $class; + my $addr = do { no overloading; pack 'J', $self; }; + + $directory{$addr} = delete $args{'Directory'}; + $file{$addr} = delete $args{'File'}; + $full_name{$addr} = delete $args{'Full_Name'} || $name; + $type{$addr} = delete $args{'Type'} || $UNKNOWN; + $pseudo_map_type{$addr} = delete $args{'Map_Type'}; + # Rest of parameters passed on. + + $has_only_code_point_maps{$addr} = 1; + $table_ref{$addr} = { }; + $unique_maps{$addr} = { }; + + $map{$addr} = Map_Table->new($name, + Full_Name => $full_name{$addr}, + _Alias_Hash => \%alias_to_property_of, + _Property => $self, + %args); + return $self; + } + + # See this program's beginning comment block about overloading the copy + # constructor. Few operations are defined on properties, but a couple are + # useful. It is safe to take the inverse of a property, and to remove a + # single code point from it. + use overload + fallback => 0, + qw("") => "_operator_stringify", + "." => \&main::_operator_dot, + '==' => \&main::_operator_equal, + '!=' => \&main::_operator_not_equal, + '=' => sub { return shift }, + '-=' => "_minus_and_equal", + ; + + sub _operator_stringify { + return "Property '" . shift->full_name . "'"; + } + + sub _minus_and_equal { + # Remove a single code point from the map table of a property. + + my $self = shift; + my $other = shift; + my $reversed = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if (ref $other) { + Carp::my_carp_bug("Can't cope with a " + . ref($other) + . " argument to '-='. Subtraction ignored."); + return $self; + } + elsif ($reversed) { # Shouldnt happen in a -=, but just in case + Carp::my_carp_bug("Can't cope with a " + . __PACKAGE__ + . " being the first parameter in a '-='. Subtraction ignored."); + return $self; + } + else { + no overloading; + $map{pack 'J', $self}->delete_range($other, $other); + } + return $self; + } + + sub add_match_table { + # Add a new match table for this property, with name given by the + # parameter. It returns a pointer to the table. + + my $self = shift; + my $name = shift; + my %args = @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + my $table = $table_ref{$addr}{$name}; + my $standard_name = main::standardize($name); + if (defined $table + || (defined ($table = $table_ref{$addr}{$standard_name}))) + { + Carp::my_carp("Table '$name' in $self is already in use. Using existing one"); + $table_ref{$addr}{$name} = $table; + return $table; + } + else { + + # See if this is a perl extension, if not passed in. + my $perl_extension = delete $args{'Perl_Extension'}; + $perl_extension + = $self->perl_extension if ! defined $perl_extension; + + $table = Match_Table->new( + Name => $name, + Perl_Extension => $perl_extension, + _Alias_Hash => $table_ref{$addr}, + _Property => $self, + + # gets property's status by default + Status => $self->status, + _Status_Info => $self->status_info, + %args, + Internal_Only_Warning => 1); # Override any + # input param + return unless defined $table; + } + + # Save the names for quick look up + $table_ref{$addr}{$standard_name} = $table; + $table_ref{$addr}{$name} = $table; + + # Perhaps we can figure out the type of this property based on the + # fact of adding this match table. First, string properties don't + # have match tables; second, a binary property can't have 3 match + # tables + if ($type{$addr} == $UNKNOWN) { + $type{$addr} = $NON_STRING; + } + elsif ($type{$addr} == $STRING) { + Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); + $type{$addr} = $NON_STRING; + } + elsif ($type{$addr} != $ENUM) { + if (scalar main::uniques(values %{$table_ref{$addr}}) > 2 + && $type{$addr} == $BINARY) + { + Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News."); + $type{$addr} = $ENUM; + } + } + + return $table; + } + + sub table { + # Return a pointer to the match table (with name given by the + # parameter) associated with this property; undef if none. + + my $self = shift; + my $name = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name}; + + # If quick look-up failed, try again using the standard form of the + # input name. If that succeeds, cache the result before returning so + # won't have to standardize this input name again. + my $standard_name = main::standardize($name); + return unless defined $table_ref{$addr}{$standard_name}; + + $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name}; + return $table_ref{$addr}{$name}; + } + + sub tables { + # Return a list of pointers to all the match tables attached to this + # property + + no overloading; + return main::uniques(values %{$table_ref{pack 'J', shift}}); + } + + sub directory { + # Returns the directory the map table for this property should be + # output in. If a specific directory has been specified, that has + # priority; 'undef' is returned if the type isn't defined; + # or $map_directory for everything else. + + my $addr = do { no overloading; pack 'J', shift; }; + + return $directory{$addr} if defined $directory{$addr}; + return undef if $type{$addr} == $UNKNOWN; + return $map_directory; + } + + sub swash_name { + # Return the name that is used to both: + # 1) Name the file that the map table is written to. + # 2) The name of swash related stuff inside that file. + # The reason for this is that the Perl core historically has used + # certain names that aren't the same as the Unicode property names. + # To continue using these, $file is hard-coded in this file for those, + # but otherwise the standard name is used. This is different from the + # external_name, so that the rest of the files, like in lib can use + # the standard name always, without regard to historical precedent. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return $file{$addr} if defined $file{$addr}; + return $map{$addr}->external_name; + } + + sub to_create_match_tables { + # Returns a boolean as to whether or not match tables should be + # created for this property. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # The whole point of this pseudo property is match tables. + return 1 if $self == $perl; + + my $addr = do { no overloading; pack 'J', $self; }; + + # Don't generate tables of code points that match the property values + # of a string property. Such a list would most likely have many + # property values, each with just one or very few code points mapping + # to it. + return 0 if $type{$addr} == $STRING; + + # Don't generate anything for unimplemented properties. + return 0 if grep { $self->complete_name eq $_ } + @unimplemented_properties; + # Otherwise, do. + return 1; + } + + sub property_add_or_replace_non_nulls { + # This adds the mappings in the property $other to $self. Non-null + # mappings from $other override those in $self. It essentially merges + # the two properties, with the second having priority except for null + # mappings. + + my $self = shift; + my $other = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if (! $other->isa(__PACKAGE__)) { + Carp::my_carp_bug("$other should be a " + . __PACKAGE__ + . ". Not a '" + . ref($other) + . "'. Not added;"); + return; + } + + no overloading; + return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); + } + + sub set_type { + # Set the type of the property. Mostly this is figured out by the + # data in the table. But this is used to set it explicitly. The + # reason it is not a standard accessor is that when setting a binary + # property, we need to make sure that all the true/false aliases are + # present, as they were omitted in early Unicode releases. + + my $self = shift; + my $type = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if ($type != $ENUM && $type != $BINARY && $type != $STRING) { + Carp::my_carp("Unrecognized type '$type'. Type not set"); + return; + } + + { no overloading; $type{pack 'J', $self} = $type; } + return if $type != $BINARY; + + my $yes = $self->table('Y'); + $yes = $self->table('Yes') if ! defined $yes; + $yes = $self->add_match_table('Y') if ! defined $yes; + $yes->add_alias('Yes'); + $yes->add_alias('T'); + $yes->add_alias('True'); + + my $no = $self->table('N'); + $no = $self->table('No') if ! defined $no; + $no = $self->add_match_table('N') if ! defined $no; + $no->add_alias('No'); + $no->add_alias('F'); + $no->add_alias('False'); + return; + } + + sub add_map { + # Add a map to the property's map table. This also keeps + # track of the maps so that the property type can be determined from + # its data. + + my $self = shift; + my $start = shift; # First code point in range + my $end = shift; # Final code point in range + my $map = shift; # What the range maps to. + # Rest of parameters passed on. + + my $addr = do { no overloading; pack 'J', $self; }; + + # If haven't the type of the property, gather information to figure it + # out. + if ($type{$addr} == $UNKNOWN) { + + # If the map contains an interior blank or dash, or most other + # nonword characters, it will be a string property. This + # heuristic may actually miss some string properties. If so, they + # may need to have explicit set_types called for them. This + # happens in the Unihan properties. + if ($map =~ / (?<= . ) [ -] (?= . ) /x + || $map =~ / [^\w.\/\ -] /x) + { + $self->set_type($STRING); + + # $unique_maps is used for disambiguating between ENUM and + # BINARY later; since we know the property is not going to be + # one of those, no point in keeping the data around + undef $unique_maps{$addr}; + } + else { + + # Not necessarily a string. The final decision has to be + # deferred until all the data are in. We keep track of if all + # the values are code points for that eventual decision. + $has_only_code_point_maps{$addr} &= + $map =~ / ^ $code_point_re $/x; + + # For the purposes of disambiguating between binary and other + # enumerations at the end, we keep track of the first three + # distinct property values. Once we get to three, we know + # it's not going to be binary, so no need to track more. + if (scalar keys %{$unique_maps{$addr}} < 3) { + $unique_maps{$addr}{main::standardize($map)} = 1; + } + } + } + + # Add the mapping by calling our map table's method + return $map{$addr}->add_map($start, $end, $map, @_); + } + + sub compute_type { + # Compute the type of the property: $ENUM, $STRING, or $BINARY. This + # should be called after the property is mostly filled with its maps. + # We have been keeping track of what the property values have been, + # and now have the necessary information to figure out the type. + + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + my $type = $type{$addr}; + + # If already have figured these out, no need to do so again, but we do + # a double check on ENUMS to make sure that a string property hasn't + # improperly been classified as an ENUM, so continue on with those. + return if $type == $STRING || $type == $BINARY; + + # If every map is to a code point, is a string property. + if ($type == $UNKNOWN + && ($has_only_code_point_maps{$addr} + || (defined $map{$addr}->default_map + && $map{$addr}->default_map eq ""))) + { + $self->set_type($STRING); + } + else { + + # Otherwise, it is to some sort of enumeration. (The case where + # it is a Unicode miscellaneous property, and treated like a + # string in this program is handled in add_map()). Distinguish + # between binary and some other enumeration type. Of course, if + # there are more than two values, it's not binary. But more + # subtle is the test that the default mapping is defined means it + # isn't binary. This in fact may change in the future if Unicode + # changes the way its data is structured. But so far, no binary + # properties ever have @missing lines for them, so the default map + # isn't defined for them. The few properties that are two-valued + # and aren't considered binary have the default map defined + # starting in Unicode 5.0, when the @missing lines appeared; and + # this program has special code to put in a default map for them + # for earlier than 5.0 releases. + if ($type == $ENUM + || scalar keys %{$unique_maps{$addr}} > 2 + || defined $self->default_map) + { + my $tables = $self->tables; + my $count = $self->count; + if ($verbosity && $count > 500 && $tables/$count > .1) { + Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n"); + } + $self->set_type($ENUM); + } + else { + $self->set_type($BINARY); + } + } + undef $unique_maps{$addr}; # Garbage collect + return; + } + + # Most of the accessors for a property actually apply to its map table. + # Setup up accessor functions for those, referring to %map + for my $sub (qw( + add_alias + add_anomalous_entry + add_comment + add_conflicting + add_description + add_duplicate + add_note + aliases + comment + complete_name + core_access + count + default_map + delete_range + description + each_range + external_name + file_path + format + initialize + inverse_list + is_empty + name + note + perl_extension + property + range_count + ranges + range_size_1 + reset_each_range + set_comment + set_core_access + set_default_map + set_file_path + set_final_comment + set_range_size_1 + set_status + set_to_output_map + short_name + status + status_info + to_output_map + value_of + write + )) + # 'property' above is for symmetry, so that one can take + # the property of a property and get itself, and so don't + # have to distinguish between properties and tables in + # calling code + { + no strict "refs"; + *$sub = sub { + use strict "refs"; + my $self = shift; + no overloading; + return $map{pack 'J', $self}->$sub(@_); + } + } + + +} # End closure + +package main; + +sub join_lines($) { + # Returns lines of the input joined together, so that they can be folded + # properly. + # This causes continuation lines to be joined together into one long line + # for folding. A continuation line is any line that doesn't begin with a + # space or "\b" (the latter is stripped from the output). This is so + # lines can be be in a HERE document so as to fit nicely in the terminal + # width, but be joined together in one long line, and then folded with + # indents, '#' prefixes, etc, properly handled. + # A blank separates the joined lines except if there is a break; an extra + # blank is inserted after a period ending a line. + + # Intialize the return with the first line. + my ($return, @lines) = split "\n", shift; + + # If the first line is null, it was an empty line, add the \n back in + $return = "\n" if $return eq ""; + + # Now join the remainder of the physical lines. + for my $line (@lines) { + + # An empty line means wanted a blank line, so add two \n's to get that + # effect, and go to the next line. + if (length $line == 0) { + $return .= "\n\n"; + next; + } + + # Look at the last character of what we have so far. + my $previous_char = substr($return, -1, 1); + + # And at the next char to be output. + my $next_char = substr($line, 0, 1); + + if ($previous_char ne "\n") { + + # Here didn't end wth a nl. If the next char a blank or \b, it + # means that here there is a break anyway. So add a nl to the + # output. + if ($next_char eq " " || $next_char eq "\b") { + $previous_char = "\n"; + $return .= $previous_char; + } + + # Add an extra space after periods. + $return .= " " if $previous_char eq '.'; + } + + # Here $previous_char is still the latest character to be output. If + # it isn't a nl, it means that the next line is to be a continuation + # line, with a blank inserted between them. + $return .= " " if $previous_char ne "\n"; + + # Get rid of any \b + substr($line, 0, 1) = "" if $next_char eq "\b"; + + # And append this next line. + $return .= $line; + } + + return $return; +} + +sub simple_fold($;$$$) { + # Returns a string of the input (string or an array of strings) folded + # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus + # a \n + # This is tailored for the kind of text written by this program, + # especially the pod file, which can have very long names with + # underscores in the middle, or words like AbcDefgHij.... We allow + # breaking in the middle of such constructs if the line won't fit + # otherwise. The break in such cases will come either just after an + # underscore, or just before one of the Capital letters. + + local $to_trace = 0 if main::DEBUG; + + my $line = shift; + my $prefix = shift; # Optional string to prepend to each output + # line + $prefix = "" unless defined $prefix; + + my $hanging_indent = shift; # Optional number of spaces to indent + # continuation lines + $hanging_indent = 0 unless $hanging_indent; + + my $right_margin = shift; # Optional number of spaces to narrow the + # total width by. + $right_margin = 0 unless defined $right_margin; + + # Call carp with the 'nofold' option to avoid it from trying to call us + # recursively + Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_; + + # The space available doesn't include what's automatically prepended + # to each line, or what's reserved on the right. + my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin; + # XXX Instead of using the 'nofold' perhaps better to look up the stack + + if (DEBUG && $hanging_indent >= $max) { + Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold'); + $hanging_indent = 0; + } + + # First, split into the current physical lines. + my @line; + if (ref $line) { # Better be an array, because not bothering to + # test + foreach my $line (@{$line}) { + push @line, split /\n/, $line; + } + } + else { + @line = split /\n/, $line; + } + + #local $to_trace = 1 if main::DEBUG; + trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace; + + # Look at each current physical line. + for (my $i = 0; $i < @line; $i++) { + Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/; + #local $to_trace = 1 if main::DEBUG; + trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace; + + # Remove prefix, because will be added back anyway, don't want + # doubled prefix + $line[$i] =~ s/^$prefix//; + + # Remove trailing space + $line[$i] =~ s/\s+\Z//; + + # If the line is too long, fold it. + if (length $line[$i] > $max) { + my $remainder; + + # Here needs to fold. Save the leading space in the line for + # later. + $line[$i] =~ /^ ( \s* )/x; + my $leading_space = $1; + trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace; + + # If character at final permissible position is white space, + # fold there, which will delete that white space + if (substr($line[$i], $max - 1, 1) =~ /\s/) { + $remainder = substr($line[$i], $max); + $line[$i] = substr($line[$i], 0, $max - 1); + } + else { + + # Otherwise fold at an acceptable break char closest to + # the max length. Look at just the maximal initial + # segment of the line + my $segment = substr($line[$i], 0, $max - 1); + if ($segment =~ + /^ ( .{$hanging_indent} # Don't look before the + # indent. + \ * # Don't look in leading + # blanks past the indent + [^ ] .* # Find the right-most + (?: # acceptable break: + [ \s = ] # space or equal + | - (?! [.0-9] ) # or non-unary minus. + ) # $1 includes the character + )/x) + { + # Split into the initial part that fits, and remaining + # part of the input + $remainder = substr($line[$i], length $1); + $line[$i] = $1; + trace $line[$i] if DEBUG && $to_trace; + trace $remainder if DEBUG && $to_trace; + } + + # If didn't find a good breaking spot, see if there is a + # not-so-good breaking spot. These are just after + # underscores or where the case changes from lower to + # upper. Use \a as a soft hyphen, but give up + # and don't break the line if there is actually a \a + # already in the input. We use an ascii character for the + # soft-hyphen to avoid any attempt by miniperl to try to + # access the files that this program is creating. + elsif ($segment !~ /\a/ + && ($segment =~ s/_/_\a/g + || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg)) + { + # Here were able to find at least one place to insert + # our substitute soft hyphen. Find the right-most one + # and replace it by a real hyphen. + trace $segment if DEBUG && $to_trace; + substr($segment, + rindex($segment, "\a"), + 1) = '-'; + + # Then remove the soft hyphen substitutes. + $segment =~ s/\a//g; + trace $segment if DEBUG && $to_trace; + + # And split into the initial part that fits, and + # remainder of the line + my $pos = rindex($segment, '-'); + $remainder = substr($line[$i], $pos); + trace $remainder if DEBUG && $to_trace; + $line[$i] = substr($segment, 0, $pos + 1); + } + } + + # Here we know if we can fold or not. If we can, $remainder + # is what remains to be processed in the next iteration. + if (defined $remainder) { + trace "folded='$line[$i]'" if main::DEBUG && $to_trace; + + # Insert the folded remainder of the line as a new element + # of the array. (It may still be too long, but we will + # deal with that next time through the loop.) Omit any + # leading space in the remainder. + $remainder =~ s/^\s+//; + trace "remainder='$remainder'" if main::DEBUG && $to_trace; + + # But then indent by whichever is larger of: + # 1) the leading space on the input line; + # 2) the hanging indent. + # This preserves indentation in the original line. + my $lead = ($leading_space) + ? length $leading_space + : $hanging_indent; + $lead = max($lead, $hanging_indent); + splice @line, $i+1, 0, (" " x $lead) . $remainder; + } + } + + # Ready to output the line. Get rid of any trailing space + # And prefix by the required $prefix passed in. + $line[$i] =~ s/\s+$//; + $line[$i] = "$prefix$line[$i]\n"; + } # End of looping through all the lines. + + return join "", @line; +} + +sub property_ref { # Returns a reference to a property object. + return Property::property_ref(@_); +} + +sub force_unlink ($) { + my $filename = shift; + return unless file_exists($filename); + return if CORE::unlink($filename); + + # We might need write permission + chmod 0777, $filename; + CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!"); + return; +} + +sub write ($\@) { + # Given a filename and a reference to an array of lines, write the lines + # to the file + # Filename can be given as an arrayref of directory names + + my $file = shift; + my $lines_ref = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if (! defined $lines_ref) { + Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;"); + return; + } + + # Get into a single string if an array, and get rid of, in Unix terms, any + # leading '.' + $file= File::Spec->join(@$file) if ref $file eq 'ARRAY'; + $file = File::Spec->canonpath($file); + + # If has directories, make sure that they all exist + (undef, my $directories, undef) = File::Spec->splitpath($file); + File::Path::mkpath($directories) if $directories && ! -d $directories; + + push @files_actually_output, $file; + + unless (@$lines_ref) { + Carp::my_carp("Output file '$file' is empty; writing it anyway;"); + } + + force_unlink ($file); + + my $OUT; + if (not open $OUT, ">", $file) { + Carp::my_carp("can't open $file for output. Skipping this file: $!"); + return; + } + + print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!"); + close $OUT or die Carp::my_carp("close '$file' failed: $!"); + + print "$file written.\n" if $verbosity >= $VERBOSE; + + return; +} + + +sub Standardize($) { + # This converts the input name string into a standardized equivalent to + # use internally. + + my $name = shift; + unless (defined $name) { + Carp::my_carp_bug("Standardize() called with undef. Returning undef."); + return; + } + + # Remove any leading or trailing white space + $name =~ s/^\s+//g; + $name =~ s/\s+$//g; + + # Convert interior white space and hypens into underscores. + $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; + + # Capitalize the letter following an underscore, and convert a sequence of + # multiple underscores to a single one + $name =~ s/ (?<= .) _+ (.) /_\u$1/xg; + + # And capitalize the first letter, but not for the special cjk ones. + $name = ucfirst($name) unless $name =~ /^k[A-Z]/; + return $name; +} + +sub standardize ($) { + # Returns a lower-cased standardized name, without underscores. This form + # is chosen so that it can distinguish between any real versus superficial + # Unicode name differences. It relies on the fact that Unicode doesn't + # have interior underscores, white space, nor dashes in any + # stricter-matched name. It should not be used on Unicode code point + # names (the Name property), as they mostly, but not always follow these + # rules. + + my $name = Standardize(shift); + return if !defined $name; + + $name =~ s/ (?<= .) _ (?= . ) //xg; + return lc $name; +} + +{ # Closure + + my $indent_increment = " " x 2; + my %already_output; + + $main::simple_dumper_nesting = 0; + + sub simple_dumper { + # Like Simple Data::Dumper. Good enough for our needs. We can't use + # the real thing as we have to run under miniperl. + + # It is designed so that on input it is at the beginning of a line, + # and the final thing output in any call is a trailing ",\n". + + my $item = shift; + my $indent = shift; + $indent = "" if ! defined $indent; + + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # nesting level is localized, so that as the call stack pops, it goes + # back to the prior value. + local $main::simple_dumper_nesting = $main::simple_dumper_nesting; + undef %already_output if $main::simple_dumper_nesting == 0; + $main::simple_dumper_nesting++; + #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n"; + + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Determine the indent for recursive calls. + my $next_indent = $indent . $indent_increment; + + my $output; + if (! ref $item) { + + # Dump of scalar: just output it in quotes if not a number. To do + # so we must escape certain characters, and therefore need to + # operate on a copy to avoid changing the original + my $copy = $item; + $copy = $UNDEF unless defined $copy; + + # Quote non-numbers (numbers also have optional leading '-' and + # fractions) + if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) { + + # Escape apostrophe and backslash + $copy =~ s/ ( ['\\] ) /\\$1/xg; + $copy = "'$copy'"; + } + $output = "$indent$copy,\n"; + } + else { + + # Keep track of cycles in the input, and refuse to infinitely loop + my $addr = do { no overloading; pack 'J', $item; }; + if (defined $already_output{$addr}) { + return "${indent}ALREADY OUTPUT: $item\n"; + } + $already_output{$addr} = $item; + + if (ref $item eq 'ARRAY') { + my $using_brackets; + $output = $indent; + if ($main::simple_dumper_nesting > 1) { + $output .= '['; + $using_brackets = 1; + } + else { + $using_brackets = 0; + } + + # If the array is empty, put the closing bracket on the same + # line. Otherwise, recursively add each array element + if (@$item == 0) { + $output .= " "; + } + else { + $output .= "\n"; + for (my $i = 0; $i < @$item; $i++) { + + # Indent array elements one level + $output .= &simple_dumper($item->[$i], $next_indent); + $output =~ s/\n$//; # Remove trailing nl so as to + $output .= " # [$i]\n"; # add a comment giving the + # array index + } + $output .= $indent; # Indent closing ']' to orig level + } + $output .= ']' if $using_brackets; + $output .= ",\n"; + } + elsif (ref $item eq 'HASH') { + my $is_first_line; + my $using_braces; + my $body_indent; + + # No surrounding braces at top level + $output .= $indent; + if ($main::simple_dumper_nesting > 1) { + $output .= "{\n"; + $is_first_line = 0; + $body_indent = $next_indent; + $next_indent .= $indent_increment; + $using_braces = 1; + } + else { + $is_first_line = 1; + $body_indent = $indent; + $using_braces = 0; + } + + # Output hashes sorted alphabetically instead of apparently + # random. Use caseless alphabetic sort + foreach my $key (sort { lc $a cmp lc $b } keys %$item) + { + if ($is_first_line) { + $is_first_line = 0; + } + else { + $output .= "$body_indent"; + } + + # The key must be a scalar, but this recursive call quotes + # it + $output .= &simple_dumper($key); + + # And change the trailing comma and nl to the hash fat + # comma for clarity, and so the value can be on the same + # line + $output =~ s/,\n$/ => /; + + # Recursively call to get the value's dump. + my $next = &simple_dumper($item->{$key}, $next_indent); + + # If the value is all on one line, remove its indent, so + # will follow the => immediately. If it takes more than + # one line, start it on a new line. + if ($next !~ /\n.*\n/) { + $next =~ s/^ *//; + } + else { + $output .= "\n"; + } + $output .= $next; + } + + $output .= "$indent},\n" if $using_braces; + } + elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') { + $output = $indent . ref($item) . "\n"; + # XXX see if blessed + } + elsif ($item->can('dump')) { + + # By convention in this program, objects furnish a 'dump' + # method. Since not doing any output at this level, just pass + # on the input indent + $output = $item->dump($indent); + } + else { + Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping."); + } + } + return $output; + } +} + +sub dump_inside_out { + # Dump inside-out hashes in an object's state by converting them to a + # regular hash and then calling simple_dumper on that. + + my $object = shift; + my $fields_ref = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $object; }; + + my %hash; + foreach my $key (keys %$fields_ref) { + $hash{$key} = $fields_ref->{$key}{$addr}; + } + + return simple_dumper(\%hash, @_); +} + +sub _operator_dot { + # Overloaded '.' method that is common to all packages. It uses the + # package's stringify method. + + my $self = shift; + my $other = shift; + my $reversed = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + $other = "" unless defined $other; + + foreach my $which (\$self, \$other) { + next unless ref $$which; + if ($$which->can('_operator_stringify')) { + $$which = $$which->_operator_stringify; + } + else { + my $ref = ref $$which; + my $addr = do { no overloading; pack 'J', $$which; }; + $$which = "$ref ($addr)"; + } + } + return ($reversed) + ? "$other$self" + : "$self$other"; +} + +sub _operator_equal { + # Generic overloaded '==' routine. To be equal, they must be the exact + # same object + + my $self = shift; + my $other = shift; + + return 0 unless defined $other; + return 0 unless ref $other; + no overloading; + return $self == $other; +} + +sub _operator_not_equal { + my $self = shift; + my $other = shift; + + return ! _operator_equal($self, $other); +} + +sub process_PropertyAliases($) { + # This reads in the PropertyAliases.txt file, which contains almost all + # the character properties in Unicode and their equivalent aliases: + # scf ; Simple_Case_Folding ; sfc + # + # Field 0 is the preferred short name for the property. + # Field 1 is the full name. + # Any succeeding ones are other accepted names. + + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # This whole file was non-existent in early releases, so use our own + # internal one. + $file->insert_lines(get_old_property_aliases()) + if ! -e 'PropertyAliases.txt'; + + # Add any cjk properties that may have been defined. + $file->insert_lines(@cjk_properties); + + while ($file->next_line) { + + my @data = split /\s*;\s*/; + + my $full = $data[1]; + + my $this = Property->new($data[0], Full_Name => $full); + + # Start looking for more aliases after these two. + for my $i (2 .. @data - 1) { + $this->add_alias($data[$i]); + } + + } + return; +} + +sub finish_property_setup { + # Finishes setting up after PropertyAliases. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # This entry was missing from this file in earlier Unicode versions + if (-e 'Jamo.txt') { + my $jsn = property_ref('JSN'); + if (! defined $jsn) { + $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name'); + } + } + + # This entry is still missing as of 5.2, perhaps because no short name for + # it. + if (-e 'NameAliases.txt') { + my $aliases = property_ref('Name_Alias'); + if (! defined $aliases) { + $aliases = Property->new('Name_Alias'); + } + } + + # These are used so much, that we set globals for them. + $gc = property_ref('General_Category'); + $block = property_ref('Block'); + + # Perl adds this alias. + $gc->add_alias('Category'); + + # For backwards compatibility, these property files have particular names. + my $upper = property_ref('Uppercase_Mapping'); + $upper->set_core_access('uc()'); + $upper->set_file('Upper'); # This is what utf8.c calls it + + my $lower = property_ref('Lowercase_Mapping'); + $lower->set_core_access('lc()'); + $lower->set_file('Lower'); + + my $title = property_ref('Titlecase_Mapping'); + $title->set_core_access('ucfirst()'); + $title->set_file('Title'); + + my $fold = property_ref('Case_Folding'); + $fold->set_file('Fold') if defined $fold; + + # utf8.c can't currently cope with non range-size-1 for these, and even if + # it were changed to do so, someone else may be using them, expecting the + # old style + foreach my $property (qw { + Case_Folding + Lowercase_Mapping + Titlecase_Mapping + Uppercase_Mapping + }) + { + property_ref($property)->set_range_size_1(1); + } + + # These two properties aren't actually used in the core, but unfortunately + # the names just above that are in the core interfere with these, so + # choose different names. These aren't a problem unless the map tables + # for these files get written out. + my $lowercase = property_ref('Lowercase'); + $lowercase->set_file('IsLower') if defined $lowercase; + my $uppercase = property_ref('Uppercase'); + $uppercase->set_file('IsUpper') if defined $uppercase; + + # Set up the hard-coded default mappings, but only on properties defined + # for this release + foreach my $property (keys %default_mapping) { + my $property_object = property_ref($property); + next if ! defined $property_object; + my $default_map = $default_mapping{$property}; + $property_object->set_default_map($default_map); + + # A map of implies the property is string. + if ($property_object->type == $UNKNOWN + && $default_map eq $CODE_POINT) + { + $property_object->set_type($STRING); + } + } + + # The following use the Multi_Default class to create objects for + # defaults. + + # Bidi class has a complicated default, but the derived file takes care of + # the complications, leaving just 'L'. + if (file_exists("${EXTRACTED}DBidiClass.txt")) { + property_ref('Bidi_Class')->set_default_map('L'); + } + else { + my $default; + + # The derived file was introduced in 3.1.1. The values below are + # taken from table 3-8, TUS 3.0 + my $default_R = + 'my $default = Range_List->new; + $default->add_range(0x0590, 0x05FF); + $default->add_range(0xFB1D, 0xFB4F);' + ; + + # The defaults apply only to unassigned characters + $default_R .= '$gc->table("Unassigned") & $default;'; + + if ($v_version lt v3.0.0) { + $default = Multi_Default->new(R => $default_R, 'L'); + } + else { + + # AL apparently not introduced until 3.0: TUS 2.x references are + # not on-line to check it out + my $default_AL = + 'my $default = Range_List->new; + $default->add_range(0x0600, 0x07BF); + $default->add_range(0xFB50, 0xFDFF); + $default->add_range(0xFE70, 0xFEFF);' + ; + + # Non-character code points introduced in this release; aren't AL + if ($v_version ge 3.1.0) { + $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);'; + } + $default_AL .= '$gc->table("Unassigned") & $default'; + $default = Multi_Default->new(AL => $default_AL, + R => $default_R, + 'L'); + } + property_ref('Bidi_Class')->set_default_map($default); + } + + # Joining type has a complicated default, but the derived file takes care + # of the complications, leaving just 'U' (or Non_Joining), except the file + # is bad in 3.1.0 + if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') { + if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) { + property_ref('Joining_Type')->set_default_map('Non_Joining'); + } + else { + + # Otherwise, there are not one, but two possibilities for the + # missing defaults: T and U. + # The missing defaults that evaluate to T are given by: + # T = Mn + Cf - ZWNJ - ZWJ + # where Mn and Cf are the general category values. In other words, + # any non-spacing mark or any format control character, except + # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO + # WIDTH JOINER (joining type C). + my $default = Multi_Default->new( + 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D', + 'Non_Joining'); + property_ref('Joining_Type')->set_default_map($default); + } + } + + # Line break has a complicated default in early releases. It is 'Unknown' + # for non-assigned code points; 'AL' for assigned. + if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') { + my $lb = property_ref('Line_Break'); + if ($v_version gt 3.2.0) { + $lb->set_default_map('Unknown'); + } + else { + my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")', + 'AL'); + $lb->set_default_map($default); + } + + # If has the URS property, make sure that the standard aliases are in + # it, since not in the input tables in some versions. + my $urs = property_ref('Unicode_Radical_Stroke'); + if (defined $urs) { + $urs->add_alias('cjkRSUnicode'); + $urs->add_alias('kRSUnicode'); + } + } + return; +} + +sub get_old_property_aliases() { + # Returns what would be in PropertyAliases.txt if it existed in very old + # versions of Unicode. It was derived from the one in 3.2, and pared + # down based on the data that was actually in the older releases. + # An attempt was made to use the existence of files to mean inclusion or + # not of various aliases, but if this was not sufficient, using version + # numbers was resorted to. + + my @return; + + # These are to be used in all versions (though some are constructed by + # this program if missing) + push @return, split /\n/, <<'END'; +bc ; Bidi_Class +Bidi_M ; Bidi_Mirrored +cf ; Case_Folding +ccc ; Canonical_Combining_Class +dm ; Decomposition_Mapping +dt ; Decomposition_Type +gc ; General_Category +isc ; ISO_Comment +lc ; Lowercase_Mapping +na ; Name +na1 ; Unicode_1_Name +nt ; Numeric_Type +nv ; Numeric_Value +sfc ; Simple_Case_Folding +slc ; Simple_Lowercase_Mapping +stc ; Simple_Titlecase_Mapping +suc ; Simple_Uppercase_Mapping +tc ; Titlecase_Mapping +uc ; Uppercase_Mapping +END + + if (-e 'Blocks.txt') { + push @return, "blk ; Block\n"; + } + if (-e 'ArabicShaping.txt') { + push @return, split /\n/, <<'END'; +jg ; Joining_Group +jt ; Joining_Type +END + } + if (-e 'PropList.txt') { + + # This first set is in the original old-style proplist. + push @return, split /\n/, <<'END'; +Alpha ; Alphabetic +Bidi_C ; Bidi_Control +Dash ; Dash +Dia ; Diacritic +Ext ; Extender +Hex ; Hex_Digit +Hyphen ; Hyphen +IDC ; ID_Continue +Ideo ; Ideographic +Join_C ; Join_Control +Math ; Math +QMark ; Quotation_Mark +Term ; Terminal_Punctuation +WSpace ; White_Space +END + # The next sets were added later + if ($v_version ge v3.0.0) { + push @return, split /\n/, <<'END'; +Upper ; Uppercase +Lower ; Lowercase +END + } + if ($v_version ge v3.0.1) { + push @return, split /\n/, <<'END'; +NChar ; Noncharacter_Code_Point +END + } + # The next sets were added in the new-style + if ($v_version ge v3.1.0) { + push @return, split /\n/, <<'END'; +OAlpha ; Other_Alphabetic +OLower ; Other_Lowercase +OMath ; Other_Math +OUpper ; Other_Uppercase +END + } + if ($v_version ge v3.1.1) { + push @return, "AHex ; ASCII_Hex_Digit\n"; + } + } + if (-e 'EastAsianWidth.txt') { + push @return, "ea ; East_Asian_Width\n"; + } + if (-e 'CompositionExclusions.txt') { + push @return, "CE ; Composition_Exclusion\n"; + } + if (-e 'LineBreak.txt') { + push @return, "lb ; Line_Break\n"; + } + if (-e 'BidiMirroring.txt') { + push @return, "bmg ; Bidi_Mirroring_Glyph\n"; + } + if (-e 'Scripts.txt') { + push @return, "sc ; Script\n"; + } + if (-e 'DNormalizationProps.txt') { + push @return, split /\n/, <<'END'; +Comp_Ex ; Full_Composition_Exclusion +FC_NFKC ; FC_NFKC_Closure +NFC_QC ; NFC_Quick_Check +NFD_QC ; NFD_Quick_Check +NFKC_QC ; NFKC_Quick_Check +NFKD_QC ; NFKD_Quick_Check +XO_NFC ; Expands_On_NFC +XO_NFD ; Expands_On_NFD +XO_NFKC ; Expands_On_NFKC +XO_NFKD ; Expands_On_NFKD +END + } + if (-e 'DCoreProperties.txt') { + push @return, split /\n/, <<'END'; +IDS ; ID_Start +XIDC ; XID_Continue +XIDS ; XID_Start +END + # These can also appear in some versions of PropList.txt + push @return, "Lower ; Lowercase\n" + unless grep { $_ =~ /^Lower\b/} @return; + push @return, "Upper ; Uppercase\n" + unless grep { $_ =~ /^Upper\b/} @return; + } + + # This flag requires the DAge.txt file to be copied into the directory. + if (DEBUG && $compare_versions) { + push @return, 'age ; Age'; + } + + return @return; +} + +sub process_PropValueAliases { + # This file contains values that properties look like: + # bc ; AL ; Arabic_Letter + # blk; n/a ; Greek_And_Coptic ; Greek + # + # Field 0 is the property. + # Field 1 is the short name of a property value or 'n/a' if no + # short name exists; + # Field 2 is the full property value name; + # Any other fields are more synonyms for the property value. + # Purely numeric property values are omitted from the file; as are some + # others, fewer and fewer in later releases + + # Entries for the ccc property have an extra field before the + # abbreviation: + # ccc; 0; NR ; Not_Reordered + # It is the numeric value that the names are synonyms for. + + # There are comment entries for values missing from this file: + # # @missing: 0000..10FFFF; ISO_Comment; + # # @missing: 0000..10FFFF; Lowercase_Mapping; + + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # This whole file was non-existent in early releases, so use our own + # internal one if necessary. + if (! -e 'PropValueAliases.txt') { + $file->insert_lines(get_old_property_value_aliases()); + } + + # Add any explicit cjk values + $file->insert_lines(@cjk_property_values); + + # This line is used only for testing the code that checks for name + # conflicts. There is a script Inherited, and when this line is executed + # it causes there to be a name conflict with the 'Inherited' that this + # program generates for this block property value + #$file->insert_lines('blk; n/a; Herited'); + + + # Process each line of the file ... + while ($file->next_line) { + + my ($property, @data) = split /\s*;\s*/; + + # The full name for the ccc property value is in field 2 of the + # remaining ones; field 1 for all other properties. Swap ccc fields 1 + # and 2. (Rightmost splice removes field 2, returning it; left splice + # inserts that into field 1, thus shifting former field 1 to field 2.) + splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc'; + + # If there is no short name, use the full one in element 1 + $data[0] = $data[1] if $data[0] eq "n/a"; + + # Earlier releases had the pseudo property 'qc' that should expand to + # the ones that replace it below. + if ($property eq 'qc') { + if (lc $data[0] eq 'y') { + $file->insert_lines('NFC_QC; Y ; Yes', + 'NFD_QC; Y ; Yes', + 'NFKC_QC; Y ; Yes', + 'NFKD_QC; Y ; Yes', + ); + } + elsif (lc $data[0] eq 'n') { + $file->insert_lines('NFC_QC; N ; No', + 'NFD_QC; N ; No', + 'NFKC_QC; N ; No', + 'NFKD_QC; N ; No', + ); + } + elsif (lc $data[0] eq 'm') { + $file->insert_lines('NFC_QC; M ; Maybe', + 'NFKC_QC; M ; Maybe', + ); + } + else { + $file->carp_bad_line("qc followed by unexpected '$data[0]"); + } + next; + } + + # The first field is the short name, 2nd is the full one. + my $property_object = property_ref($property); + my $table = $property_object->add_match_table($data[0], + Full_Name => $data[1]); + + # Start looking for more aliases after these two. + for my $i (2 .. @data - 1) { + $table->add_alias($data[$i]); + } + } # End of looping through the file + + # As noted in the comments early in the program, it generates tables for + # the default values for all releases, even those for which the concept + # didn't exist at the time. Here we add those if missing. + my $age = property_ref('age'); + if (defined $age && ! defined $age->table('Unassigned')) { + $age->add_match_table('Unassigned'); + } + $block->add_match_table('No_Block') if -e 'Blocks.txt' + && ! defined $block->table('No_Block'); + + + # Now set the default mappings of the properties from the file. This is + # done after the loop because a number of properties have only @missings + # entries in the file, and may not show up until the end. + my @defaults = $file->get_missings; + foreach my $default_ref (@defaults) { + my $default = $default_ref->[0]; + my $property = property_ref($default_ref->[1]); + $property->set_default_map($default); + } + return; +} + +sub get_old_property_value_aliases () { + # Returns what would be in PropValueAliases.txt if it existed in very old + # versions of Unicode. It was derived from the one in 3.2, and pared + # down. An attempt was made to use the existence of files to mean + # inclusion or not of various aliases, but if this was not sufficient, + # using version numbers was resorted to. + + my @return = split /\n/, <<'END'; +bc ; AN ; Arabic_Number +bc ; B ; Paragraph_Separator +bc ; CS ; Common_Separator +bc ; EN ; European_Number +bc ; ES ; European_Separator +bc ; ET ; European_Terminator +bc ; L ; Left_To_Right +bc ; ON ; Other_Neutral +bc ; R ; Right_To_Left +bc ; WS ; White_Space + +# The standard combining classes are very much different in v1, so only use +# ones that look right (not checked thoroughly) +ccc; 0; NR ; Not_Reordered +ccc; 1; OV ; Overlay +ccc; 7; NK ; Nukta +ccc; 8; KV ; Kana_Voicing +ccc; 9; VR ; Virama +ccc; 202; ATBL ; Attached_Below_Left +ccc; 216; ATAR ; Attached_Above_Right +ccc; 218; BL ; Below_Left +ccc; 220; B ; Below +ccc; 222; BR ; Below_Right +ccc; 224; L ; Left +ccc; 228; AL ; Above_Left +ccc; 230; A ; Above +ccc; 232; AR ; Above_Right +ccc; 234; DA ; Double_Above + +dt ; can ; canonical +dt ; enc ; circle +dt ; fin ; final +dt ; font ; font +dt ; fra ; fraction +dt ; init ; initial +dt ; iso ; isolated +dt ; med ; medial +dt ; n/a ; none +dt ; nb ; noBreak +dt ; sqr ; square +dt ; sub ; sub +dt ; sup ; super + +gc ; C ; Other # Cc | Cf | Cn | Co | Cs +gc ; Cc ; Control +gc ; Cn ; Unassigned +gc ; Co ; Private_Use +gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu +gc ; LC ; Cased_Letter # Ll | Lt | Lu +gc ; Ll ; Lowercase_Letter +gc ; Lm ; Modifier_Letter +gc ; Lo ; Other_Letter +gc ; Lu ; Uppercase_Letter +gc ; M ; Mark # Mc | Me | Mn +gc ; Mc ; Spacing_Mark +gc ; Mn ; Nonspacing_Mark +gc ; N ; Number # Nd | Nl | No +gc ; Nd ; Decimal_Number +gc ; No ; Other_Number +gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps +gc ; Pd ; Dash_Punctuation +gc ; Pe ; Close_Punctuation +gc ; Po ; Other_Punctuation +gc ; Ps ; Open_Punctuation +gc ; S ; Symbol # Sc | Sk | Sm | So +gc ; Sc ; Currency_Symbol +gc ; Sm ; Math_Symbol +gc ; So ; Other_Symbol +gc ; Z ; Separator # Zl | Zp | Zs +gc ; Zl ; Line_Separator +gc ; Zp ; Paragraph_Separator +gc ; Zs ; Space_Separator + +nt ; de ; Decimal +nt ; di ; Digit +nt ; n/a ; None +nt ; nu ; Numeric +END + + if (-e 'ArabicShaping.txt') { + push @return, split /\n/, <<'END'; +jg ; n/a ; AIN +jg ; n/a ; ALEF +jg ; n/a ; DAL +jg ; n/a ; GAF +jg ; n/a ; LAM +jg ; n/a ; MEEM +jg ; n/a ; NO_JOINING_GROUP +jg ; n/a ; NOON +jg ; n/a ; QAF +jg ; n/a ; SAD +jg ; n/a ; SEEN +jg ; n/a ; TAH +jg ; n/a ; WAW + +jt ; C ; Join_Causing +jt ; D ; Dual_Joining +jt ; L ; Left_Joining +jt ; R ; Right_Joining +jt ; U ; Non_Joining +jt ; T ; Transparent +END + if ($v_version ge v3.0.0) { + push @return, split /\n/, <<'END'; +jg ; n/a ; ALAPH +jg ; n/a ; BEH +jg ; n/a ; BETH +jg ; n/a ; DALATH_RISH +jg ; n/a ; E +jg ; n/a ; FEH +jg ; n/a ; FINAL_SEMKATH +jg ; n/a ; GAMAL +jg ; n/a ; HAH +jg ; n/a ; HAMZA_ON_HEH_GOAL +jg ; n/a ; HE +jg ; n/a ; HEH +jg ; n/a ; HEH_GOAL +jg ; n/a ; HETH +jg ; n/a ; KAF +jg ; n/a ; KAPH +jg ; n/a ; KNOTTED_HEH +jg ; n/a ; LAMADH +jg ; n/a ; MIM +jg ; n/a ; NUN +jg ; n/a ; PE +jg ; n/a ; QAPH +jg ; n/a ; REH +jg ; n/a ; REVERSED_PE +jg ; n/a ; SADHE +jg ; n/a ; SEMKATH +jg ; n/a ; SHIN +jg ; n/a ; SWASH_KAF +jg ; n/a ; TAW +jg ; n/a ; TEH_MARBUTA +jg ; n/a ; TETH +jg ; n/a ; YEH +jg ; n/a ; YEH_BARREE +jg ; n/a ; YEH_WITH_TAIL +jg ; n/a ; YUDH +jg ; n/a ; YUDH_HE +jg ; n/a ; ZAIN +END + } + } + + + if (-e 'EastAsianWidth.txt') { + push @return, split /\n/, <<'END'; +ea ; A ; Ambiguous +ea ; F ; Fullwidth +ea ; H ; Halfwidth +ea ; N ; Neutral +ea ; Na ; Narrow +ea ; W ; Wide +END + } + + if (-e 'LineBreak.txt') { + push @return, split /\n/, <<'END'; +lb ; AI ; Ambiguous +lb ; AL ; Alphabetic +lb ; B2 ; Break_Both +lb ; BA ; Break_After +lb ; BB ; Break_Before +lb ; BK ; Mandatory_Break +lb ; CB ; Contingent_Break +lb ; CL ; Close_Punctuation +lb ; CM ; Combining_Mark +lb ; CR ; Carriage_Return +lb ; EX ; Exclamation +lb ; GL ; Glue +lb ; HY ; Hyphen +lb ; ID ; Ideographic +lb ; IN ; Inseperable +lb ; IS ; Infix_Numeric +lb ; LF ; Line_Feed +lb ; NS ; Nonstarter +lb ; NU ; Numeric +lb ; OP ; Open_Punctuation +lb ; PO ; Postfix_Numeric +lb ; PR ; Prefix_Numeric +lb ; QU ; Quotation +lb ; SA ; Complex_Context +lb ; SG ; Surrogate +lb ; SP ; Space +lb ; SY ; Break_Symbols +lb ; XX ; Unknown +lb ; ZW ; ZWSpace +END + } + + if (-e 'DNormalizationProps.txt') { + push @return, split /\n/, <<'END'; +qc ; M ; Maybe +qc ; N ; No +qc ; Y ; Yes +END + } + + if (-e 'Scripts.txt') { + push @return, split /\n/, <<'END'; +sc ; Arab ; Arabic +sc ; Armn ; Armenian +sc ; Beng ; Bengali +sc ; Bopo ; Bopomofo +sc ; Cans ; Canadian_Aboriginal +sc ; Cher ; Cherokee +sc ; Cyrl ; Cyrillic +sc ; Deva ; Devanagari +sc ; Dsrt ; Deseret +sc ; Ethi ; Ethiopic +sc ; Geor ; Georgian +sc ; Goth ; Gothic +sc ; Grek ; Greek +sc ; Gujr ; Gujarati +sc ; Guru ; Gurmukhi +sc ; Hang ; Hangul +sc ; Hani ; Han +sc ; Hebr ; Hebrew +sc ; Hira ; Hiragana +sc ; Ital ; Old_Italic +sc ; Kana ; Katakana +sc ; Khmr ; Khmer +sc ; Knda ; Kannada +sc ; Laoo ; Lao +sc ; Latn ; Latin +sc ; Mlym ; Malayalam +sc ; Mong ; Mongolian +sc ; Mymr ; Myanmar +sc ; Ogam ; Ogham +sc ; Orya ; Oriya +sc ; Qaai ; Inherited +sc ; Runr ; Runic +sc ; Sinh ; Sinhala +sc ; Syrc ; Syriac +sc ; Taml ; Tamil +sc ; Telu ; Telugu +sc ; Thaa ; Thaana +sc ; Thai ; Thai +sc ; Tibt ; Tibetan +sc ; Yiii ; Yi +sc ; Zyyy ; Common +END + } + + if ($v_version ge v2.0.0) { + push @return, split /\n/, <<'END'; +dt ; com ; compat +dt ; nar ; narrow +dt ; sml ; small +dt ; vert ; vertical +dt ; wide ; wide + +gc ; Cf ; Format +gc ; Cs ; Surrogate +gc ; Lt ; Titlecase_Letter +gc ; Me ; Enclosing_Mark +gc ; Nl ; Letter_Number +gc ; Pc ; Connector_Punctuation +gc ; Sk ; Modifier_Symbol +END + } + if ($v_version ge v2.1.2) { + push @return, "bc ; S ; Segment_Separator\n"; + } + if ($v_version ge v2.1.5) { + push @return, split /\n/, <<'END'; +gc ; Pf ; Final_Punctuation +gc ; Pi ; Initial_Punctuation +END + } + if ($v_version ge v2.1.8) { + push @return, "ccc; 240; IS ; Iota_Subscript\n"; + } + + if ($v_version ge v3.0.0) { + push @return, split /\n/, <<'END'; +bc ; AL ; Arabic_Letter +bc ; BN ; Boundary_Neutral +bc ; LRE ; Left_To_Right_Embedding +bc ; LRO ; Left_To_Right_Override +bc ; NSM ; Nonspacing_Mark +bc ; PDF ; Pop_Directional_Format +bc ; RLE ; Right_To_Left_Embedding +bc ; RLO ; Right_To_Left_Override + +ccc; 233; DB ; Double_Below +END + } + + if ($v_version ge v3.1.0) { + push @return, "ccc; 226; R ; Right\n"; + } + + return @return; +} + +sub output_perl_charnames_line ($$) { + + # Output the entries in Perl_charnames specially, using 5 digits instead + # of four. This makes the entries a constant length, and simplifies + # charnames.pm which this table is for. Unicode can have 6 digit + # ordinals, but they are all private use or noncharacters which do not + # have names, so won't be in this table. + + return sprintf "%05X\t%s\n", $_[0], $_[1]; +} + +{ # Closure + # This is used to store the range list of all the code points usable when + # the little used $compare_versions feature is enabled. + my $compare_versions_range_list; + + sub process_generic_property_file { + # This processes a file containing property mappings and puts them + # into internal map tables. It should be used to handle any property + # files that have mappings from a code point or range thereof to + # something else. This means almost all the UCD .txt files. + # each_line_handlers() should be set to adjust the lines of these + # files, if necessary, to what this routine understands: + # + # 0374 ; NFD_QC; N + # 003C..003E ; Math + # + # the fields are: "codepoint range ; property; map" + # + # meaning the codepoints in the range all have the value 'map' under + # 'property'. + # Beginning and trailing white space in each field are not signficant. + # Note there is not a trailing semi-colon in the above. A trailing + # semi-colon means the map is a null-string. An omitted map, as + # opposed to a null-string, is assumed to be 'Y', based on Unicode + # table syntax. (This could have been hidden from this routine by + # doing it in the $file object, but that would require parsing of the + # line there, so would have to parse it twice, or change the interface + # to pass this an array. So not done.) + # + # The map field may begin with a sequence of commands that apply to + # this range. Each such command begins and ends with $CMD_DELIM. + # These are used to indicate, for example, that the mapping for a + # range has a non-default type. + # + # This loops through the file, calling it's next_line() method, and + # then taking the map and adding it to the property's table. + # Complications arise because any number of properties can be in the + # file, in any order, interspersed in any way. The first time a + # property is seen, it gets information about that property and + # caches it for quick retrieval later. It also normalizes the maps + # so that only one of many synonym is stored. The Unicode input files + # do use some multiple synonyms. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my %property_info; # To keep track of what properties + # have already had entries in the + # current file, and info about each, + # so don't have to recompute. + my $property_name; # property currently being worked on + my $property_type; # and its type + my $previous_property_name = ""; # name from last time through loop + my $property_object; # pointer to the current property's + # object + my $property_addr; # the address of that object + my $default_map; # the string that code points missing + # from the file map to + my $default_table; # For non-string properties, a + # reference to the match table that + # will contain the list of code + # points that map to $default_map. + + # Get the next real non-comment line + LINE: + while ($file->next_line) { + + # Default replacement type; means that if parts of the range have + # already been stored in our tables, the new map overrides them if + # they differ more than cosmetically + my $replace = $IF_NOT_EQUIVALENT; + my $map_type; # Default type for the map of this range + + #local $to_trace = 1 if main::DEBUG; + trace $_ if main::DEBUG && $to_trace; + + # Split the line into components + my ($range, $property_name, $map, @remainder) + = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + + # If more or less on the line than we are expecting, warn and skip + # the line + if (@remainder) { + $file->carp_bad_line('Extra fields'); + next LINE; + } + elsif ( ! defined $property_name) { + $file->carp_bad_line('Missing property'); + next LINE; + } + + # Examine the range. + if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x) + { + $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)"); + next LINE; + } + my $low = hex $1; + my $high = (defined $2) ? hex $2 : $low; + + # For the very specialized case of comparing two Unicode + # versions... + if (DEBUG && $compare_versions) { + if ($property_name eq 'Age') { + + # Only allow code points at least as old as the version + # specified. + my $age = pack "C*", split(/\./, $map); # v string + next LINE if $age gt $compare_versions; + } + else { + + # Again, we throw out code points younger than those of + # the specified version. By now, the Age property is + # populated. We use the intersection of each input range + # with this property to find what code points in it are + # valid. To do the intersection, we have to convert the + # Age property map to a Range_list. We only have to do + # this once. + if (! defined $compare_versions_range_list) { + my $age = property_ref('Age'); + if (! -e 'DAge.txt') { + croak "Need to have 'DAge.txt' file to do version comparison"; + } + elsif ($age->count == 0) { + croak "The 'Age' table is empty, but its file exists"; + } + $compare_versions_range_list + = Range_List->new(Initialize => $age); + } + + # An undefined map is always 'Y' + $map = 'Y' if ! defined $map; + + # Calculate the intersection of the input range with the + # code points that are known in the specified version + my @ranges = ($compare_versions_range_list + & Range->new($low, $high))->ranges; + + # If the intersection is empty, throw away this range + next LINE unless @ranges; + + # Only examine the first range this time through the loop. + my $this_range = shift @ranges; + + # Put any remaining ranges in the queue to be processed + # later. Note that there is unnecessary work here, as we + # will do the intersection again for each of these ranges + # during some future iteration of the LINE loop, but this + # code is not used in production. The later intersections + # are guaranteed to not splinter, so this will not become + # an infinite loop. + my $line = join ';', $property_name, $map; + foreach my $range (@ranges) { + $file->insert_adjusted_lines(sprintf("%04X..%04X; %s", + $range->start, + $range->end, + $line)); + } + + # And process the first range, like any other. + $low = $this_range->start; + $high = $this_range->end; + } + } # End of $compare_versions + + # If changing to a new property, get the things constant per + # property + if ($previous_property_name ne $property_name) { + + $property_object = property_ref($property_name); + if (! defined $property_object) { + $file->carp_bad_line("Unexpected property '$property_name'. Skipped"); + next LINE; + } + { no overloading; $property_addr = pack 'J', $property_object; } + + # Defer changing names until have a line that is acceptable + # (the 'next' statement above means is unacceptable) + $previous_property_name = $property_name; + + # If not the first time for this property, retrieve info about + # it from the cache + if (defined ($property_info{$property_addr}{'type'})) { + $property_type = $property_info{$property_addr}{'type'}; + $default_map = $property_info{$property_addr}{'default'}; + $map_type + = $property_info{$property_addr}{'pseudo_map_type'}; + $default_table + = $property_info{$property_addr}{'default_table'}; + } + else { + + # Here, is the first time for this property. Set up the + # cache. + $property_type = $property_info{$property_addr}{'type'} + = $property_object->type; + $map_type + = $property_info{$property_addr}{'pseudo_map_type'} + = $property_object->pseudo_map_type; + + # The Unicode files are set up so that if the map is not + # defined, it is a binary property + if (! defined $map && $property_type != $BINARY) { + if ($property_type != $UNKNOWN + && $property_type != $NON_STRING) + { + $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map"); + } + else { + $property_object->set_type($BINARY); + $property_type + = $property_info{$property_addr}{'type'} + = $BINARY; + } + } + + # Get any @missings default for this property. This + # should precede the first entry for the property in the + # input file, and is located in a comment that has been + # stored by the Input_file class until we access it here. + # It's possible that there is more than one such line + # waiting for us; collect them all, and parse + my @missings_list = $file->get_missings + if $file->has_missings_defaults; + foreach my $default_ref (@missings_list) { + my $default = $default_ref->[0]; + my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); }; + + # For string properties, the default is just what the + # file says, but non-string properties should already + # have set up a table for the default property value; + # use the table for these, so can resolve synonyms + # later to a single standard one. + if ($property_type == $STRING + || $property_type == $UNKNOWN) + { + $property_info{$addr}{'missings'} = $default; + } + else { + $property_info{$addr}{'missings'} + = $property_object->table($default); + } + } + + # Finished storing all the @missings defaults in the input + # file so far. Get the one for the current property. + my $missings = $property_info{$property_addr}{'missings'}; + + # But we likely have separately stored what the default + # should be. (This is to accommodate versions of the + # standard where the @missings lines are absent or + # incomplete.) Hopefully the two will match. But check + # it out. + $default_map = $property_object->default_map; + + # If the map is a ref, it means that the default won't be + # processed until later, so undef it, so next few lines + # will redefine it to something that nothing will match + undef $default_map if ref $default_map; + + # Create a $default_map if don't have one; maybe a dummy + # that won't match anything. + if (! defined $default_map) { + + # Use any @missings line in the file. + if (defined $missings) { + if (ref $missings) { + $default_map = $missings->full_name; + $default_table = $missings; + } + else { + $default_map = $missings; + } + + # And store it with the property for outside use. + $property_object->set_default_map($default_map); + } + else { + + # Neither an @missings nor a default map. Create + # a dummy one, so won't have to test definedness + # in the main loop. + $default_map = '_Perl This will never be in a file + from Unicode'; + } + } + + # Here, we have $default_map defined, possibly in terms of + # $missings, but maybe not, and possibly is a dummy one. + if (defined $missings) { + + # Make sure there is no conflict between the two. + # $missings has priority. + if (ref $missings) { + $default_table + = $property_object->table($default_map); + if (! defined $default_table + || $default_table != $missings) + { + if (! defined $default_table) { + $default_table = $UNDEF; + } + $file->carp_bad_line(<full_name; + } + $property_info{$property_addr}{'default_table'} + = $default_table; + } + elsif ($default_map ne $missings) { + $file->carp_bad_line(<table($default_map); + } + } # End of is first time for this property + } # End of switching properties. + + # Ready to process the line. + # The Unicode files are set up so that if the map is not defined, + # it is a binary property with value 'Y' + if (! defined $map) { + $map = 'Y'; + } + else { + + # If the map begins with a special command to us (enclosed in + # delimiters), extract the command(s). + if (substr($map, 0, 1) eq $CMD_DELIM) { + while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { + my $command = $1; + if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { + $replace = $1; + } + elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { + $map_type = $1; + } + else { + $file->carp_bad_line("Unknown command line: '$1'"); + next LINE; + } + } + } + } + + if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x) + { + + # Here, we have a map to a particular code point, and the + # default map is to a code point itself. If the range + # includes the particular code point, change that portion of + # the range to the default. This makes sure that in the final + # table only the non-defaults are listed. + my $decimal_map = hex $map; + if ($low <= $decimal_map && $decimal_map <= $high) { + + # If the range includes stuff before or after the map + # we're changing, split it and process the split-off parts + # later. + if ($low < $decimal_map) { + $file->insert_adjusted_lines( + sprintf("%04X..%04X; %s; %s", + $low, + $decimal_map - 1, + $property_name, + $map)); + } + if ($high > $decimal_map) { + $file->insert_adjusted_lines( + sprintf("%04X..%04X; %s; %s", + $decimal_map + 1, + $high, + $property_name, + $map)); + } + $low = $high = $decimal_map; + $map = $CODE_POINT; + } + } + + # If we can tell that this is a synonym for the default map, use + # the default one instead. + if ($property_type != $STRING + && $property_type != $UNKNOWN) + { + my $table = $property_object->table($map); + if (defined $table && $table == $default_table) { + $map = $default_map; + } + } + + # And figure out the map type if not known. + if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) { + if ($map eq "") { # Nulls are always $NULL map type + $map_type = $NULL; + } # Otherwise, non-strings, and those that don't allow + # $MULTI_CP, and those that aren't multiple code points are + # 0 + elsif + (($property_type != $STRING && $property_type != $UNKNOWN) + || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP) + || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x) + { + $map_type = 0; + } + else { + $map_type = $MULTI_CP; + } + } + + $property_object->add_map($low, $high, + $map, + Type => $map_type, + Replace => $replace); + } # End of loop through file's lines + + return; + } +} + +{ # Closure for UnicodeData.txt handling + + # This file was the first one in the UCD; its design leads to some + # awkwardness in processing. Here is a sample line: + # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; + # The fields in order are: + my $i = 0; # The code point is in field 0, and is shifted off. + my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A") + my $CATEGORY = $i++; # category (e.g. "Lu") + my $CCC = $i++; # Canonical combining class (e.g. "230") + my $BIDI = $i++; # directional class (e.g. "L") + my $PERL_DECOMPOSITION = $i++; # decomposition mapping + my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value + my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript + # Dual-use in this program; see below + my $NUMERIC = $i++; # numeric value + my $MIRRORED = $i++; # ? mirrored + my $UNICODE_1_NAME = $i++; # name in Unicode 1.0 + my $COMMENT = $i++; # iso comment + my $UPPER = $i++; # simple uppercase mapping + my $LOWER = $i++; # simple lowercase mapping + my $TITLE = $i++; # simple titlecase mapping + my $input_field_count = $i; + + # This routine in addition outputs these extra fields: + my $DECOMP_TYPE = $i++; # Decomposition type + + # These fields are modifications of ones above, and are usually + # suppressed; they must come last, as for speed, the loop upper bound is + # normally set to ignore them + my $NAME = $i++; # This is the strict name field, not the one that + # charnames uses. + my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used + # by Unicode::Normalize + my $last_field = $i - 1; + + # All these are read into an array for each line, with the indices defined + # above. The empty fields in the example line above indicate that the + # value is defaulted. The handler called for each line of the input + # changes these to their defaults. + + # Here are the official names of the properties, in a parallel array: + my @field_names; + $field_names[$BIDI] = 'Bidi_Class'; + $field_names[$CATEGORY] = 'General_Category'; + $field_names[$CCC] = 'Canonical_Combining_Class'; + $field_names[$CHARNAME] = 'Perl_Charnames'; + $field_names[$COMMENT] = 'ISO_Comment'; + $field_names[$DECOMP_MAP] = 'Decomposition_Mapping'; + $field_names[$DECOMP_TYPE] = 'Decomposition_Type'; + $field_names[$LOWER] = 'Lowercase_Mapping'; + $field_names[$MIRRORED] = 'Bidi_Mirrored'; + $field_names[$NAME] = 'Name'; + $field_names[$NUMERIC] = 'Numeric_Value'; + $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type'; + $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit'; + $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping'; + $field_names[$TITLE] = 'Titlecase_Mapping'; + $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name'; + $field_names[$UPPER] = 'Uppercase_Mapping'; + + # Some of these need a little more explanation: + # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode + # property, but is used in calculating the Numeric_Type. Perl however, + # creates a file from this field, so a Perl property is created from it. + # Similarly, the Other_Digit field is used only for calculating the + # Numeric_Type, and so it can be safely re-used as the place to store + # the value for Numeric_Type; hence it is referred to as + # $NUMERIC_TYPE_OTHER_DIGIT. + # The input field named $PERL_DECOMPOSITION is a combination of both the + # decomposition mapping and its type. Perl creates a file containing + # exactly this field, so it is used for that. The two properties are + # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE. + # $DECOMP_MAP is usually suppressed (unless the lists are changed to + # output it), as Perl doesn't use it directly. + # The input field named here $CHARNAME is used to construct the + # Perl_Charnames property, which is a combination of the Name property + # (which the input field contains), and the Unicode_1_Name property, and + # others from other files. Since, the strict Name property is not used + # by Perl, this field is used for the table that Perl does use. The + # strict Name property table is usually suppressed (unless the lists are + # changed to output it), so it is accumulated in a separate field, + # $NAME, which to save time is discarded unless the table is actually to + # be output + + # This file is processed like most in this program. Control is passed to + # process_generic_property_file() which calls filter_UnicodeData_line() + # for each input line. This filter converts the input into line(s) that + # process_generic_property_file() understands. There is also a setup + # routine called before any of the file is processed, and a handler for + # EOF processing, all in this closure. + + # A huge speed-up occurred at the cost of some added complexity when these + # routines were altered to buffer the outputs into ranges. Almost all the + # lines of the input file apply to just one code point, and for most + # properties, the map for the next code point up is the same as the + # current one. So instead of creating a line for each property for each + # input line, filter_UnicodeData_line() remembers what the previous map + # of a property was, and doesn't generate a line to pass on until it has + # to, as when the map changes; and that passed-on line encompasses the + # whole contiguous range of code points that have the same map for that + # property. This means a slight amount of extra setup, and having to + # flush these buffers on EOF, testing if the maps have changed, plus + # remembering state information in the closure. But it means a lot less + # real time in not having to change the data base for each property on + # each line. + + # Another complication is that there are already a few ranges designated + # in the input. There are two lines for each, with the same maps except + # the code point and name on each line. This was actually the hardest + # thing to design around. The code points in those ranges may actually + # have real maps not given by these two lines. These maps will either + # be algorthimically determinable, or in the extracted files furnished + # with the UCD. In the event of conflicts between these extracted files, + # and this one, Unicode says that this one prevails. But it shouldn't + # prevail for conflicts that occur in these ranges. The data from the + # extracted files prevails in those cases. So, this program is structured + # so that those files are processed first, storing maps. Then the other + # files are processed, generally overwriting what the extracted files + # stored. But just the range lines in this input file are processed + # without overwriting. This is accomplished by adding a special string to + # the lines output to tell process_generic_property_file() to turn off the + # overwriting for just this one line. + # A similar mechanism is used to tell it that the map is of a non-default + # type. + + sub setup_UnicodeData { # Called before any lines of the input are read + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Create a new property specially located that is a combination of the + # various Name properties: Name, Unicode_1_Name, Named Sequences, and + # Name_Alias properties. (The final duplicates elements of the + # first.) A comment for it will later be constructed based on the + # actual properties present and used + $perl_charname = Property->new('Perl_Charnames', + Core_Access => '\N{...} and "use charnames"', + Default_Map => "", + Directory => File::Spec->curdir(), + File => 'Name', + Internal_Only_Warning => 1, + Perl_Extension => 1, + Range_Size_1 => \&output_perl_charnames_line, + Type => $STRING, + ); + + my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', + Directory => File::Spec->curdir(), + File => 'Decomposition', + Format => $STRING_FORMAT, + Internal_Only_Warning => 1, + Perl_Extension => 1, + Default_Map => $CODE_POINT, + + # normalize.pm can't cope with these + Output_Range_Counts => 0, + + # This is a specially formatted table + # explicitly for normalize.pm, which + # is expecting a particular format, + # which means that mappings containing + # multiple code points are in the main + # body of the table + Map_Type => $COMPUTE_NO_MULTI_CP, + Type => $STRING, + ); + $Perl_decomp->add_comment(join_lines(<, which denotes the +compatible decomposition type. If the map does not begin with the , the decomposition is canonical. +END + )); + + my $Decimal_Digit = Property->new("Perl_Decimal_Digit", + Default_Map => "", + Perl_Extension => 1, + File => 'Digit', # Trad. location + Directory => $map_directory, + Type => $STRING, + Range_Size_1 => 1, + ); + $Decimal_Digit->add_comment(join_lines(<to_output_map + && ! property_ref('Name')->to_output_map) + { + $last_field = min($NAME, $DECOMP_MAP) - 1; + } elsif (property_ref('Decomposition_Mapping')->to_output_map) { + $last_field = $DECOMP_MAP; + } elsif (property_ref('Name')->to_output_map) { + $last_field = $NAME; + } + return; + } + + my $first_time = 1; # ? Is this the first line of the file + my $in_range = 0; # ? Are we in one of the file's ranges + my $previous_cp; # hex code point of previous line + my $decimal_previous_cp = -1; # And its decimal equivalent + my @start; # For each field, the current starting + # code point in hex for the range + # being accumulated. + my @fields; # The input fields; + my @previous_fields; # And those from the previous call + + sub filter_UnicodeData_line { + # Handle a single input line from UnicodeData.txt; see comments above + # Conceptually this takes a single line from the file containing N + # properties, and converts it into N lines with one property per line, + # which is what the final handler expects. But there are + # complications due to the quirkiness of the input file, and to save + # time, it accumulates ranges where the property values don't change + # and only emits lines when necessary. This is about an order of + # magnitude fewer lines emitted. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # $_ contains the input line. + # -1 in split means retain trailing null fields + (my $cp, @fields) = split /\s*;\s*/, $_, -1; + + #local $to_trace = 1 if main::DEBUG; + trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace; + if (@fields > $input_field_count) { + $file->carp_bad_line('Extra fields'); + $_ = ""; + return; + } + + my $decimal_cp = hex $cp; + + # We have to output all the buffered ranges when the next code point + # is not exactly one after the previous one, which means there is a + # gap in the ranges. + my $force_output = ($decimal_cp != $decimal_previous_cp + 1); + + # The decomposition mapping field requires special handling. It looks + # like either: + # + # 0032 0020 + # 0041 0300 + # + # The decomposition type is enclosed in ; if missing, it + # means the type is canonical. There are two decomposition mapping + # tables: the one for use by Perl's normalize.pm has a special format + # which is this field intact; the other, for general use is of + # standard format. In either case we have to find the decomposition + # type. Empty fields have None as their type, and map to the code + # point itself + if ($fields[$PERL_DECOMPOSITION] eq "") { + $fields[$DECOMP_TYPE] = 'None'; + $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT; + } + else { + ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION] + =~ / < ( .+? ) > \s* ( .+ ) /x; + if (! defined $fields[$DECOMP_TYPE]) { + $fields[$DECOMP_TYPE] = 'Canonical'; + $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION]; + } + else { + $fields[$DECOMP_MAP] = $map; + } + } + + # The 3 numeric fields also require special handling. The 2 digit + # fields must be either empty or match the number field. This means + # that if it is empty, they must be as well, and the numeric type is + # None, and the numeric value is 'Nan'. + # The decimal digit field must be empty or match the other digit + # field. If the decimal digit field is non-empty, the code point is + # a decimal digit, and the other two fields will have the same value. + # If it is empty, but the other digit field is non-empty, the code + # point is an 'other digit', and the number field will have the same + # value as the other digit field. If the other digit field is empty, + # but the number field is non-empty, the code point is a generic + # numeric type. + if ($fields[$NUMERIC] eq "") { + if ($fields[$PERL_DECIMAL_DIGIT] ne "" + || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "" + ) { + $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway"); + } + $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None'; + $fields[$NUMERIC] = 'NaN'; + } + else { + $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; + if ($fields[$PERL_DECIMAL_DIGIT] ne "") { + $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; + $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; + } + elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { + $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC]; + $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit'; + } + else { + $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric'; + + # Rationals require extra effort. + register_fraction($fields[$NUMERIC]) + if $fields[$NUMERIC] =~ qr{/}; + } + } + + # For the properties that have empty fields in the file, and which + # mean something different from empty, change them to that default. + # Certain fields just haven't been empty so far in any Unicode + # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC, + # $CATEGORY. This leaves just the two fields, and so we hard-code in + # the defaults; which are very unlikely to ever change. + $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq ""; + $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq ""; + + # UAX44 says that if title is empty, it is the same as whatever upper + # is, + $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq ""; + + # There are a few pairs of lines like: + # AC00;;Lo;0;L;;;;;N;;;;; + # D7A3;;Lo;0;L;;;;;N;;;;; + # that define ranges. These should be processed after the fields are + # adjusted above, as they may override some of them; but mostly what + # is left is to possibly adjust the $CHARNAME field. The names of all the + # paired lines start with a '<', but this is also true of ', + # which isn't one of these special ones. + if ($fields[$CHARNAME] eq '') { + + # Some code points in this file have the pseudo-name + # '', but the official name for such ones is the null + # string. For charnames.pm, we use the Unicode version 1 name + $fields[$NAME] = ""; + $fields[$CHARNAME] = $fields[$UNICODE_1_NAME]; + + # We had better not be in between range lines. + if ($in_range) { + $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); + $in_range = 0; + } + } + elsif (substr($fields[$CHARNAME], 0, 1) ne '<') { + + # Here is a non-range line. We had better not be in between range + # lines. + if ($in_range) { + $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway"); + $in_range = 0; + } + if ($fields[$CHARNAME] =~ s/- $cp $//x) { + + # These are code points whose names end in their code points, + # which means the names are algorithmically derivable from the + # code points. To shorten the output Name file, the algorithm + # for deriving these is placed in the file instead of each + # code point, so they have map type $CP_IN_NAME + $fields[$CHARNAME] = $CMD_DELIM + . $MAP_TYPE_CMD + . '=' + . $CP_IN_NAME + . $CMD_DELIM + . $fields[$CHARNAME]; + } + $fields[$NAME] = $fields[$CHARNAME]; + } + elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) { + $fields[$CHARNAME] = $fields[$NAME] = $1; + + # Here we are at the beginning of a range pair. + if ($in_range) { + $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway"); + } + $in_range = 1; + + # Because the properties in the range do not overwrite any already + # in the db, we must flush the buffers of what's already there, so + # they get handled in the normal scheme. + $force_output = 1; + + } + elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) { + $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line."); + $_ = ""; + return; + } + else { # Here, we are at the last line of a range pair. + + if (! $in_range) { + $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line."); + $_ = ""; + return; + } + $in_range = 0; + + $fields[$NAME] = $fields[$CHARNAME]; + + # Check that the input is valid: that the closing of the range is + # the same as the beginning. + foreach my $i (0 .. $last_field) { + next if $fields[$i] eq $previous_fields[$i]; + $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway"); + } + + # The processing differs depending on the type of range, + # determined by its $CHARNAME + if ($fields[$CHARNAME] =~ /^Hangul Syllable/) { + + # Check that the data looks right. + if ($decimal_previous_cp != $SBase) { + $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong"); + } + if ($decimal_cp != $SBase + $SCount - 1) { + $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong"); + } + + # The Hangul syllable range has a somewhat complicated name + # generation algorithm. Each code point in it has a canonical + # decomposition also computable by an algorithm. The + # perl decomposition map table built from these is used only + # by normalize.pm, which has the algorithm built in it, so the + # decomposition maps are not needed, and are large, so are + # omitted from it. If the full decomposition map table is to + # be output, the decompositions are generated for it, in the + # EOF handling code for this input file. + + $previous_fields[$DECOMP_TYPE] = 'Canonical'; + + # This range is stored in our internal structure with its + # own map type, different from all others. + $previous_fields[$CHARNAME] = $previous_fields[$NAME] + = $CMD_DELIM + . $MAP_TYPE_CMD + . '=' + . $HANGUL_SYLLABLE + . $CMD_DELIM + . $fields[$CHARNAME]; + } + elsif ($fields[$CHARNAME] =~ /^CJK/) { + + # The name for these contains the code point itself, and all + # are defined to have the same base name, regardless of what + # is in the file. They are stored in our internal structure + # with a map type of $CP_IN_NAME + $previous_fields[$CHARNAME] = $previous_fields[$NAME] + = $CMD_DELIM + . $MAP_TYPE_CMD + . '=' + . $CP_IN_NAME + . $CMD_DELIM + . 'CJK UNIFIED IDEOGRAPH'; + + } + elsif ($fields[$CATEGORY] eq 'Co' + || $fields[$CATEGORY] eq 'Cs') + { + # The names of all the code points in these ranges are set to + # null, as there are no names for the private use and + # surrogate code points. + + $previous_fields[$CHARNAME] = $previous_fields[$NAME] = ""; + } + else { + $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it."); + } + + # The first line of the range caused everything else to be output, + # and then its values were stored as the beginning values for the + # next set of ranges, which this one ends. Now, for each value, + # add a command to tell the handler that these values should not + # replace any existing ones in our database. + foreach my $i (0 .. $last_field) { + $previous_fields[$i] = $CMD_DELIM + . $REPLACE_CMD + . '=' + . $NO + . $CMD_DELIM + . $previous_fields[$i]; + } + + # And change things so it looks like the entire range has been + # gone through with this being the final part of it. Adding the + # command above to each field will cause this range to be flushed + # during the next iteration, as it guaranteed that the stored + # field won't match whatever value the next one has. + $previous_cp = $cp; + $decimal_previous_cp = $decimal_cp; + + # We are now set up for the next iteration; so skip the remaining + # code in this subroutine that does the same thing, but doesn't + # know about these ranges. + $_ = ""; + + return; + } + + # On the very first line, we fake it so the code below thinks there is + # nothing to output, and initialize so that when it does get output it + # uses the first line's values for the lowest part of the range. + # (One could avoid this by using peek(), but then one would need to + # know the adjustments done above and do the same ones in the setup + # routine; not worth it) + if ($first_time) { + $first_time = 0; + @previous_fields = @fields; + @start = ($cp) x scalar @fields; + $decimal_previous_cp = $decimal_cp - 1; + } + + # For each field, output the stored up ranges that this code point + # doesn't fit in. Earlier we figured out if all ranges should be + # terminated because of changing the replace or map type styles, or if + # there is a gap between this new code point and the previous one, and + # that is stored in $force_output. But even if those aren't true, we + # need to output the range if this new code point's value for the + # given property doesn't match the stored range's. + #local $to_trace = 1 if main::DEBUG; + foreach my $i (0 .. $last_field) { + my $field = $fields[$i]; + if ($force_output || $field ne $previous_fields[$i]) { + + # Flush the buffer of stored values. + $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); + + # Start a new range with this code point and its value + $start[$i] = $cp; + $previous_fields[$i] = $field; + } + } + + # Set the values for the next time. + $previous_cp = $cp; + $decimal_previous_cp = $decimal_cp; + + # The input line has generated whatever adjusted lines are needed, and + # should not be looked at further. + $_ = ""; + return; + } + + sub EOF_UnicodeData { + # Called upon EOF to flush the buffers, and create the Hangul + # decomposition mappings if needed. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Flush the buffers. + foreach my $i (1 .. $last_field) { + $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); + } + + if (-e 'Jamo.txt') { + + # The algorithm is published by Unicode, based on values in + # Jamo.txt, (which should have been processed before this + # subroutine), and the results left in %Jamo + unless (%Jamo) { + Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated."); + return; + } + + # If the full decomposition map table is being output, insert + # into it the Hangul syllable mappings. This is to avoid having + # to publish a subroutine in it to compute them. (which would + # essentially be this code.) This uses the algorithm published by + # Unicode. + if (property_ref('Decomposition_Mapping')->to_output_map) { + local $to_trace = 1 if main::DEBUG; + for (my $S = $SBase; $S < $SBase + $SCount; $S++) { + use integer; + my $SIndex = $S - $SBase; + my $L = $LBase + $SIndex / $NCount; + my $V = $VBase + ($SIndex % $NCount) / $TCount; + my $T = $TBase + $SIndex % $TCount; + + trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace; + my $decomposition = sprintf("%04X %04X", $L, $V); + $decomposition .= sprintf(" %04X", $T) if $T != $TBase; + $file->insert_adjusted_lines( + sprintf("%04X; Decomposition_Mapping; %s", + $S, + $decomposition)); + } + } + } + + return; + } + + sub filter_v1_ucd { + # Fix UCD lines in version 1. This is probably overkill, but this + # fixes some glaring errors in Version 1 UnicodeData.txt. That file: + # 1) had many Hangul (U+3400 - U+4DFF) code points that were later + # removed. This program retains them + # 2) didn't include ranges, which it should have, and which are now + # added in @corrected_lines below. It was hand populated by + # taking the data from Version 2, verified by analyzing + # DAge.txt. + # 3) There is a syntax error in the entry for U+09F8 which could + # cause problems for utf8_heavy, and so is changed. It's + # numeric value was simply a minus sign, without any number. + # (Eventually Unicode changed the code point to non-numeric.) + # 4) The decomposition types often don't match later versions + # exactly, and the whole syntax of that field is different; so + # the syntax is changed as well as the types to their later + # terminology. Otherwise normalize.pm would be very unhappy + # 5) Many ccc classes are different. These are left intact. + # 6) U+FF10 - U+FF19 are missing their numeric values in all three + # fields. These are unchanged because it doesn't really cause + # problems for Perl. + # 7) A number of code points, such as controls, don't have their + # Unicode Version 1 Names in this file. These are unchanged. + + my @corrected_lines = split /\n/, <<'END'; +4E00;;Lo;0;L;;;;;N;;;;; +9FA5;;Lo;0;L;;;;;N;;;;; +E000;;Co;0;L;;;;;N;;;;; +F8FF;;Co;0;L;;;;;N;;;;; +F900;;Lo;0;L;;;;;N;;;;; +FA2D;;Lo;0;L;;;;;N;;;;; +END + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + #local $to_trace = 1 if main::DEBUG; + trace $_ if main::DEBUG && $to_trace; + + # -1 => retain trailing null fields + my ($code_point, @fields) = split /\s*;\s*/, $_, -1; + + # At the first place that is wrong in the input, insert all the + # corrections, replacing the wrong line. + if ($code_point eq '4E00') { + my @copy = @corrected_lines; + $_ = shift @copy; + ($code_point, @fields) = split /\s*;\s*/, $_, -1; + + $file->insert_lines(@copy); + } + + + if ($fields[$NUMERIC] eq '-') { + $fields[$NUMERIC] = '-1'; # This is what 2.0 made it. + } + + if ($fields[$PERL_DECOMPOSITION] ne "") { + + # Several entries have this change to superscript 2 or 3 in the + # middle. Convert these to the modern version, which is to use + # the actual U+00B2 and U+00B3 (the superscript forms) instead. + # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes + # 'HHHH HHHH 00B3 HHHH'. + # It turns out that all of these that don't have another + # decomposition defined at the beginning of the line have the + # decomposition in later releases. + if ($code_point ne '00B2' && $code_point ne '00B3') { + if ($fields[$PERL_DECOMPOSITION] + =~ s/<\+sup> 003([23]) <-sup>/00B$1/) + { + if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') { + $fields[$PERL_DECOMPOSITION] = ' ' + . $fields[$PERL_DECOMPOSITION]; + } + } + } + + # If is like '<+circled> 0052 <-circled>', convert to + # ' 0052' + $fields[$PERL_DECOMPOSITION] =~ + s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x; + + # Convert ' HHHH HHHH ' to ' HHHH HHHH', etc. + $fields[$PERL_DECOMPOSITION] =~ + s/ \s* (.*?) \s* / $1/x + or $fields[$PERL_DECOMPOSITION] =~ + s/ \s* (.*?) \s* / $1/x + or $fields[$PERL_DECOMPOSITION] =~ + s/ \s* (.*?) \s* / $1/x + or $fields[$PERL_DECOMPOSITION] =~ + s/ \s* (.*?) \s* / $1/x; + + # Convert ' HHHH HHHH ' to ' HHHH', etc. + $fields[$PERL_DECOMPOSITION] =~ + s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x; + + # Change names to modern form. + $fields[$PERL_DECOMPOSITION] =~ s///g; + $fields[$PERL_DECOMPOSITION] =~ s///g; + $fields[$PERL_DECOMPOSITION] =~ s///g; + $fields[$PERL_DECOMPOSITION] =~ s///g; + + # One entry has weird braces + $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g; + } + + $_ = join ';', $code_point, @fields; + trace $_ if main::DEBUG && $to_trace; + return; + } + + sub filter_v2_1_5_ucd { + # A dozen entries in this 2.1.5 file had the mirrored and numeric + # columns swapped; These all had mirrored be 'N'. So if the numeric + # column appears to be N, swap it back. + + my ($code_point, @fields) = split /\s*;\s*/, $_, -1; + if ($fields[$NUMERIC] eq 'N') { + $fields[$NUMERIC] = $fields[$MIRRORED]; + $fields[$MIRRORED] = 'N'; + $_ = join ';', $code_point, @fields; + } + return; + } +} # End closure for UnicodeData + +sub process_GCB_test { + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + push @backslash_X_tests, $_; + } + + return; +} + +sub process_NamedSequences { + # NamedSequences.txt entries are just added to an array. Because these + # don't look like the other tables, they have their own handler. + # An example: + # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300 + # + # This just adds the sequence to an array for later handling + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1; + if (@remainder) { + $file->carp_bad_line( + "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'"); + next; + } + + # Note single \t in keeping with special output format of + # Perl_charnames. But it turns out that the code points don't have to + # be 5 digits long, like the rest, based on the internal workings of + # charnames.pm. This could be easily changed for consistency. + push @named_sequences, "$sequence\t$name"; + } + return; +} + +{ # Closure + + my $first_range; + + sub filter_early_ea_lb { + # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a + # third field be the name of the code point, which can be ignored in + # most cases. But it can be meaningful if it marks a range: + # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE + # 3400;W; + # + # We need to see the First in the example above to know it's a range. + # They did not use the later range syntaxes. This routine changes it + # to use the modern syntax. + # $1 is the Input_file object. + + my @fields = split /\s*;\s*/; + if ($fields[2] =~ /^<.*, First>/) { + $first_range = $fields[0]; + $_ = ""; + } + elsif ($fields[2] =~ /^<.*, Last>/) { + $_ = $_ = "$first_range..$fields[0]; $fields[1]"; + } + else { + undef $first_range; + $_ = "$fields[0]; $fields[1]"; + } + + return; + } +} + +sub filter_old_style_arabic_shaping { + # Early versions used a different term for the later one. + + my @fields = split /\s*;\s*/; + $fields[3] =~ s//No_Joining_Group/; + $fields[3] =~ s/\s+/_/g; # Change spaces to underscores + $_ = join ';', @fields; + return; +} + +sub filter_arabic_shaping_line { + # ArabicShaping.txt has entries that look like: + # 062A; TEH; D; BEH + # The field containing 'TEH' is not used. The next field is Joining_Type + # and the last is Joining_Group + # This generates two lines to pass on, one for each property on the input + # line. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + + if (@fields > 4) { + $file->carp_bad_line('Extra fields'); + $_ = ""; + return; + } + + $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]"); + $_ = "$fields[0]; Joining_Type; $fields[2]"; + + return; +} + +sub setup_special_casing { + # SpecialCasing.txt contains the non-simple case change mappings. The + # simple ones are in UnicodeData.txt, which should already have been read + # in to the full property data structures, so as to initialize these with + # the simple ones. Then the SpecialCasing.txt entries overwrite the ones + # which have different full mappings. + + # This routine sees if the simple mappings are to be output, and if so, + # copies what has already been put into the full mapping tables, while + # they still contain only the simple mappings. + + # The reason it is done this way is that the simple mappings are probably + # not going to be output, so it saves work to initialize the full tables + # with the simple mappings, and then overwrite those relatively few + # entries in them that have different full mappings, and thus skip the + # simple mapping tables altogether. + + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # For each of the case change mappings... + foreach my $case ('lc', 'tc', 'uc') { + my $full = property_ref($case); + unless (defined $full && ! $full->is_empty) { + Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); + } + + # The simple version's name in each mapping merely has an 's' in front + # of the full one's + my $simple = property_ref('s' . $case); + $simple->initialize($full) if $simple->to_output_map(); + } + + return; +} + +sub filter_special_casing_line { + # Change the format of $_ from SpecialCasing.txt into something that the + # generic handler understands. Each input line contains three case + # mappings. This will generate three lines to pass to the generic handler + # for each of those. + + # The input syntax (after stripping comments and trailing white space is + # like one of the following (with the final two being entries that we + # ignore): + # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S + # 03A3; 03C2; 03A3; 03A3; Final_Sigma; + # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE + # Note the trailing semi-colon, unlike many of the input files. That + # means that there will be an extra null field generated by the split + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + + # field #4 is when this mapping is conditional. If any of these get + # implemented, it would be by hard-coding in the casing functions in the + # Perl core, not through tables. But if there is a new condition we don't + # know about, output a warning. We know about all the conditions through + # 5.2 + if ($fields[4] ne "") { + my @conditions = split ' ', $fields[4]; + if ($conditions[0] ne 'tr' # We know that these languages have + # conditions, and some are multiple + && $conditions[0] ne 'az' + && $conditions[0] ne 'lt' + + # And, we know about a single condition Final_Sigma, but + # nothing else. + && ($v_version gt v5.2.0 + && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) + { + $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); + } + elsif ($conditions[0] ne 'Final_Sigma') { + + # Don't print out a message for Final_Sigma, because we have + # hard-coded handling for it. (But the standard could change + # what the rule should be, but it wouldn't show up here + # anyway. + + print "# SKIPPING Special Casing: $_\n" + if $verbosity >= $VERBOSE; + } + $_ = ""; + return; + } + elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { + $file->carp_bad_line('Extra fields'); + $_ = ""; + return; + } + + $_ = "$fields[0]; lc; $fields[1]"; + $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]"); + $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]"); + + return; +} + +sub filter_old_style_case_folding { + # This transforms $_ containing the case folding style of 3.0.1, to 3.1 + # and later style. Different letters were used in the earlier. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my @fields = split /\s*;\s*/; + if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields + $fields[1] = 'I'; + } + elsif ($fields[1] eq 'L') { + $fields[1] = 'C'; # L => C always + } + elsif ($fields[1] eq 'E') { + if ($fields[2] =~ / /) { # E => C if one code point; F otherwise + $fields[1] = 'F' + } + else { + $fields[1] = 'C' + } + } + else { + $file->carp_bad_line("Expecting L or E in second field"); + $_ = ""; + return; + } + $_ = join("; ", @fields) . ';'; + return; +} + +{ # Closure for case folding + + # Create the map for simple only if are going to output it, for otherwise + # it takes no part in anything we do. + my $to_output_simple; + + # These are experimental, perhaps will need these to pass to regcomp.c to + # handle the cases where for example the Kelvin sign character folds to k, + # and in regcomp, we need to know which of the characters can have a + # non-latin1 char fold to it, so it doesn't do the optimizations it might + # otherwise. + my @latin1_singly_folded; + my @latin1_folded; + + sub setup_case_folding($) { + # Read in the case foldings in CaseFolding.txt. This handles both + # simple and full case folding. + + $to_output_simple + = property_ref('Simple_Case_Folding')->to_output_map; + + return; + } + + sub filter_case_folding_line { + # Called for each line in CaseFolding.txt + # Input lines look like: + # 0041; C; 0061; # LATIN CAPITAL LETTER A + # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S + # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S + # + # 'C' means that folding is the same for both simple and full + # 'F' that it is only for full folding + # 'S' that it is only for simple folding + # 'T' is locale-dependent, and ignored + # 'I' is a type of 'F' used in some early releases. + # Note the trailing semi-colon, unlike many of the input files. That + # means that there will be an extra null field generated by the split + # below, which we ignore and hence is not an error. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1; + if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) { + $file->carp_bad_line('Extra fields'); + $_ = ""; + return; + } + + if ($type eq 'T') { # Skip Turkic case folding, is locale dependent + $_ = ""; + return; + } + + # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase + # I are all full foldings + if ($type eq 'C' || $type eq 'F' || $type eq 'I') { + $_ = "$range; Case_Folding; $map"; + } + else { + $_ = ""; + if ($type ne 'S') { + $file->carp_bad_line('Expecting C F I S or T in second field'); + return; + } + } + + # C and S are simple foldings, but simple case folding is not needed + # unless we explicitly want its map table output. + if ($to_output_simple && $type eq 'C' || $type eq 'S') { + $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map"); + } + + # Experimental, see comment above + if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point + my @folded = split ' ', $map; + if (hex $folded[0] < 256 && @folded == 1) { + push @latin1_singly_folded, hex $folded[0]; + } + foreach my $folded (@folded) { + push @latin1_folded, hex $folded if hex $folded < 256; + } + } + + return; + } + + sub post_fold { + # Experimental, see comment above + return; + + #local $to_trace = 1 if main::DEBUG; + @latin1_singly_folded = uniques(@latin1_singly_folded); + @latin1_folded = uniques(@latin1_folded); + trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace; + trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace; + return; + } +} # End case fold closure + +sub filter_jamo_line { + # Filter Jamo.txt lines. This routine mainly is used to populate hashes + # from this file that is used in generating the Name property for Jamo + # code points. But, it also is used to convert early versions' syntax + # into the modern form. Here are two examples: + # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax + # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax + # + # The input is $_, the output is $_ filtered. + + my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + + # Let the caller handle unexpected input. In earlier versions, there was + # a third field which is supposed to be a comment, but did not have a '#' + # before it. + return if @fields > (($v_version gt v3.0.0) ? 2 : 3); + + $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous + # beginning. + + # Some 2.1 versions had this wrong. Causes havoc with the algorithm. + $fields[1] = 'R' if $fields[0] eq '1105'; + + # Add to structure so can generate Names from it. + my $cp = hex $fields[0]; + my $short_name = $fields[1]; + $Jamo{$cp} = $short_name; + if ($cp <= $LBase + $LCount) { + $Jamo_L{$short_name} = $cp - $LBase; + } + elsif ($cp <= $VBase + $VCount) { + $Jamo_V{$short_name} = $cp - $VBase; + } + elsif ($cp <= $TBase + $TCount) { + $Jamo_T{$short_name} = $cp - $TBase; + } + else { + Carp::my_carp_bug("Unexpected Jamo code point in $_"); + } + + + # Reassemble using just the first two fields to look like a typical + # property file line + $_ = "$fields[0]; $fields[1]"; + + return; +} + +sub register_fraction($) { + # This registers the input rational number so that it can be passed on to + # utf8_heavy.pl, both in rational and floating forms. + + my $rational = shift; + + my $float = eval $rational; + $nv_floating_to_rational{$float} = $rational; + return; +} + +sub filter_numeric_value_line { + # DNumValues contains lines of a different syntax than the typical + # property file: + # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO + # + # This routine transforms $_ containing the anomalous syntax to the + # typical, by filtering out the extra columns, and convert early version + # decimal numbers to strings that look like rational numbers. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Starting in 5.1, there is a rational field. Just use that, omitting the + # extra columns. Otherwise convert the decimal number in the second field + # to a rational, and omit extraneous columns. + my @fields = split /\s*;\s*/, $_, -1; + my $rational; + + if ($v_version ge v5.1.0) { + if (@fields != 4) { + $file->carp_bad_line('Not 4 semi-colon separated fields'); + $_ = ""; + return; + } + $rational = $fields[3]; + $_ = join '; ', @fields[ 0, 3 ]; + } + else { + + # Here, is an older Unicode file, which has decimal numbers instead of + # rationals in it. Use the fraction to calculate the denominator and + # convert to rational. + + if (@fields != 2 && @fields != 3) { + $file->carp_bad_line('Not 2 or 3 semi-colon separated fields'); + $_ = ""; + return; + } + + my $codepoints = $fields[0]; + my $decimal = $fields[1]; + if ($decimal =~ s/\.0+$//) { + + # Anything ending with a decimal followed by nothing but 0's is an + # integer + $_ = "$codepoints; $decimal"; + $rational = $decimal; + } + else { + + my $denominator; + if ($decimal =~ /\.50*$/) { + $denominator = 2; + } + + # Here have the hardcoded repeating decimals in the fraction, and + # the denominator they imply. There were only a few denominators + # in the older Unicode versions of this file which this code + # handles, so it is easy to convert them. + + # The 4 is because of a round-off error in the Unicode 3.2 files + elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) { + $denominator = 3; + } + elsif ($decimal =~ /\.[27]50*$/) { + $denominator = 4; + } + elsif ($decimal =~ /\.[2468]0*$/) { + $denominator = 5; + } + elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) { + $denominator = 6; + } + elsif ($decimal =~ /\.(12|37|62|87)50*$/) { + $denominator = 8; + } + if ($denominator) { + my $sign = ($decimal < 0) ? "-" : ""; + my $numerator = int((abs($decimal) * $denominator) + .5); + $rational = "$sign$numerator/$denominator"; + $_ = "$codepoints; $rational"; + } + else { + $file->carp_bad_line("Can't cope with number '$decimal'."); + $_ = ""; + return; + } + } + } + + register_fraction($rational) if $rational =~ qr{/}; + return; +} + +{ # Closure + my %unihan_properties; + my $iicore; + + + sub setup_unihan { + # Do any special setup for Unihan properties. + + # This property gives the wrong computed type, so override. + my $usource = property_ref('kIRG_USource'); + $usource->set_type($STRING) if defined $usource; + + # This property is to be considered binary, so change all the values + # to Y. + $iicore = property_ref('kIICore'); + if (defined $iicore) { + $iicore->add_match_table('Y') if ! defined $iicore->table('Y'); + + # We have to change the default map, because the @missing line is + # misleading, given that we are treating it as binary. + $iicore->set_default_map('N'); + $iicore->set_type($BINARY); + } + + return; + } + + sub filter_unihan_line { + # Change unihan db lines to look like the others in the db. Here is + # an input sample: + # U+341C kCangjie IEKN + + # Tabs are used instead of semi-colons to separate fields; therefore + # they may have semi-colons embedded in them. Change these to periods + # so won't screw up the rest of the code. + s/;/./g; + + # Remove lines that don't look like ones we accept. + if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) { + $_ = ""; + return; + } + + # Extract the property, and save a reference to its object. + my $property = $1; + if (! exists $unihan_properties{$property}) { + $unihan_properties{$property} = property_ref($property); + } + + # Don't do anything unless the property is one we're handling, which + # we determine by seeing if there is an object defined for it or not + if (! defined $unihan_properties{$property}) { + $_ = ""; + return; + } + + # The iicore property is supposed to be a boolean, so convert to our + # standard boolean form. + if (defined $iicore && $unihan_properties{$property} == $iicore) { + $_ =~ s/$property.*/$property\tY/ + } + + # Convert the tab separators to our standard semi-colons, and convert + # the U+HHHH notation to the rest of the standard's HHHH + s/\t/;/g; + s/\b U \+ (?= $code_point_re )//xg; + + #local $to_trace = 1 if main::DEBUG; + trace $_ if main::DEBUG && $to_trace; + + return; + } +} + +sub filter_blocks_lines { + # In the Blocks.txt file, the names of the blocks don't quite match the + # names given in PropertyValueAliases.txt, so this changes them so they + # do match: Blanks and hyphens are changed into underscores. Also makes + # early release versions look like later ones + # + # $_ is transformed to the correct value. + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + if ($v_version lt v3.2.0) { + if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted + $_ = ""; + return; + } + + # Old versions used a different syntax to mark the range. + $_ =~ s/;\s+/../ if $v_version lt v3.1.0; + } + + my @fields = split /\s*;\s*/, $_, -1; + if (@fields != 2) { + $file->carp_bad_line("Expecting exactly two fields"); + $_ = ""; + return; + } + + # Change hyphens and blanks in the block name field only + $fields[1] =~ s/[ -]/_/g; + $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word + + $_ = join("; ", @fields); + return; +} + +{ # Closure + my $current_property; + + sub filter_old_style_proplist { + # PropList.txt has been in Unicode since version 2.0. Until 3.1, it + # was in a completely different syntax. Ken Whistler of Unicode says + # that it was something he used as an aid for his own purposes, but + # was never an official part of the standard. However, comments in + # DAge.txt indicate that non-character code points were available in + # the UCD as of 3.1. It is unclear to me (khw) how they could be + # there except through this file (but on the other hand, they first + # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe + # not. But the claim is that it was published as an aid to others who + # might want some more information than was given in the official UCD + # of the time. Many of the properties in it were incorporated into + # the later PropList.txt, but some were not. This program uses this + # early file to generate property tables that are otherwise not + # accessible in the early UCD's, and most were probably not really + # official at that time, so one could argue that it should be ignored, + # and you can easily modify things to skip this. And there are bugs + # in this file in various versions. (For example, the 2.1.9 version + # removes from Alphabetic the CJK range starting at 4E00, and they + # weren't added back in until 3.1.0.) Many of this file's properties + # were later sanctioned, so this code generates tables for those + # properties that aren't otherwise in the UCD of the time but + # eventually did become official, and throws away the rest. Here is a + # list of all the ones that are thrown away: + # Bidi=* duplicates UnicodeData.txt + # Combining never made into official property; + # is \P{ccc=0} + # Composite never made into official property. + # Currency Symbol duplicates UnicodeData.txt: gc=sc + # Decimal Digit duplicates UnicodeData.txt: gc=nd + # Delimiter never made into official property; + # removed in 3.0.1 + # Format Control never made into official property; + # similar to gc=cf + # High Surrogate duplicates Blocks.txt + # Ignorable Control never made into official property; + # similar to di=y + # ISO Control duplicates UnicodeData.txt: gc=cc + # Left of Pair never made into official property; + # Line Separator duplicates UnicodeData.txt: gc=zl + # Low Surrogate duplicates Blocks.txt + # Non-break was actually listed as a property + # in 3.2, but without any code + # points. Unicode denies that this + # was ever an official property + # Non-spacing duplicate UnicodeData.txt: gc=mn + # Numeric duplicates UnicodeData.txt: gc=cc + # Paired Punctuation never made into official property; + # appears to be gc=ps + gc=pe + # Paragraph Separator duplicates UnicodeData.txt: gc=cc + # Private Use duplicates UnicodeData.txt: gc=co + # Private Use High Surrogate duplicates Blocks.txt + # Punctuation duplicates UnicodeData.txt: gc=p + # Space different definition than eventual + # one. + # Titlecase duplicates UnicodeData.txt: gc=lt + # Unassigned Code Value duplicates UnicodeData.txt: gc=cc + # Zero-width never made into offical property; + # subset of gc=cf + # Most of the properties have the same names in this file as in later + # versions, but a couple do not. + # + # This subroutine filters $_, converting it from the old style into + # the new style. Here's a sample of the old-style + # + # ******************************************* + # + # Property dump for: 0x100000A0 (Join Control) + # + # 200C..200D (2 chars) + # + # In the example, the property is "Join Control". It is kept in this + # closure between calls to the subroutine. The numbers beginning with + # 0x were internal to Ken's program that generated this file. + + # If this line contains the property name, extract it. + if (/^Property dump for: [^(]*\((.*)\)/) { + $_ = $1; + + # Convert white space to underscores. + s/ /_/g; + + # Convert the few properties that don't have the same name as + # their modern counterparts + s/Identifier_Part/ID_Continue/ + or s/Not_a_Character/NChar/; + + # If the name matches an existing property, use it. + if (defined property_ref($_)) { + trace "new property=", $_ if main::DEBUG && $to_trace; + $current_property = $_; + } + else { # Otherwise discard it + trace "rejected property=", $_ if main::DEBUG && $to_trace; + undef $current_property; + } + $_ = ""; # The property is saved for the next lines of the + # file, but this defining line is of no further use, + # so clear it so that the caller won't process it + # further. + } + elsif (! defined $current_property || $_ !~ /^$code_point_re/) { + + # Here, the input line isn't a header defining a property for the + # following section, and either we aren't in such a section, or + # the line doesn't look like one that defines the code points in + # such a section. Ignore this line. + $_ = ""; + } + else { + + # Here, we have a line defining the code points for the current + # stashed property. Anything starting with the first blank is + # extraneous. Otherwise, it should look like a normal range to + # the caller. Append the property name so that it looks just like + # a modern PropList entry. + + $_ =~ s/\s.*//; + $_ .= "; $current_property"; + } + trace $_ if main::DEBUG && $to_trace; + return; + } +} # End closure for old style proplist + +sub filter_old_style_normalization_lines { + # For early releases of Unicode, the lines were like: + # 74..2A76 ; NFKD_NO + # For later releases this became: + # 74..2A76 ; NFKD_QC; N + # Filter $_ to look like those in later releases. + # Similarly for MAYBEs + + s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x; + + # Also, the property FC_NFKC was abbreviated to FNC + s/FNC/FC_NFKC/; + return; +} + +sub finish_Unicode() { + # This routine should be called after all the Unicode files have been read + # in. It: + # 1) Adds the mappings for code points missing from the files which have + # defaults specified for them. + # 2) At this this point all mappings are known, so it computes the type of + # each property whose type hasn't been determined yet. + # 3) Calculates all the regular expression match tables based on the + # mappings. + # 3) Calculates and adds the tables which are defined by Unicode, but + # which aren't derived by them + + # For each property, fill in any missing mappings, and calculate the re + # match tables. If a property has more than one missing mapping, the + # default is a reference to a data structure, and requires data from other + # properties to resolve. The sort is used to cause these to be processed + # last, after all the other properties have been calculated. + # (Fortunately, the missing properties so far don't depend on each other.) + foreach my $property + (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } + property_ref('*')) + { + # $perl has been defined, but isn't one of the Unicode properties that + # need to be finished up. + next if $property == $perl; + + # Handle the properties that have more than one possible default + if (ref $property->default_map) { + my $default_map = $property->default_map; + + # These properties have stored in the default_map: + # One or more of: + # 1) A default map which applies to all code points in a + # certain class + # 2) an expression which will evaluate to the list of code + # points in that class + # And + # 3) the default map which applies to every other missing code + # point. + # + # Go through each list. + while (my ($default, $eval) = $default_map->get_next_defaults) { + + # Get the class list, and intersect it with all the so-far + # unspecified code points yielding all the code points + # in the class that haven't been specified. + my $list = eval $eval; + if ($@) { + Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'"); + last; + } + + # Narrow down the list to just those code points we don't have + # maps for yet. + $list = $list & $property->inverse_list; + + # Add mappings to the property for each code point in the list + foreach my $range ($list->ranges) { + $property->add_map($range->start, $range->end, $default); + } + } + + # All remaining code points have the other mapping. Set that up + # so the normal single-default mapping code will work on them + $property->set_default_map($default_map->other_default); + + # And fall through to do that + } + + # We should have enough data now to compute the type of the property. + $property->compute_type; + my $property_type = $property->type; + + next if ! $property->to_create_match_tables; + + # Here want to create match tables for this property + + # The Unicode db always (so far, and they claim into the future) have + # the default for missing entries in binary properties be 'N' (unless + # there is a '@missing' line that specifies otherwise) + if ($property_type == $BINARY && ! defined $property->default_map) { + $property->set_default_map('N'); + } + + # Add any remaining code points to the mapping, using the default for + # missing code points + if (defined (my $default_map = $property->default_map)) { + foreach my $range ($property->inverse_list->ranges) { + $property->add_map($range->start, $range->end, $default_map); + } + + # Make sure there is a match table for the default + if (! defined $property->table($default_map)) { + $property->add_match_table($default_map); + } + } + + # Have all we need to populate the match tables. + my $property_name = $property->name; + foreach my $range ($property->ranges) { + my $map = $range->value; + my $table = property_ref($property_name)->table($map); + if (! defined $table) { + + # Integral and rational property values are not necessarily + # defined in PropValueAliases, but all other ones should be, + # starting in 5.1 + if ($v_version ge v5.1.0 + && $map !~ /^ -? \d+ ( \/ \d+ )? $/x) + { + Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.") + } + $table = property_ref($property_name)->add_match_table($map); + } + + $table->add_range($range->start, $range->end); + } + + # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which + # all properties have this optional prefix. These do not get a + # separate entry in the pod file, because are covered by a wild-card + # entry + foreach my $alias ($property->aliases) { + my $Is_name = 'Is_' . $alias->name; + if (! defined (my $pre_existing = property_ref($Is_name))) { + $property->add_alias($Is_name, + Pod_Entry => 0, + Status => $alias->status, + Externally_Ok => 0); + } + else { + + # It seemed too much work to add in these warnings when it + # appears that Unicode has made a decision never to begin a + # property name with 'Is_', so this shouldn't happen, but just + # in case, it is a warning. + Carp::my_carp(<tables) { + my $minor_name = $minor_table->name; + next if length $minor_name == 1; + if (length $minor_name != 2) { + Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped."); + next; + } + + my $major_name = uc(substr($minor_name, 0, 1)); + my $major_table = $gc->table($major_name); + $major_table += $minor_table; + } + + # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt + # defines it as LC) + my $LC = $gc->table('LC'); + $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards... + $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility. + + + if ($LC->is_empty) { # Assume if not empty that Unicode has started to + # deliver the correct values in it + $LC->initialize($gc->table('Ll') + $gc->table('Lu')); + + # Lt not in release 1. + $LC += $gc->table('Lt') if defined $gc->table('Lt'); + } + $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); + + my $Cs = $gc->table('Cs'); + if (defined $Cs) { + $Cs->add_note('Mostly not usable in Perl.'); + $Cs->add_comment(join_lines(<is_empty) { + $fold->initialize(property_ref('Simple_Lowercase_Mapping')); + $fold->add_note(join_lines(<is_empty) { + my $simple = property_ref('Simple_' . $map); + $full->initialize($simple); + $full->add_comment($simple->comment) if ($simple->comment); + $full->add_note(join_lines(<add_match_table('Any', + Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]", + Matches_All => 1); + + foreach my $major_table ($gc->tables) { + + # Major categories are the ones with single letter names. + next if length($major_table->name) != 1; + + $Any += $major_table; + } + + if ($Any->max != $LAST_UNICODE_CODEPOINT) { + Carp::my_carp_bug("Generated highest code point (" + . sprintf("%X", $Any->max) + . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.") + } + if ($Any->range_count != 1 || $Any->min != 0) { + Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.") + } + + $Any->add_alias('All'); + + # Assigned is the opposite of gc=unassigned + my $Assigned = $perl->add_match_table('Assigned', + Description => "All assigned code points", + Initialize => ~ $gc->table('Unassigned'), + ); + + # Our internal-only property should be treated as more than just a + # synonym. + $perl->add_match_table('_CombAbove') + ->set_equivalent_to(property_ref('ccc')->table('Above'), + Related => 1); + + my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]'); + if (defined $block) { # This is equivalent to the block if have it. + my $Unicode_ASCII = $block->table('Basic_Latin'); + if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { + $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1); + } + } + + # Very early releases didn't have blocks, so initialize ASCII ourselves if + # necessary + if ($ASCII->is_empty) { + $ASCII->initialize([ 0..127 ]); + } + + # Get the best available case definitions. Early Unicode versions didn't + # have Uppercase and Lowercase defined, so use the general category + # instead for them. + my $Lower = $perl->add_match_table('Lower'); + my $Unicode_Lower = property_ref('Lowercase'); + if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { + $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); + } + else { + $Lower->set_equivalent_to($gc->table('Lowercase_Letter'), + Related => 1); + } + $perl->add_match_table("PosixLower", + Description => "[a-z]", + Initialize => $Lower & $ASCII, + ); + + my $Upper = $perl->add_match_table('Upper'); + my $Unicode_Upper = property_ref('Uppercase'); + if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { + $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); + } + else { + $Upper->set_equivalent_to($gc->table('Uppercase_Letter'), + Related => 1); + } + $perl->add_match_table("PosixUpper", + Description => "[A-Z]", + Initialize => $Upper & $ASCII, + ); + + # Earliest releases didn't have title case. Initialize it to empty if not + # otherwise present + my $Title = $perl->add_match_table('Title'); + my $lt = $gc->table('Lt'); + if (defined $lt) { + $Title->set_equivalent_to($lt, Related => 1); + } + + # If this Unicode version doesn't have Cased, set up our own. From + # Unicode 5.1: Definition D120: A character C is defined to be cased if + # and only if C has the Lowercase or Uppercase property or has a + # General_Category value of Titlecase_Letter. + unless (defined property_ref('Cased')) { + my $cased = $perl->add_match_table('Cased', + Initialize => $Lower + $Upper + $Title, + Description => 'Uppercase or Lowercase or Titlecase', + ); + } + + # Similarly, set up our own Case_Ignorable property if this Unicode + # version doesn't have it. From Unicode 5.1: Definition D121: A character + # C is defined to be case-ignorable if C has the value MidLetter or the + # value MidNumLet for the Word_Break property or its General_Category is + # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), + # Modifier_Letter (Lm), or Modifier_Symbol (Sk). + + # Perl has long had an internal-only alias for this property. + my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable'); + my $case_ignorable = property_ref('Case_Ignorable'); + if (defined $case_ignorable && ! $case_ignorable->is_empty) { + $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'), + Related => 1); + } + else { + + $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm')); + + # The following three properties are not in early releases + $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me'); + $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf'); + $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk'); + + # For versions 4.1 - 5.0, there is no MidNumLet property, and + # correspondingly the case-ignorable definition lacks that one. For + # 4.0, it appears that it was meant to be the same definition, but was + # inadvertently omitted from the standard's text, so add it if the + # property actually is there + my $wb = property_ref('Word_Break'); + if (defined $wb) { + my $midlet = $wb->table('MidLetter'); + $perl_case_ignorable += $midlet if defined $midlet; + my $midnumlet = $wb->table('MidNumLet'); + $perl_case_ignorable += $midnumlet if defined $midnumlet; + } + else { + + # In earlier versions of the standard, instead of the above two + # properties , just the following characters were used: + $perl_case_ignorable += 0x0027 # APOSTROPHE + + 0x00AD # SOFT HYPHEN (SHY) + + 0x2019; # RIGHT SINGLE QUOTATION MARK + } + } + + # The remaining perl defined tables are mostly based on Unicode TR 18, + # "Annex C: Compatibility Properties". All of these have two versions, + # one whose name generally begins with Posix that is posix-compliant, and + # one that matches Unicode characters beyond the Posix, ASCII range + + my $Alpha = $perl->add_match_table('Alpha'); + + # Alphabetic was not present in early releases + my $Alphabetic = property_ref('Alphabetic'); + if (defined $Alphabetic && ! $Alphabetic->is_empty) { + $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1); + } + else { + + # For early releases, we don't get it exactly right. The below + # includes more than it should, which in 5.2 terms is: L + Nl + + # Other_Alphabetic. Other_Alphabetic contains many characters from + # Mn and Mc. It's better to match more than we should, than less than + # we should. + $Alpha->initialize($gc->table('Letter') + + $gc->table('Mn') + + $gc->table('Mc')); + $Alpha += $gc->table('Nl') if defined $gc->table('Nl'); + $Alpha->add_description('Alphabetic'); + } + $perl->add_match_table("PosixAlpha", + Description => "[A-Za-z]", + Initialize => $Alpha & $ASCII, + ); + + my $Alnum = $perl->add_match_table('Alnum', + Description => 'Alphabetic and (Decimal) Numeric', + Initialize => $Alpha + $gc->table('Decimal_Number'), + ); + $perl->add_match_table("PosixAlnum", + Description => "[A-Za-z0-9]", + Initialize => $Alnum & $ASCII, + ); + + my $Word = $perl->add_match_table('Word', + Description => '\w, including beyond ASCII', + Initialize => $Alnum + $gc->table('Mark'), + ); + my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 + $Word += $Pc if defined $Pc; + + # This is a Perl extension, so the name doesn't begin with Posix. + $perl->add_match_table('PerlWord', + Description => '\w, restricted to ASCII = [A-Za-z0-9_]', + Initialize => $Word & $ASCII, + ); + + my $Blank = $perl->add_match_table('Blank', + Description => '\h, Horizontal white space', + + # 200B is Zero Width Space which is for line + # break control, and was listed as + # Space_Separator in early releases + Initialize => $gc->table('Space_Separator') + + 0x0009 # TAB + - 0x200B, # ZWSP + ); + $Blank->add_alias('HorizSpace'); # Another name for it. + $perl->add_match_table("PosixBlank", + Description => "\\t and ' '", + Initialize => $Blank & $ASCII, + ); + + my $VertSpace = $perl->add_match_table('VertSpace', + Description => '\v', + Initialize => $gc->table('Line_Separator') + + $gc->table('Paragraph_Separator') + + 0x000A # LINE FEED + + 0x000B # VERTICAL TAB + + 0x000C # FORM FEED + + 0x000D # CARRIAGE RETURN + + 0x0085, # NEL + ); + # No Posix equivalent for vertical space + + my $Space = $perl->add_match_table('Space', + Description => '\s including beyond ASCII plus vertical tab', + Initialize => $Blank + $VertSpace, + ); + $perl->add_match_table("PosixSpace", + Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", + Initialize => $Space & $ASCII, + ); + + # Perl's traditional space doesn't include Vertical Tab + my $SpacePerl = $perl->add_match_table('SpacePerl', + Description => '\s, including beyond ASCII', + Initialize => $Space - 0x000B, + ); + $perl->add_match_table('PerlSpace', + Description => '\s, restricted to ASCII', + Initialize => $SpacePerl & $ASCII, + ); + + my $Cntrl = $perl->add_match_table('Cntrl', + Description => 'Control characters'); + $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); + $perl->add_match_table("PosixCntrl", + Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", + Initialize => $Cntrl & $ASCII, + ); + + # $controls is a temporary used to construct Graph. + my $controls = Range_List->new(Initialize => $gc->table('Unassigned') + + $gc->table('Control')); + # Cs not in release 1 + $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate'); + + # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) + my $Graph = $perl->add_match_table('Graph', + Description => 'Characters that are graphical', + Initialize => ~ ($Space + $controls), + ); + $perl->add_match_table("PosixGraph", + Description => + '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]', + Initialize => $Graph & $ASCII, + ); + + $print = $perl->add_match_table('Print', + Description => 'Characters that are graphical plus space characters (but no controls)', + Initialize => $Blank + $Graph - $gc->table('Control'), + ); + $perl->add_match_table("PosixPrint", + Description => + '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', + Initialize => $print & $ASCII, + ); + + my $Punct = $perl->add_match_table('Punct'); + $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1); + + # \p{punct} doesn't include the symbols, which posix does + $perl->add_match_table('PosixPunct', + Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', + Initialize => $ASCII & ($gc->table('Punctuation') + + $gc->table('Symbol')), + ); + + my $Digit = $perl->add_match_table('Digit', + Description => '\d, extended beyond just [0-9]'); + $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); + my $PosixDigit = $perl->add_match_table("PosixDigit", + Description => '[0-9]', + Initialize => $Digit & $ASCII, + ); + + # Hex_Digit was not present in first release + my $Xdigit = $perl->add_match_table('XDigit'); + my $Hex = property_ref('Hex_Digit'); + if (defined $Hex && ! $Hex->is_empty) { + $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); + } + else { + # (Have to use hex instead of e.g. '0', because could be running on an + # non-ASCII machine, and we want the Unicode (ASCII) values) + $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66, + 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); + $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); + } + + my $dt = property_ref('Decomposition_Type'); + $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', + Initialize => ~ ($dt->table('None') + $dt->table('Canonical')), + Perl_Extension => 1, + Note => 'Union of all non-canonical decompositions', + ); + + # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier + # than SD appeared, construct it ourselves, based on the first release SD + # was in. + my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ'); + my $soft_dotted = property_ref('Soft_Dotted'); + if (defined $soft_dotted && ! $soft_dotted->is_empty) { + $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1); + } + else { + + # This list came from 3.2 Soft_Dotted. + $CanonDCIJ->initialize([ 0x0069, + 0x006A, + 0x012F, + 0x0268, + 0x0456, + 0x0458, + 0x1E2D, + 0x1ECB, + ]); + $CanonDCIJ = $CanonDCIJ & $Assigned; + } + + # These are used in Unicode's definition of \X + my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1); + my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1); + + my $gcb = property_ref('Grapheme_Cluster_Break'); + + # The 'extended' grapheme cluster came in 5.1. The non-extended + # definition differs too much from the traditional Perl one to use. + if (defined $gcb && defined $gcb->table('SpacingMark')) { + + # Note that assumes HST is defined; it came in an earlier release than + # GCB. In the line below, two negatives means: yes hangul + $begin += ~ property_ref('Hangul_Syllable_Type') + ->table('Not_Applicable') + + ~ ($gcb->table('Control') + + $gcb->table('CR') + + $gcb->table('LF')); + $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control'); + + $extend += $gcb->table('Extend') + $gcb->table('SpacingMark'); + $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); + } + else { # Old definition, used on early releases. + $extend += $gc->table('Mark') + + 0x200C # ZWNJ + + 0x200D; # ZWJ + $begin += ~ $extend; + + # Here we may have a release that has the regular grapheme cluster + # defined, or a release that doesn't have anything defined. + # We set things up so the Perl core degrades gracefully, possibly with + # placeholders that match nothing. + + if (! defined $gcb) { + $gcb = Property->new('GCB', Status => $PLACEHOLDER); + } + my $hst = property_ref('HST'); + if (!defined $hst) { + $hst = Property->new('HST', Status => $PLACEHOLDER); + $hst->add_match_table('Not_Applicable', + Initialize => $Any, + Matches_All => 1); + } + + # On some releases, here we may not have the needed tables for the + # perl core, in some releases we may. + foreach my $name (qw{ L LV LVT T V prepend }) { + my $table = $gcb->table($name); + if (! defined $table) { + $table = $gcb->add_match_table($name); + push @tables_that_may_be_empty, $table->complete_name; + } + + # The HST property predates the GCB one, and has identical tables + # for some of them, so use it if we can. + if ($table->is_empty + && defined $hst + && defined $hst->table($name)) + { + $table += $hst->table($name); + } + } + } + + # More GCB. If we found some hangul syllables, populate a combined + # table. + my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V'); + my $LV = $gcb->table('LV'); + if ($LV->is_empty) { + push @tables_that_may_be_empty, $lv_lvt_v->complete_name; + } else { + $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V'); + $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); + } + + # Was previously constructed to contain both Name and Unicode_1_Name + my @composition = ('Name', 'Unicode_1_Name'); + + if (@named_sequences) { + push @composition, 'Named_Sequence'; + foreach my $sequence (@named_sequences) { + $perl_charname->add_anomalous_entry($sequence); + } + } + + my $alias_sentence = ""; + my $alias = property_ref('Name_Alias'); + if (defined $alias) { + push @composition, 'Name_Alias'; + $alias->reset_each_range; + while (my ($range) = $alias->each_range) { + next if $range->value eq ""; + if ($range->start != $range->end) { + Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;"); + } + $perl_charname->add_duplicate($range->start, $range->value); + } + $alias_sentence = <add_comment(join_lines( <new('Perl_ccc', + Default_Map => $ccc->default_map, + Full_Name => 'Perl_Canonical_Combining_Class', + Internal_Only_Warning => 1, + Perl_Extension => 1, + Pod_Entry =>0, + Type => $ENUM, + Initialize => $ccc, + File => 'CombiningClass', + Directory => File::Spec->curdir(), + ); + $perl_ccc->set_to_output_map(1); + $perl_ccc->add_comment(join_lines(<add_match_table($ccc->default_map, + Initialize => $ccc->table($ccc->default_map), + Status => $SUPPRESSED); + + # Construct the Present_In property from the Age property. + if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) { + my $default_map = $age->default_map; + my $in = Property->new('In', + Default_Map => $default_map, + Full_Name => "Present_In", + Internal_Only_Warning => 1, + Perl_Extension => 1, + Type => $ENUM, + Initialize => $age, + ); + $in->add_comment(join_lines(<name !~ /^[\d.]*$/) + ? 1 + : ($b->name !~ /^[\d.]*$/) + ? -1 + : $a->name <=> $b->name + } $age->tables; + + # The Present_In property is the cumulative age properties. The first + # one hence is identical to the first age one. + my $previous_in = $in->add_match_table($first_age->name); + $previous_in->set_equivalent_to($first_age, Related => 1); + + my $description_start = "Code point's usage introduced in version "; + $first_age->add_description($description_start . $first_age->name); + + # To construct the accumlated values, for each of the age tables + # starting with the 2nd earliest, merge the earliest with it, to get + # all those code points existing in the 2nd earliest. Repeat merging + # the new 2nd earliest with the 3rd earliest to get all those existing + # in the 3rd earliest, and so on. + foreach my $current_age (@rest_ages) { + next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric + + my $current_in = $in->add_match_table( + $current_age->name, + Initialize => $current_age + $previous_in, + Description => $description_start + . $current_age->name + . ' or earlier', + ); + $previous_in = $current_in; + + # Add clarifying material for the corresponding age file. This is + # in part because of the confusing and contradictory information + # given in the Standard's documentation itself, as of 5.2. + $current_age->add_description( + "Code point's usage was introduced in version " + . $current_age->name); + $current_age->add_note("See also $in"); + + } + + # And finally the code points whose usages have yet to be decided are + # the same in both properties. Note that permanently unassigned code + # points actually have their usage assigned (as being permanently + # unassigned), so that these tables are not the same as gc=cn. + my $unassigned = $in->add_match_table($default_map); + my $age_default = $age->table($default_map); + $age_default->add_description(<set_equivalent_to($age_default, Related => 1); + } + + + # Finished creating all the perl properties. All non-internal non-string + # ones have a synonym of 'Is_' prefixed. (Internal properties begin with + # an underscore.) These do not get a separate entry in the pod file + foreach my $table ($perl->tables) { + foreach my $alias ($table->aliases) { + next if $alias->name =~ /^_/; + $table->add_alias('Is_' . $alias->name, + Pod_Entry => 0, + Status => $alias->status, + Externally_Ok => 0); + } + } + + return; +} + +sub add_perl_synonyms() { + # A number of Unicode tables have Perl synonyms that are expressed in + # the single-form, \p{name}. These are: + # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and + # \p{Is_Name} as synonyms + # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms + # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms + # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no + # conflict, \p{Value} and \p{Is_Value} as well + # + # This routine generates these synonyms, warning of any unexpected + # conflicts. + + # Construct the list of tables to get synonyms for. Start with all the + # binary and the General_Category ones. + my @tables = grep { $_->type == $BINARY } property_ref('*'); + push @tables, $gc->tables; + + # If the version of Unicode includes the Script property, add its tables + if (defined property_ref('Script')) { + push @tables, property_ref('Script')->tables; + } + + # The Block tables are kept separate because they are treated differently. + # And the earliest versions of Unicode didn't include them, so add only if + # there are some. + my @blocks; + push @blocks, $block->tables if defined $block; + + # Here, have the lists of tables constructed. Process blocks last so that + # if there are name collisions with them, blocks have lowest priority. + # Should there ever be other collisions, manual intervention would be + # required. See the comments at the beginning of the program for a + # possible way to handle those semi-automatically. + foreach my $table (@tables, @blocks) { + + # For non-binary properties, the synonym is just the name of the + # table, like Greek, but for binary properties the synonym is the name + # of the property, and means the code points in its 'Y' table. + my $nominal = $table; + my $nominal_property = $nominal->property; + my $actual; + if (! $nominal->isa('Property')) { + $actual = $table; + } + else { + + # Here is a binary property. Use the 'Y' table. Verify that is + # there + my $yes = $nominal->table('Y'); + unless (defined $yes) { # Must be defined, but is permissible to + # be empty. + Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping."); + next; + } + $actual = $yes; + } + + foreach my $alias ($nominal->aliases) { + + # Attempt to create a table in the perl directory for the + # candidate table, using whatever aliases in it that don't + # conflict. Also add non-conflicting aliases for all these + # prefixed by 'Is_' (and/or 'In_' for Block property tables) + PREFIX: + foreach my $prefix ("", 'Is_', 'In_') { + + # Only Block properties can have added 'In_' aliases. + next if $prefix eq 'In_' and $nominal_property != $block; + + my $proposed_name = $prefix . $alias->name; + + # No Is_Is, In_In, nor combinations thereof + trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x; + next if $proposed_name =~ /^ I [ns] _I [ns] _/x; + + trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace; + + # Get a reference to any existing table in the perl + # directory with the desired name. + my $pre_existing = $perl->table($proposed_name); + + if (! defined $pre_existing) { + + # No name collision, so ok to add the perl synonym. + + my $make_pod_entry; + my $externally_ok; + my $status = $actual->status; + if ($nominal_property == $block) { + + # For block properties, the 'In' form is preferred for + # external use; the pod file contains wild cards for + # this and the 'Is' form so no entries for those; and + # we don't want people using the name without the + # 'In', so discourage that. + if ($prefix eq "") { + $make_pod_entry = 1; + $status = $status || $DISCOURAGED; + $externally_ok = 0; + } + elsif ($prefix eq 'In_') { + $make_pod_entry = 0; + $status = $status || $NORMAL; + $externally_ok = 1; + } + else { + $make_pod_entry = 0; + $status = $status || $DISCOURAGED; + $externally_ok = 0; + } + } + elsif ($prefix ne "") { + + # The 'Is' prefix is handled in the pod by a wild + # card, and we won't use it for an external name + $make_pod_entry = 0; + $status = $status || $NORMAL; + $externally_ok = 0; + } + else { + + # Here, is an empty prefix, non block. This gets its + # own pod entry and can be used for an external name. + $make_pod_entry = 1; + $status = $status || $NORMAL; + $externally_ok = 1; + } + + # Here, there isn't a perl pre-existing table with the + # name. Look through the list of equivalents of this + # table to see if one is a perl table. + foreach my $equivalent ($actual->leader->equivalents) { + next if $equivalent->property != $perl; + + # Here, have found a table for $perl. Add this alias + # to it, and are done with this prefix. + $equivalent->add_alias($proposed_name, + Pod_Entry => $make_pod_entry, + Status => $status, + Externally_Ok => $externally_ok); + trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; + next PREFIX; + } + + # Here, $perl doesn't already have a table that is a + # synonym for this property, add one. + my $added_table = $perl->add_match_table($proposed_name, + Pod_Entry => $make_pod_entry, + Status => $status, + Externally_Ok => $externally_ok); + # And it will be related to the actual table, since it is + # based on it. + $added_table->set_equivalent_to($actual, Related => 1); + trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace; + next; + } # End of no pre-existing. + + # Here, there is a pre-existing table that has the proposed + # name. We could be in trouble, but not if this is just a + # synonym for another table that we have already made a child + # of the pre-existing one. + if ($pre_existing->is_equivalent_to($actual)) { + trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace; + $pre_existing->add_alias($proposed_name); + next; + } + + # Here, there is a name collision, but it still could be ok if + # the tables match the identical set of code points, in which + # case, we can combine the names. Compare each table's code + # point list to see if they are identical. + trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace; + if ($pre_existing->matches_identically_to($actual)) { + + # Here, they do match identically. Not a real conflict. + # Make the perl version a child of the Unicode one, except + # in the non-obvious case of where the perl name is + # already a synonym of another Unicode property. (This is + # excluded by the test for it being its own parent.) The + # reason for this exclusion is that then the two Unicode + # properties become related; and we don't really know if + # they are or not. We generate documentation based on + # relatedness, and this would be misleading. Code + # later executed in the process will cause the tables to + # be represented by a single file anyway, without making + # it look in the pod like they are necessarily related. + if ($pre_existing->parent == $pre_existing + && ($pre_existing->property == $perl + || $actual->property == $perl)) + { + trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace; + $pre_existing->set_equivalent_to($actual, Related => 1); + } + elsif (main::DEBUG && $to_trace) { + trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases"; + trace $pre_existing->parent; + } + next PREFIX; + } + + # Here they didn't match identically, there is a real conflict + # between our new name and a pre-existing property. + $actual->add_conflicting($proposed_name, 'p', $pre_existing); + $pre_existing->add_conflicting($nominal->full_name, + 'p', + $actual); + + # Don't output a warning for aliases for the block + # properties (unless they start with 'In_') as it is + # expected that there will be conflicts and the block + # form loses. + if ($verbosity >= $NORMAL_VERBOSITY + && ($actual->property != $block || $prefix eq 'In_')) + { + print simple_fold(join_lines(<type != $BINARY } property_ref('*')) { + my $yes = $property->table('Yes'); + if (defined $yes) { + my $y = $property->table('Y'); + if (defined $y && $yes == $y) { + foreach my $alias ($property->aliases) { + $yes->add_conflicting($alias->name); + } + } + } + my $no = $property->table('No'); + if (defined $no) { + my $n = $property->table('N'); + if (defined $n && $no == $n) { + foreach my $alias ($property->aliases) { + $no->add_conflicting($alias->name, 'P'); + } + } + } + } + + return; +} + +sub register_file_for_name($$$) { + # Given info about a table and a datafile that it should be associated + # with, register that assocation + + my $table = shift; + my $directory_ref = shift; # Array of the directory path for the file + my $file = shift; # The file name in the final directory, [-1]. + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace; + + if ($table->isa('Property')) { + $table->set_file_path(@$directory_ref, $file); + push @map_properties, $table + if $directory_ref->[0] eq $map_directory; + return; + } + + # Do all of the work for all equivalent tables when called with the leader + # table, so skip if isn't the leader. + return if $table->leader != $table; + + # Join all the file path components together, using slashes. + my $full_filename = join('/', @$directory_ref, $file); + + # All go in the same subdirectory of unicore + if ($directory_ref->[0] ne $matches_directory) { + Carp::my_carp("Unexpected directory in " + . join('/', @{$directory_ref}, $file)); + } + + # For this table and all its equivalents ... + foreach my $table ($table, $table->equivalents) { + + # Associate it with its file internally. Don't include the + # $matches_directory first component + $table->set_file_path(@$directory_ref, $file); + my $sub_filename = join('/', $directory_ref->[1, -1], $file); + + my $property = $table->property; + $property = ($property == $perl) + ? "" # 'perl' is never explicitly stated + : standardize($property->name) . '='; + + my $deprecated = ($table->status eq $DEPRECATED) + ? $table->status_info + : ""; + + # And for each of the table's aliases... This inner loop eventually + # goes through all aliases in the UCD that we generate regex match + # files for + foreach my $alias ($table->aliases) { + my $name = $alias->name; + + # Generate an entry in either the loose or strict hashes, which + # will translate the property and alias names combination into the + # file where the table for them is stored. + my $standard; + if ($alias->loose_match) { + $standard = $property . standardize($alias->name); + if (exists $loose_to_file_of{$standard}) { + Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); + } + else { + $loose_to_file_of{$standard} = $sub_filename; + } + } + else { + $standard = lc ($property . $name); + if (exists $stricter_to_file_of{$standard}) { + Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); + } + else { + $stricter_to_file_of{$standard} = $sub_filename; + + # Tightly coupled with how utf8_heavy.pl works, for a + # floating point number that is a whole number, get rid of + # the trailing decimal point and 0's, so that utf8_heavy + # will work. Also note that this assumes that such a + # number is matched strictly; so if that were to change, + # this would be wrong. + if ((my $integer_name = $name) + =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) + { + $stricter_to_file_of{$property . $integer_name} + = $sub_filename; + } + } + } + + # Keep a list of the deprecated properties and their filenames + if ($deprecated) { + $utf8::why_deprecated{$sub_filename} = $deprecated; + } + } + } + + return; +} + +{ # Closure + my %base_names; # Names already used for avoiding DOS 8.3 filesystem + # conflicts + my %full_dir_name_of; # Full length names of directories used. + + sub construct_filename($$$) { + # Return a file name for a table, based on the table name, but perhaps + # changed to get rid of non-portable characters in it, and to make + # sure that it is unique on a file system that allows the names before + # any period to be at most 8 characters (DOS). While we're at it + # check and complain if there are any directory conflicts. + + my $name = shift; # The name to start with + my $mutable = shift; # Boolean: can it be changed? If no, but + # yet it must be to work properly, a warning + # is given + my $directories_ref = shift; # A reference to an array containing the + # path to the file, with each element one path + # component. This is used because the same + # name can be used in different directories. + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $warn = ! defined wantarray; # If true, then if the name is + # changed, a warning is issued as well. + + if (! defined $name) { + Carp::my_carp("Undefined name in directory " + . File::Spec->join(@$directories_ref) + . ". '_' used"); + return '_'; + } + + # Make sure that no directory names conflict with each other. Look at + # each directory in the input file's path. If it is already in use, + # assume it is correct, and is merely being re-used, but if we + # truncate it to 8 characters, and find that there are two directories + # that are the same for the first 8 characters, but differ after that, + # then that is a problem. + foreach my $directory (@$directories_ref) { + my $short_dir = substr($directory, 0, 8); + if (defined $full_dir_name_of{$short_dir}) { + next if $full_dir_name_of{$short_dir} eq $directory; + Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway"); + } + else { + $full_dir_name_of{$short_dir} = $directory; + } + } + + my $path = join '/', @$directories_ref; + $path .= '/' if $path; + + # Remove interior underscores. + (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg; + + # Change any non-word character into an underscore, and truncate to 8. + $filename =~ s/\W+/_/g; # eg., "L&" -> "L_" + substr($filename, 8) = "" if length($filename) > 8; + + # Make sure the basename doesn't conflict with something we + # might have already written. If we have, say, + # InGreekExtended1 + # InGreekExtended2 + # they become + # InGreekE + # InGreek2 + my $warned = 0; + while (my $num = $base_names{$path}{lc $filename}++) { + $num++; # so basenames with numbers start with '2', which + # just looks more natural. + + # Want to append $num, but if it'll make the basename longer + # than 8 characters, pre-truncate $filename so that the result + # is acceptable. + my $delta = length($filename) + length($num) - 8; + if ($delta > 0) { + substr($filename, -$delta) = $num; + } + else { + $filename .= $num; + } + if ($warn && ! $warned) { + $warned = 1; + Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway."); + } + } + + return $filename if $mutable; + + # If not changeable, must return the input name, but warn if needed to + # change it beyond shortening it. + if ($name ne $filename + && substr($name, 0, length($filename)) ne $filename) { + Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway."); + } + return $name; + } +} + +# The pod file contains a very large table. Many of the lines in that table +# would exceed a typical output window's size, and so need to be wrapped with +# a hanging indent to make them look good. The pod language is really +# insufficient here. There is no general construct to do that in pod, so it +# is done here by beginning each such line with a space to cause the result to +# be output without formatting, and doing all the formatting here. This leads +# to the result that if the eventual display window is too narrow it won't +# look good, and if the window is too wide, no advantage is taken of that +# extra width. A further complication is that the output may be indented by +# the formatter so that there is less space than expected. What I (khw) have +# done is to assume that that indent is a particular number of spaces based on +# what it is in my Linux system; people can always resize their windows if +# necessary, but this is obviously less than desirable, but the best that can +# be expected. +my $automatic_pod_indent = 8; + +# Try to format so that uses fewest lines, but few long left column entries +# slide into the right column. An experiment on 5.1 data yielded the +# following percentages that didn't cut into the other side along with the +# associated first-column widths +# 69% = 24 +# 80% not too bad except for a few blocks +# 90% = 33; # , cuts 353/3053 lines from 37 = 12% +# 95% = 37; +my $indent_info_column = 27; # 75% of lines didn't have overlap + +my $FILLER = 3; # Length of initial boiler-plate columns in a pod line + # The 3 is because of: + # 1 for the leading space to tell the pod formatter to + # output as-is + # 1 for the flag + # 1 for the space between the flag and the main data + +sub format_pod_line ($$$;$$) { + # Take a pod line and return it, formatted properly + + my $first_column_width = shift; + my $entry = shift; # Contents of left column + my $info = shift; # Contents of right column + + my $status = shift || ""; # Any flag + + my $loose_match = shift; # Boolean. + $loose_match = 1 unless defined $loose_match; + + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $flags = ""; + $flags .= $STRICTER if ! $loose_match; + + $flags .= $status if $status; + + # There is a blank in the left column to cause the pod formatter to + # output the line as-is. + return sprintf " %-*s%-*s %s\n", + # The first * in the format is replaced by this, the -1 is + # to account for the leading blank. There isn't a + # hard-coded blank after this to separate the flags from + # the rest of the line, so that in the unlikely event that + # multiple flags are shown on the same line, they both + # will get displayed at the expense of that separation, + # but since they are left justified, a blank will be + # inserted in the normal case. + $FILLER - 1, + $flags, + + # The other * in the format is replaced by this number to + # cause the first main column to right fill with blanks. + # The -1 is for the guaranteed blank following it. + $first_column_width - $FILLER - 1, + $entry, + $info; +} + +my @zero_match_tables; # List of tables that have no matches in this release + +sub make_table_pod_entries($) { + # This generates the entries for the pod file for a given table. + # Also done at this time are any children tables. The output looks like: + # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) + + my $input_table = shift; # Table the entry is for + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Generate parent and all its children at the same time. + return if $input_table->parent != $input_table; + + my $property = $input_table->property; + my $type = $property->type; + my $full_name = $property->full_name; + + my $count = $input_table->count; + my $string_count = clarify_number($count); + my $status = $input_table->status; + my $status_info = $input_table->status_info; + + my $entry_for_first_table; # The entry for the first table output. + # Almost certainly, it is the parent. + + # For each related table (including itself), we will generate a pod entry + # for each name each table goes by + foreach my $table ($input_table, $input_table->children) { + + # utf8_heavy.pl cannot deal with null string property values, so don't + # output any. + next if $table->name eq ""; + + # First, gather all the info that applies to this table as a whole. + + push @zero_match_tables, $table if $count == 0; + + my $table_property = $table->property; + + # The short name has all the underscores removed, while the full name + # retains them. Later, we decide whether to output a short synonym + # for the full one, we need to compare apples to apples, so we use the + # short name's length including underscores. + my $table_property_short_name_length; + my $table_property_short_name + = $table_property->short_name(\$table_property_short_name_length); + my $table_property_full_name = $table_property->full_name; + + # Get how much savings there is in the short name over the full one + # (delta will always be <= 0) + my $table_property_short_delta = $table_property_short_name_length + - length($table_property_full_name); + my @table_description = $table->description; + my @table_note = $table->note; + + # Generate an entry for each alias in this table. + my $entry_for_first_alias; # saves the first one encountered. + foreach my $alias ($table->aliases) { + + # Skip if not to go in pod. + next unless $alias->make_pod_entry; + + # Start gathering all the components for the entry + my $name = $alias->name; + + my $entry; # Holds the left column, may include extras + my $entry_ref; # To refer to the left column's contents from + # another entry; has no extras + + # First the left column of the pod entry. Tables for the $perl + # property always use the single form. + if ($table_property == $perl) { + $entry = "\\p{$name}"; + $entry_ref = "\\p{$name}"; + } + else { # Compound form. + + # Only generate one entry for all the aliases that mean true + # or false in binary properties. Append a '*' to indicate + # some are missing. (The heading comment notes this.) + my $wild_card_mark; + if ($type == $BINARY) { + next if $name ne 'N' && $name ne 'Y'; + $wild_card_mark = '*'; + } + else { + $wild_card_mark = ""; + } + + # Colon-space is used to give a little more space to be easier + # to read; + $entry = "\\p{" + . $table_property_full_name + . ": $name$wild_card_mark}"; + + # But for the reference to this entry, which will go in the + # right column, where space is at a premium, use equals + # without a space + $entry_ref = "\\p{" . $table_property_full_name . "=$name}"; + } + + # Then the right (info) column. This is stored as components of + # an array for the moment, then joined into a string later. For + # non-internal only properties, begin the info with the entry for + # the first table we encountered (if any), as things are ordered + # so that that one is the most descriptive. This leads to the + # info column of an entry being a more descriptive version of the + # name column + my @info; + if ($name =~ /^_/) { + push @info, + '(For internal use by Perl, not necessarily stable)'; + } + elsif ($entry_for_first_alias) { + push @info, $entry_for_first_alias; + } + + # If this entry is equivalent to another, add that to the info, + # using the first such table we encountered + if ($entry_for_first_table) { + if (@info) { + push @info, "(= $entry_for_first_table)"; + } + else { + push @info, $entry_for_first_table; + } + } + + # If the name is a large integer, add an equivalent with an + # exponent for better readability + if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) { + push @info, sprintf "(= %.1e)", $name + } + + my $parenthesized = ""; + if (! $entry_for_first_alias) { + + # This is the first alias for the current table. The alias + # array is ordered so that this is the fullest, most + # descriptive alias, so it gets the fullest info. The other + # aliases are mostly merely pointers to this one, using the + # information already added above. + + # Display any status message, but only on the parent table + if ($status && ! $entry_for_first_table) { + push @info, $status_info; + } + + # Put out any descriptive info + if (@table_description || @table_note) { + push @info, join "; ", @table_description, @table_note; + } + + # Look to see if there is a shorter name we can point people + # at + my $standard_name = standardize($name); + my $short_name; + my $proposed_short = $table->short_name; + if (defined $proposed_short) { + my $standard_short = standardize($proposed_short); + + # If the short name is shorter than the standard one, or + # even it it's not, but the combination of it and its + # short property name (as in \p{prop=short} ($perl doesn't + # have this form)) saves at least two characters, then, + # cause it to be listed as a shorter synonym. + if (length $standard_short < length $standard_name + || ($table_property != $perl + && (length($standard_short) + - length($standard_name) + + $table_property_short_delta) # (<= 0) + < -2)) + { + $short_name = $proposed_short; + if ($table_property != $perl) { + $short_name = $table_property_short_name + . "=$short_name"; + } + $short_name = "\\p{$short_name}"; + } + } + + # And if this is a compound form name, see if there is a + # single form equivalent + my $single_form; + if ($table_property != $perl) { + + # Special case the binary N tables, so that will print + # \P{single}, but use the Y table values to populate + # 'single', as we haven't populated the N table. + my $test_table; + my $p; + if ($type == $BINARY + && $input_table == $property->table('No')) + { + $test_table = $property->table('Yes'); + $p = 'P'; + } + else { + $test_table = $input_table; + $p = 'p'; + } + + # Look for a single form amongst all the children. + foreach my $table ($test_table->children) { + next if $table->property != $perl; + my $proposed_name = $table->short_name; + next if ! defined $proposed_name; + + # Don't mention internal-only properties as a possible + # single form synonym + next if substr($proposed_name, 0, 1) eq '_'; + + $proposed_name = "\\$p\{$proposed_name}"; + if (! defined $single_form + || length($proposed_name) < length $single_form) + { + $single_form = $proposed_name; + + # The goal here is to find a single form; not the + # shortest possible one. We've already found a + # short name. So, stop at the first single form + # found, which is likely to be closer to the + # original. + last; + } + } + } + + # Ouput both short and single in the same parenthesized + # expression, but with only one of 'Single', 'Short' if there + # are both items. + if ($short_name || $single_form || $table->conflicting) { + $parenthesized .= '('; + $parenthesized .= "Short: $short_name" if $short_name; + if ($short_name && $single_form) { + $parenthesized .= ', '; + } + elsif ($single_form) { + $parenthesized .= 'Single: '; + } + $parenthesized .= $single_form if $single_form; + } + } + + + # Warn if this property isn't the same as one that a + # semi-casual user might expect. The other components of this + # parenthesized structure are calculated only for the first entry + # for this table, but the conflicting is deemed important enough + # to go on every entry. + my $conflicting = join " NOR ", $table->conflicting; + if ($conflicting) { + $parenthesized .= '(' if ! $parenthesized; + $parenthesized .= '; ' if $parenthesized ne '('; + $parenthesized .= "NOT $conflicting"; + } + $parenthesized .= ')' if $parenthesized; + + push @info, $parenthesized if $parenthesized; + + if ($table_property != $perl && $table->perl_extension) { + push @info, '(Perl extension)'; + } + push @info, "($string_count)" if $output_range_counts; + + # Now, we have both the entry and info so add them to the + # list of all the properties. + push @match_properties, + format_pod_line($indent_info_column, + $entry, + join( " ", @info), + $alias->status, + $alias->loose_match); + + $entry_for_first_alias = $entry_ref unless $entry_for_first_alias; + } # End of looping through the aliases for this table. + + if (! $entry_for_first_table) { + $entry_for_first_table = $entry_for_first_alias; + } + } # End of looping through all the related tables + return; +} + +sub pod_alphanumeric_sort { + # Sort pod entries alphanumerically. + + # The first few character columns are filler, plus the '\p{'; and get rid + # of all the trailing stuff, starting with the trailing '}', so as to sort + # on just 'Name=Value' + (my $a = lc $a) =~ s/^ .*? { //x; + $a =~ s/}.*//; + (my $b = lc $b) =~ s/^ .*? { //x; + $b =~ s/}.*//; + + # Determine if the two operands are both internal only or both not. + # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3 + # should be the underscore that begins internal only + my $a_is_internal = (substr($a, 0, 1) eq '_'); + my $b_is_internal = (substr($b, 0, 1) eq '_'); + + # Sort so the internals come last in the table instead of first (which the + # leading underscore would otherwise indicate). + if ($a_is_internal != $b_is_internal) { + return 1 if $a_is_internal; + return -1 + } + + # Determine if the two operands are numeric property values or not. + # A numeric property will look like xyz: 3. But the number + # can begin with an optional minus sign, and may have a + # fraction or rational component, like xyz: 3/2. If either + # isn't numeric, use alphabetic sort. + my ($a_initial, $a_number) = + ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); + return $a cmp $b unless defined $a_number; + my ($b_initial, $b_number) = + ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); + return $a cmp $b unless defined $b_number; + + # Here they are both numeric, but use alphabetic sort if the + # initial parts don't match + return $a cmp $b if $a_initial ne $b_initial; + + # Convert rationals to floating for the comparison. + $a_number = eval $a_number if $a_number =~ qr{/}; + $b_number = eval $b_number if $b_number =~ qr{/}; + + return $a_number <=> $b_number; +} + +sub make_pod () { + # Create the .pod file. This generates the various subsections and then + # combines them in one big HERE document. + + return unless defined $pod_directory; + print "Making pod file\n" if $verbosity >= $PROGRESS; + + my $exception_message = + '(Any exceptions are individually noted beginning with the word NOT.)'; + my @block_warning; + if (-e 'Blocks.txt') { + + # Add the line: '\p{In_*} \p{Block: *}', with the warning message + # if the global $has_In_conflicts indicates we have them. + push @match_properties, format_pod_line($indent_info_column, + '\p{In_*}', + '\p{Block: *}' + . (($has_In_conflicts) + ? " $exception_message" + : "")); + @block_warning = << "END"; + +Matches in the Block property have shortcuts that begin with 'In_'. For +example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward +compatibility, if there is no conflict with another shortcut, these may also +be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous +such conflicting shortcuts. Use of these forms for Block is discouraged, and +are flagged as such, not only because of the potential confusion as to what is +meant, but also because a later release of Unicode may preempt the shortcut, +and your program would no longer be correct. Use the 'In_' form instead to +avoid this, or even more clearly, use the compound form, e.g., +\\p{blk:latin1}. See L for more information about this. +END + } + my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)"; + $text = "$exception_message $text" if $has_Is_conflicts; + + # And the 'Is_ line'; + push @match_properties, format_pod_line($indent_info_column, + '\p{Is_*}', + "\\p{*} $text"); + + # Sort the properties array for output. It is sorted alphabetically + # except numerically for numeric properties, and only output unique lines. + @match_properties = sort pod_alphanumeric_sort uniques @match_properties; + + my $formatted_properties = simple_fold(\@match_properties, + "", + # indent succeeding lines by two extra + # which looks better + $indent_info_column + 2, + + # shorten the line length by how much + # the formatter indents, so the folded + # line will fit in the space + # presumably available + $automatic_pod_indent); + # Add column headings, indented to be a little more centered, but not + # exactly + $formatted_properties = format_pod_line($indent_info_column, + ' NAME', + ' INFO') + . "\n" + . $formatted_properties; + + # Generate pod documentation lines for the tables that match nothing + my $zero_matches; + if (@zero_match_tables) { + @zero_match_tables = uniques(@zero_match_tables); + $zero_matches = join "\n\n", + map { $_ = '=item \p{' . $_->complete_name . "}" } + sort { $a->complete_name cmp $b->complete_name } + uniques(@zero_match_tables); + + $zero_matches = <[0] cmp $why_list{$b}->[0] } + keys %why_list) + { + # Add to the output, all the properties that have that reason. Start + # with an empty line. + push @bad_re_properties, "\n\n"; + + my $has_item = 0; # Flag if actually output anything. + foreach my $name (@{$why_list{$why}}) { + + # Split compound names into $property and $table components + my $property = $name; + my $table; + if ($property =~ / (.*) = (.*) /x) { + $property = $1; + $table = $2; + } + + # This release of Unicode may not have a property that is + # suppressed, so don't reference a non-existent one. + $property = property_ref($property); + next if ! defined $property; + + # And since this list is only for match tables, don't list the + # ones that don't have match tables. + next if ! $property->to_create_match_tables; + + # Find any abbreviation, and turn it into a compound name if this + # is a property=value pair. + my $short_name = $property->name; + $short_name .= '=' . $property->table($table)->name if $table; + + # And add the property as an item for the reason. + push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; + $has_item = 1; + } + + # And add the reason under the list of properties, if such a list + # actually got generated. Note that the header got added + # unconditionally before. But pod ignores extra blank lines, so no + # harm. + push @bad_re_properties, "\n$why\n" if $has_item; + + } # End of looping through each reason. + + # Generate a list of the properties whose map table we output, from the + # global @map_properties. + my @map_tables_actually_output; + my $info_indent = 20; # Left column is narrower than \p{} table. + foreach my $property (@map_properties) { + + # Get the path to the file; don't output any not in the standard + # directory. + my @path = $property->file_path; + next if $path[0] ne $map_directory; + shift @path; # Remove the standard name + + my $file = join '/', @path; # In case is in sub directory + my $info = $property->full_name; + my $short_name = $property->name; + if ($info ne $short_name) { + $info .= " ($short_name)"; + } + foreach my $more_info ($property->description, + $property->note, + $property->status_info) + { + next unless $more_info; + $info =~ s/\.\Z//; + $info .= ". $more_info"; + } + push @map_tables_actually_output, format_pod_line($info_indent, + $file, + $info, + $property->status); + } + + # Sort alphabetically, and fold for output + @map_tables_actually_output = sort + pod_alphanumeric_sort @map_tables_actually_output; + @map_tables_actually_output + = simple_fold(\@map_tables_actually_output, + ' ', + $info_indent, + $automatic_pod_indent); + + # Generate a list of the formats that can appear in the map tables. + my @map_table_formats; + foreach my $format (sort keys %map_table_formats) { + push @map_table_formats, " $format $map_table_formats{$format}\n"; + } + + # Everything is ready to assemble. + my @OUT = << "END"; +=begin comment + +$HEADER + +To change this file, edit $0 instead. + +=end comment + +=head1 NAME + +$pod_file - Index of Unicode Version $string_version properties in Perl + +=head1 DESCRIPTION + +There are many properties in Unicode, and Perl provides access to almost all of +them, as well as some additional extensions and short-cut synonyms. + +And just about all of the few that aren't accessible through the Perl +core are accessible through the modules: Unicode::Normalize and +Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan. + +This document merely lists all available properties and does not attempt to +explain what each property really means. There is a brief description of each +Perl extension. There is some detail about Blocks, Scripts, General_Category, +and Bidi_Class in L, but to find out about the intricacies of the +Unicode properties, refer to the Unicode standard. A good starting place is +L<$unicode_reference_url>. More information on the Perl extensions is in +L. + +Note that you can define your own properties; see +L. + +=head1 Properties accessible through \\p{} and \\P{} + +The Perl regular expression \\p{} and \\P{} constructs give access to most of +the Unicode character properties. The table below shows all these constructs, +both single and compound forms. + +B consist of two components, separated by an equals sign or a +colon. The first component is the property name, and the second component is +the particular value of the property to match against, for example, +'\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters +whose Script property is Greek. + +B, like '\\p{Greek}', are mostly Perl-defined shortcuts for +their equivalent compound forms. The table shows these equivalences. (In our +example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.) +There are also a few Perl-defined single forms that are not shortcuts for a +compound form. One such is \\p{Word}. These are also listed in the table. + +In parsing these constructs, Perl always ignores Upper/lower case differences +everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as +'\\p{greek}'. But note that changing the case of the 'p' or 'P' before the +left brace completely changes the meaning of the construct, from "match" (for +'\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for +improved legibility. + +Also, white space, hyphens, and underscores are also normally ignored +everywhere between the {braces}, and hence can be freely added or removed +even if the C modifier hasn't been specified on the regular expression. +But $a_bold_stricter at the beginning of an entry in the table below +means that tighter (stricter) rules are used for that entry: + +=over 4 + +=item Single form (\\p{name}) tighter rules: + +White space, hyphens, and underscores ARE significant +except for: + +=over 4 + +=item * white space adjacent to a non-word character + +=item * underscores separating digits in numbers + +=back + +That means, for example, that you can freely add or remove white space +adjacent to (but within) the braces without affecting the meaning. + +=item Compound form (\\p{name=value} or \\p{name:value}) tighter rules: + +The tighter rules given above for the single form apply to everything to the +right of the colon or equals; the looser rules still apply to everything to +the left. + +That means, for example, that you can freely add or remove white space +adjacent to (but within) the braces and the colon or equal sign. + +=back + +Some properties are considered obsolete, but still available. There are +several varieties of obsolesence: + +=over 4 + +=item Obsolete + +Properties marked with $a_bold_obsolete in the table are considered +obsolete. At the time of this writing (Unicode version 5.2) there is no +information in the Unicode standard about the implications of a property being +obsolete. + +=item Stabilized + +Obsolete properties may be stabilized. This means that they are not actively +maintained by Unicode, and will not be extended as new characters are added to +the standard. Such properties are marked with $a_bold_stabilized in the +table. At the time of this writing (Unicode version 5.2) there is no further +information in the Unicode standard about the implications of a property being +stabilized. + +=item Deprecated + +Obsolete properties may be deprecated. This means that their use is strongly +discouraged, so much so that a warning will be issued if used, unless the +regular expression is in the scope of a C> +statement. $A_bold_deprecated flags each such entry in the table, and +the entry there for the longest, most descriptive version of the property will +give the reason it is deprecated, and perhaps advice. Perl may issue such a +warning, even for properties that aren't officially deprecated by Unicode, +when there used to be characters or code points that were matched by them, but +no longer. This is to warn you that your program may not work like it did on +earlier Unicode releases. + +A deprecated property may be made unavailable in a future Perl version, so it +is best to move away from them. + +=back + +Some Perl extensions are present for backwards compatibility and are +discouraged from being used, but not obsolete. $A_bold_discouraged +flags each such entry in the table. + +@block_warning + +The table below has two columns. The left column contains the \\p{} +constructs to look up, possibly preceeded by the flags mentioned above; and +the right column contains information about them, like a description, or +synonyms. It shows both the single and compound forms for each property that +has them. If the left column is a short name for a property, the right column +will give its longer, more descriptive name; and if the left column is the +longest name, the right column will show any equivalent shortest name, in both +single and compound forms if applicable. + +The right column will also caution you if a property means something different +than what might normally be expected. + +All single forms are Perl extensions; a few compound forms are as well, and +are noted as such. + +Numbers in (parentheses) indicate the total number of code points matched by +the property. For emphasis, those properties that match no code points at all +are listed as well in a separate section following the table. + +There is no description given for most non-Perl defined properties (See +$unicode_reference_url for that). + +For compactness, 'B<*>' is used as a wildcard instead of showing all possible +combinations. For example, entries like: + + \\p{Gc: *} \\p{General_Category: *} + +mean that 'Gc' is a synonym for 'General_Category', and anything that is valid +for the latter is also valid for the former. Similarly, + + \\p{Is_*} \\p{*} + +means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and +\\p{IsFoo} are also valid and all mean the same thing. And similarly, +\\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here +is restricted to something not beginning with an underscore. + +Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. +And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and +'N*' to indicate this, and doesn't have separate entries for the other +possibilities. Note that not all properties which have values 'Yes' and 'No' +are binary, and they have all their values spelled out without using this wild +card, and a C clause in their description that highlights their not being +binary. These also require the compound form to match them, whereas true +binary properties have both single and compound forms available. + +Note that all non-essential underscores are removed in the display of the +short names below. + +B + +=over 4 + +=item B<*> is a wild-card + +=item B<(\\d+)> in the info column gives the number of code points matched by +this property. + +=item B<$DEPRECATED> means this is deprecated. + +=item B<$OBSOLETE> means this is obsolete. + +=item B<$STABILIZED> means this is stabilized. + +=item B<$STRICTER> means tighter (stricter) name matching applies. + +=item B<$DISCOURAGED> means use of this form is discouraged. + +=back + +$formatted_properties + +$zero_matches + +=head1 Properties not accessible through \\p{} and \\P{} + +A few properties are accessible in Perl via various function calls only. +These are: + Lowercase_Mapping lc() and lcfirst() + Titlecase_Mapping ucfirst() + Uppercase_Mapping uc() + +Case_Folding is accessible through the /i modifier in regular expressions. + +The Name property is accessible through the \\N{} interpolation in +double-quoted strings and regular expressions, but both usages require a C to be specified, which also contains related functions viacode(), +vianame(), and string_vianame(). + +=head1 Unicode regular expression properties that are NOT accepted by Perl + +Perl will generate an error for a few character properties in Unicode when +used in a regular expression. The non-Unihan ones are listed below, with the +reasons they are not accepted, perhaps with work-arounds. The short names for +the properties are listed enclosed in (parentheses). + +=over 4 + +@bad_re_properties + +=back + +An installation can choose to allow any of these to be matched by changing the +controlling lists contained in the program C<\$Config{privlib}>/F +and then re-running F<$0>. (C<\%Config> is available from the Config module). + +=head1 Files in the I directory (for serious hackers only) + +All Unicode properties are really mappings (in the mathematical sense) from +code points to their respective values. As part of its build process, +Perl constructs tables containing these mappings for all properties that it +deals with. But only a few of these are written out into files. +Those written out are in the directory C<\$Config{privlib}>/F +(%Config is available from the Config module). + +Those ones written are ones needed by Perl internally during execution, or for +which there is some demand, and those for which there is no access through the +Perl core. Generally, properties that can be used in regular expression +matching do not have their map tables written, like Script. Nor are the +simplistic properties that have a better, more complete version, such as +Simple_Uppercase_Mapping (Uppercase_Mapping is written instead). + +None of the properties in the I directory are currently directly +accessible through the Perl core, although some may be accessed indirectly. +For example, the uc() function implements the Uppercase_Mapping property and +uses the F file found in this directory. + +The available files with their properties (short names in parentheses), +and any flags or comments about them, are: + +@map_tables_actually_output + +An installation can choose to change which files are generated by changing the +controlling lists contained in the program C<\$Config{privlib}>/F +and then re-running F<$0>. + +Each of these files defines two hash entries to help reading programs decipher +it. One of them looks like this: + + \$utf8::SwashInfo{'ToNAME'}{'format'} = 's'; + +where 'NAME' is a name to indicate the property. For backwards compatibility, +this is not necessarily the property's official Unicode name. (The 'To' is +also for backwards compatibility.) The hash entry gives the format of the +mapping fields of the table, currently one of the following: + + @map_table_formats + +This format applies only to the entries in the main body of the table. +Entries defined in hashes or ones that are missing from the list can have a +different format. + +The value that the missing entries have is given by the other SwashInfo hash +entry line; it looks like this: + + \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN'; + +This example line says that any Unicode code points not explicitly listed in +the file have the value 'NaN' under the property indicated by NAME. If the +value is the special string C<< >>, it means that the value for +any missing code point is the code point itself. This happens, for example, +in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the +character 'A', are missing because the uppercase of 'A' is itself. + +=head1 SEE ALSO + +L<$unicode_reference_url> + +L + +L + +END + + # And write it. + main::write([ $pod_directory, "$pod_file.pod" ], @OUT); + return; +} + +sub make_Heavy () { + # Create and write Heavy.pl, which passes info about the tables to + # utf8_heavy.pl + + my @heavy = <file } property_ref('*')) { + my $type = $property->type; + + # And for each table for that property, starting with the mapping + # table for it ... + TABLE: + foreach my $table($property, + + # and all the match tables for it (if any), sorted so + # the ones with the shortest associated file name come + # first. The length sorting prevents problems of a + # longer file taking a name that might have to be used + # by a shorter one. The alphabetic sorting prevents + # differences between releases + sort { my $ext_a = $a->external_name; + return 1 if ! defined $ext_a; + my $ext_b = $b->external_name; + return -1 if ! defined $ext_b; + my $cmp = length $ext_a <=> length $ext_b; + + # Return result if lengths not equal + return $cmp if $cmp; + + # Alphabetic if lengths equal + return $ext_a cmp $ext_b + } $property->tables + ) + { + + # Here we have a table associated with a property. It could be + # the map table (done first for each property), or one of the + # other tables. Determine which type. + my $is_property = $table->isa('Property'); + + my $name = $table->name; + my $complete_name = $table->complete_name; + + # See if should suppress the table if is empty, but warn if it + # contains something. + my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ } + keys %why_suppress_if_empty_warn_if_not; + + # Calculate if this table should have any code points associated + # with it or not. + my $expected_empty = + + # $perl should be empty, as well as properties that we just + # don't do anything with + ($is_property + && ($table == $perl + || grep { $complete_name eq $_ } + @unimplemented_properties + ) + ) + + # Match tables in properties we skipped populating should be + # empty + || (! $is_property && ! $property->to_create_match_tables) + + # Tables and properties that are expected to have no code + # points should be empty + || $suppress_if_empty_warn_if_not + ; + + # Set a boolean if this table is the complement of an empty binary + # table + my $is_complement_of_empty_binary = + $type == $BINARY && + (($table == $property->table('Y') + && $property->table('N')->is_empty) + || ($table == $property->table('N') + && $property->table('Y')->is_empty)); + + + # Some tables should match everything + my $expected_full = + ($is_property) + ? # All these types of map tables will be full because + # they will have been populated with defaults + ($type == $ENUM || $type == $BINARY) + + : # A match table should match everything if its method + # shows it should + ($table->matches_all + + # The complement of an empty binary table will match + # everything + || $is_complement_of_empty_binary + ) + ; + + if ($table->is_empty) { + + + if ($suppress_if_empty_warn_if_not) { + $table->set_status($SUPPRESSED, + $why_suppress_if_empty_warn_if_not{$complete_name}); + } + + # Suppress expected empty tables. + next TABLE if $expected_empty; + + # And setup to later output a warning for those that aren't + # known to be allowed to be empty. Don't do the warning if + # this table is a child of another one to avoid duplicating + # the warning that should come from the parent one. + if (($table == $property || $table->parent == $table) + && $table->status ne $SUPPRESSED + && ! grep { $complete_name =~ /^$_$/ } + @tables_that_may_be_empty) + { + push @unhandled_properties, "$table"; + } + } + elsif ($expected_empty) { + my $because = ""; + if ($suppress_if_empty_warn_if_not) { + $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}"; + } + + Carp::my_carp("Not expecting property $table$because. Generating file for it anyway."); + } + + my $count = $table->count; + if ($expected_full) { + if ($count != $MAX_UNICODE_CODEPOINTS) { + Carp::my_carp("$table matches only " + . clarify_number($count) + . " Unicode code points but should match " + . clarify_number($MAX_UNICODE_CODEPOINTS) + . " (off by " + . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count)) + . "). Proceeding anyway."); + } + + # Here is expected to be full. If it is because it is the + # complement of an (empty) binary table that is to be + # suppressed, then suppress this one as well. + if ($is_complement_of_empty_binary) { + my $opposing_name = ($name eq 'Y') ? 'N' : 'Y'; + my $opposing = $property->table($opposing_name); + my $opposing_status = $opposing->status; + if ($opposing_status) { + $table->set_status($opposing_status, + $opposing->status_info); + } + } + } + elsif ($count == $MAX_UNICODE_CODEPOINTS) { + if ($table == $property || $table->leader == $table) { + Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); + } + } + + if ($table->status eq $SUPPRESSED) { + if (! $is_property) { + my @children = $table->children; + foreach my $child (@children) { + if ($child->status ne $SUPPRESSED) { + Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't"); + } + } + } + next TABLE; + + } + if (! $is_property) { + + # Several things need to be done just once for each related + # group of match tables. Do them on the parent. + if ($table->parent == $table) { + + # Add an entry in the pod file for the table; it also does + # the children. + make_table_pod_entries($table) if defined $pod_directory; + + # See if the the table matches identical code points with + # something that has already been output. In that case, + # no need to have two files with the same code points in + # them. We use the table's hash() method to store these + # in buckets, so that it is quite likely that if two + # tables are in the same bucket they will be identical, so + # don't have to compare tables frequently. The tables + # have to have the same status to share a file, so add + # this to the bucket hash. (The reason for this latter is + # that Heavy.pl associates a status with a file.) + my $hash = $table->hash . ';' . $table->status; + + # Look at each table that is in the same bucket as this + # one would be. + foreach my $comparison (@{$match_tables_to_write{$hash}}) + { + if ($table->matches_identically_to($comparison)) { + $table->set_equivalent_to($comparison, + Related => 0); + next TABLE; + } + } + + # Here, not equivalent, add this table to the bucket. + push @{$match_tables_to_write{$hash}}, $table; + } + } + else { + + # Here is the property itself. + # Don't write out or make references to the $perl property + next if $table == $perl; + + if ($type != $STRING) { + + # There is a mapping stored of the various synonyms to the + # standardized name of the property for utf8_heavy.pl. + # Also, the pod file contains entries of the form: + # \p{alias: *} \p{full: *} + # rather than show every possible combination of things. + + my @property_aliases = $property->aliases; + + # The full name of this property is stored by convention + # first in the alias array + my $full_property_name = + '\p{' . $property_aliases[0]->name . ': *}'; + my $standard_property_name = standardize($table->name); + + # For each synonym ... + for my $i (0 .. @property_aliases - 1) { + my $alias = $property_aliases[$i]; + my $alias_name = $alias->name; + my $alias_standard = standardize($alias_name); + + # Set the mapping for utf8_heavy of the alias to the + # property + if (exists ($loose_property_name_of{$alias_standard})) + { + Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained"); + } + else { + $loose_property_name_of{$alias_standard} + = $standard_property_name; + } + + # Now for the pod entry for this alias. Skip if not + # outputting a pod; skip the first one, which is the + # full name so won't have an entry like: '\p{full: *} + # \p{full: *}', and skip if don't want an entry for + # this one. + next if $i == 0 + || ! defined $pod_directory + || ! $alias->make_pod_entry; + + my $rhs = $full_property_name; + if ($property != $perl && $table->perl_extension) { + $rhs .= ' (Perl extension)'; + } + push @match_properties, + format_pod_line($indent_info_column, + '\p{' . $alias->name . ': *}', + $rhs, + $alias->status); + } + } # End of non-string-like property code + + + # Don't output a mapping file if not desired. + next if ! $property->to_output_map; + } + + # Here, we know we want to write out the table, but don't do it + # yet because there may be other tables that come along and will + # want to share the file, and the file's comments will change to + # mention them. So save for later. + push @writables, $table; + + } # End of looping through the property and all its tables. + } # End of looping through all properties. + + # Now have all the tables that will have files written for them. Do it. + foreach my $table (@writables) { + my @directory; + my $filename; + my $property = $table->property; + my $is_property = ($table == $property); + if (! $is_property) { + + # Match tables for the property go in lib/$subdirectory, which is + # the property's name. Don't use the standard file name for this, + # as may get an unfamiliar alias + @directory = ($matches_directory, $property->external_name); + } + else { + + @directory = $table->directory; + $filename = $table->file; + } + + # Use specified filename if avaliable, or default to property's + # shortest name. We need an 8.3 safe filename (which means "an 8 + # safe" filename, since after the dot is only 'pl', which is < 3) + # The 2nd parameter is if the filename shouldn't be changed, and + # it shouldn't iff there is a hard-coded name for this table. + $filename = construct_filename( + $filename || $table->external_name, + ! $filename, # mutable if no filename + \@directory); + + register_file_for_name($table, \@directory, $filename); + + # Only need to write one file when shared by more than one + # property + next if ! $is_property && $table->leader != $table; + + # Construct a nice comment to add to the file + $table->set_final_comment; + + $table->write; + } + + + # Write out the pod file + make_pod; + + # And Heavy.pl + make_Heavy; + + make_property_test_script() if $make_test_script; + return; +} + +my @white_space_separators = ( # This used only for making the test script. + "", + ' ', + "\t", + ' ' + ); + +sub generate_separator($) { + # This used only for making the test script. It generates the colon or + # equal separator between the property and property value, with random + # white space surrounding the separator + + my $lhs = shift; + + return "" if $lhs eq ""; # No separator if there's only one (the r) side + + # Choose space before and after randomly + my $spaces_before =$white_space_separators[rand(@white_space_separators)]; + my $spaces_after = $white_space_separators[rand(@white_space_separators)]; + + # And return the whole complex, half the time using a colon, half the + # equals + return $spaces_before + . (rand() < 0.5) ? '=' : ':' + . $spaces_after; +} + +sub generate_tests($$$$$) { + # This used only for making the test script. It generates test cases that + # are expected to compile successfully in perl. Note that the lhs and + # rhs are assumed to already be as randomized as the caller wants. + + my $lhs = shift; # The property: what's to the left of the colon + # or equals separator + my $rhs = shift; # The property value; what's to the right + my $valid_code = shift; # A code point that's known to be in the + # table given by lhs=rhs; undef if table is + # empty + my $invalid_code = shift; # A code point known to not be in the table; + # undef if the table is all code points + my $warning = shift; + + # Get the colon or equal + my $separator = generate_separator($lhs); + + # The whole 'property=value' + my $name = "$lhs$separator$rhs"; + + my @output; + # Create a complete set of tests, with complements. + if (defined $valid_code) { + push @output, <<"EOC" +Expect(1, $valid_code, '\\p{$name}', $warning); +Expect(0, $valid_code, '\\p{^$name}', $warning); +Expect(0, $valid_code, '\\P{$name}', $warning); +Expect(1, $valid_code, '\\P{^$name}', $warning); +EOC + } + if (defined $invalid_code) { + push @output, <<"EOC" +Expect(0, $invalid_code, '\\p{$name}', $warning); +Expect(1, $invalid_code, '\\p{^$name}', $warning); +Expect(1, $invalid_code, '\\P{$name}', $warning); +Expect(0, $invalid_code, '\\P{^$name}', $warning); +EOC + } + return @output; +} + +sub generate_error($$$) { + # This used only for making the test script. It generates test cases that + # are expected to not only not match, but to be syntax or similar errors + + my $lhs = shift; # The property: what's to the left of the + # colon or equals separator + my $rhs = shift; # The property value; what's to the right + my $already_in_error = shift; # Boolean; if true it's known that the + # unmodified lhs and rhs will cause an error. + # This routine should not force another one + # Get the colon or equal + my $separator = generate_separator($lhs); + + # Since this is an error only, don't bother to randomly decide whether to + # put the error on the left or right side; and assume that the rhs is + # loosely matched, again for convenience rather than rigor. + $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error; + + my $property = $lhs . $separator . $rhs; + + return <<"EOC"; +Error('\\p{$property}'); +Error('\\P{$property}'); +EOC +} + +# These are used only for making the test script +# XXX Maybe should also have a bad strict seps, which includes underscore. + +my @good_loose_seps = ( + " ", + "-", + "\t", + "", + "_", + ); +my @bad_loose_seps = ( + "/a/", + ':=', + ); + +sub randomize_stricter_name { + # This used only for making the test script. Take the input name and + # return a randomized, but valid version of it under the stricter matching + # rules. + + my $name = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # If the name looks like a number (integer, floating, or rational), do + # some extra work + if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) { + my $sign = $1; + my $number = $2; + my $separator = $3; + + # If there isn't a sign, part of the time add a plus + # Note: Not testing having any denominator having a minus sign + if (! $sign) { + $sign = '+' if rand() <= .3; + } + + # And add 0 or more leading zeros. + $name = $sign . ('0' x int rand(10)) . $number; + + if (defined $separator) { + my $extra_zeros = '0' x int rand(10); + + if ($separator eq '.') { + + # Similarly, add 0 or more trailing zeros after a decimal + # point + $name .= $extra_zeros; + } + else { + + # Or, leading zeros before the denominator + $name =~ s,/,/$extra_zeros,; + } + } + } + + # For legibility of the test, only change the case of whole sections at a + # time. To do this, first split into sections. The split returns the + # delimiters + my @sections; + for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) { + trace $section if main::DEBUG && $to_trace; + + if (length $section > 1 && $section !~ /\D/) { + + # If the section is a sequence of digits, about half the time + # randomly add underscores between some of them. + if (rand() > .5) { + + # Figure out how many underscores to add. max is 1 less than + # the number of digits. (But add 1 at the end to make sure + # result isn't 0, and compensate earlier by subtracting 2 + # instead of 1) + my $num_underscores = int rand(length($section) - 2) + 1; + + # And add them evenly throughout, for convenience, not rigor + use integer; + my $spacing = (length($section) - 1)/ $num_underscores; + my $temp = $section; + $section = ""; + for my $i (1 .. $num_underscores) { + $section .= substr($temp, 0, $spacing, "") . '_'; + } + $section .= $temp; + } + push @sections, $section; + } + else { + + # Here not a sequence of digits. Change the case of the section + # randomly + my $switch = int rand(4); + if ($switch == 0) { + push @sections, uc $section; + } + elsif ($switch == 1) { + push @sections, lc $section; + } + elsif ($switch == 2) { + push @sections, ucfirst $section; + } + else { + push @sections, $section; + } + } + } + trace "returning", join "", @sections if main::DEBUG && $to_trace; + return join "", @sections; +} + +sub randomize_loose_name($;$) { + # This used only for making the test script + + my $name = shift; + my $want_error = shift; # if true, make an error + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + $name = randomize_stricter_name($name); + + my @parts; + push @parts, $good_loose_seps[rand(@good_loose_seps)]; + for my $part (split /[-\s_]+/, $name) { + if (@parts) { + if ($want_error and rand() < 0.3) { + push @parts, $bad_loose_seps[rand(@bad_loose_seps)]; + $want_error = 0; + } + else { + push @parts, $good_loose_seps[rand(@good_loose_seps)]; + } + } + push @parts, $part; + } + my $new = join("", @parts); + trace "$name => $new" if main::DEBUG && $to_trace; + + if ($want_error) { + if (rand() >= 0.5) { + $new .= $bad_loose_seps[rand(@bad_loose_seps)]; + } + else { + $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new; + } + } + return $new; +} + +# Used to make sure don't generate duplicate test cases. +my %test_generated; + +sub make_property_test_script() { + # This used only for making the test script + # this written directly -- it's huge. + + print "Making test script\n" if $verbosity >= $PROGRESS; + + # This uses randomness to test different possibilities without testing all + # possibilities. To ensure repeatability, set the seed to 0. But if + # tests are added, it will perturb all later ones in the .t file + srand 0; + + $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name + + # Keep going down an order of magnitude + # until find that adding this quantity to + # 1 remains 1; but put an upper limit on + # this so in case this algorithm doesn't + # work properly on some platform, that we + # won't loop forever. + my $digits = 0; + my $min_floating_slop = 1; + while (1+ $min_floating_slop != 1 + && $digits++ < 50) + { + my $next = $min_floating_slop / 10; + last if $next == 0; # If underflows, + # use previous one + $min_floating_slop = $next; + } + + # It doesn't matter whether the elements of this array contain single lines + # or multiple lines. main::write doesn't count the lines. + my @output; + + foreach my $property (property_ref('*')) { + foreach my $table ($property->tables) { + + # Find code points that match, and don't match this table. + my $valid = $table->get_valid_code_point; + my $invalid = $table->get_invalid_code_point; + my $warning = ($table->status eq $DEPRECATED) + ? "'deprecated'" + : '""'; + + # Test each possible combination of the property's aliases with + # the table's. If this gets to be too many, could do what is done + # in the set_final_comment() for Tables + my @table_aliases = $table->aliases; + my @property_aliases = $table->property->aliases; + my $max = max(scalar @table_aliases, scalar @property_aliases); + for my $j (0 .. $max - 1) { + + # The current alias for property is the next one on the list, + # or if beyond the end, start over. Similarly for table + my $property_name + = $property_aliases[$j % @property_aliases]->name; + + $property_name = "" if $table->property == $perl; + my $table_alias = $table_aliases[$j % @table_aliases]; + my $table_name = $table_alias->name; + my $loose_match = $table_alias->loose_match; + + # If the table doesn't have a file, any test for it is + # already guaranteed to be in error + my $already_error = ! $table->file_path; + + # Generate error cases for this alias. + push @output, generate_error($property_name, + $table_name, + $already_error); + + # If the table is guaranteed to always generate an error, + # quit now without generating success cases. + next if $already_error; + + # Now for the success cases. + my $random; + if ($loose_match) { + + # For loose matching, create an extra test case for the + # standard name. + my $standard = standardize($table_name); + + # $test_name should be a unique combination for each test + # case; used just to avoid duplicate tests + my $test_name = "$property_name=$standard"; + + # Don't output duplicate test cases. + if (! exists $test_generated{$test_name}) { + $test_generated{$test_name} = 1; + push @output, generate_tests($property_name, + $standard, + $valid, + $invalid, + $warning, + ); + } + $random = randomize_loose_name($table_name) + } + else { # Stricter match + $random = randomize_stricter_name($table_name); + } + + # Now for the main test case for this alias. + my $test_name = "$property_name=$random"; + if (! exists $test_generated{$test_name}) { + $test_generated{$test_name} = 1; + push @output, generate_tests($property_name, + $random, + $valid, + $invalid, + $warning, + ); + + # If the name is a rational number, add tests for the + # floating point equivalent. + if ($table_name =~ qr{/}) { + + # Calculate the float, and find just the fraction. + my $float = eval $table_name; + my ($whole, $fraction) + = $float =~ / (.*) \. (.*) /x; + + # Starting with one digit after the decimal point, + # create a test for each possible precision (number of + # digits past the decimal point) until well beyond the + # native number found on this machine. (If we started + # with 0 digits, it would be an integer, which could + # well match an unrelated table) + PLACE: + for my $i (1 .. $min_floating_slop + 3) { + my $table_name = sprintf("%.*f", $i, $float); + if ($i < $MIN_FRACTION_LENGTH) { + + # If the test case has fewer digits than the + # minimum acceptable precision, it shouldn't + # succeed, so we expect an error for it. + # E.g., 2/3 = .7 at one decimal point, and we + # shouldn't say it matches .7. We should make + # it be .667 at least before agreeing that the + # intent was to match 2/3. But at the + # less-than- acceptable level of precision, it + # might actually match an unrelated number. + # So don't generate a test case if this + # conflating is possible. In our example, we + # don't want 2/3 matching 7/10, if there is + # a 7/10 code point. + for my $existing + (keys %nv_floating_to_rational) + { + next PLACE + if abs($table_name - $existing) + < $MAX_FLOATING_SLOP; + } + push @output, generate_error($property_name, + $table_name, + 1 # 1 => already an error + ); + } + else { + + # Here the number of digits exceeds the + # minimum we think is needed. So generate a + # success test case for it. + push @output, generate_tests($property_name, + $table_name, + $valid, + $invalid, + $warning, + ); + } + } + } + } + } + } + } + + &write($t_path, [, + @output, + (map {"Test_X('$_');\n"} @backslash_X_tests), + "Finished();\n"]); + return; +} + +# This is a list of the input files and how to handle them. The files are +# processed in their order in this list. Some reordering is possible if +# desired, but the v0 files should be first, and the extracted before the +# others except DAge.txt (as data in an extracted file can be over-ridden by +# the non-extracted. Some other files depend on data derived from an earlier +# file, like UnicodeData requires data from Jamo, and the case changing and +# folding requires data from Unicode. Mostly, it safest to order by first +# version releases in (except the Jamo). DAge.txt is read before the +# extracted ones because of the rarely used feature $compare_versions. In the +# unlikely event that there were ever an extracted file that contained the Age +# property information, it would have to go in front of DAge. +# +# The version strings allow the program to know whether to expect a file or +# not, but if a file exists in the directory, it will be processed, even if it +# is in a version earlier than expected, so you can copy files from a later +# release into an earlier release's directory. +my @input_file_objects = ( + Input_file->new('PropertyAliases.txt', v0, + Handler => \&process_PropertyAliases, + ), + Input_file->new(undef, v0, # No file associated with this + Progress_Message => 'Finishing property setup', + Handler => \&finish_property_setup, + ), + Input_file->new('PropValueAliases.txt', v0, + Handler => \&process_PropValueAliases, + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new('DAge.txt', v3.2.0, + Has_Missings_Defaults => $NOT_IGNORED, + Property => 'Age' + ), + Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, + Property => 'General_Category', + ), + Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, + Property => 'Canonical_Combining_Class', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, + Property => 'Numeric_Type', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, + Property => 'East_Asian_Width', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, + Property => 'Line_Break', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, + Property => 'Bidi_Class', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, + Property => 'Decomposition_Type', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), + Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, + Property => 'Numeric_Value', + Each_Line_Handler => \&filter_numeric_value_line, + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, + Property => 'Joining_Group', + Has_Missings_Defaults => $NOT_IGNORED, + ), + + Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0, + Property => 'Joining_Type', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new('Jamo.txt', v2.0.0, + Property => 'Jamo_Short_Name', + Each_Line_Handler => \&filter_jamo_line, + ), + Input_file->new('UnicodeData.txt', v1.1.5, + Pre_Handler => \&setup_UnicodeData, + + # We clean up this file for some early versions. + Each_Line_Handler => [ (($v_version lt v2.0.0 ) + ? \&filter_v1_ucd + : ($v_version eq v2.1.5) + ? \&filter_v2_1_5_ucd + : undef), + + # And the main filter + \&filter_UnicodeData_line, + ], + EOF_Handler => \&EOF_UnicodeData, + ), + Input_file->new('ArabicShaping.txt', v2.0.0, + Each_Line_Handler => + [ ($v_version lt 4.1.0) + ? \&filter_old_style_arabic_shaping + : undef, + \&filter_arabic_shaping_line, + ], + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new('Blocks.txt', v2.0.0, + Property => 'Block', + Has_Missings_Defaults => $NOT_IGNORED, + Each_Line_Handler => \&filter_blocks_lines + ), + Input_file->new('PropList.txt', v2.0.0, + Each_Line_Handler => (($v_version lt v3.1.0) + ? \&filter_old_style_proplist + : undef), + ), + Input_file->new('Unihan.txt', v2.0.0, + Pre_Handler => \&setup_unihan, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('SpecialCasing.txt', v2.1.8, + Each_Line_Handler => \&filter_special_casing_line, + Pre_Handler => \&setup_special_casing, + ), + Input_file->new( + 'LineBreak.txt', v3.0.0, + Has_Missings_Defaults => $NOT_IGNORED, + Property => 'Line_Break', + # Early versions had problematic syntax + Each_Line_Handler => (($v_version lt v3.1.0) + ? \&filter_early_ea_lb + : undef), + ), + Input_file->new('EastAsianWidth.txt', v3.0.0, + Property => 'East_Asian_Width', + Has_Missings_Defaults => $NOT_IGNORED, + # Early versions had problematic syntax + Each_Line_Handler => (($v_version lt v3.1.0) + ? \&filter_early_ea_lb + : undef), + ), + Input_file->new('CompositionExclusions.txt', v3.0.0, + Property => 'Composition_Exclusion', + ), + Input_file->new('BidiMirroring.txt', v3.0.1, + Property => 'Bidi_Mirroring_Glyph', + ), + Input_file->new("NormalizationTest.txt", v3.0.1, + Skip => 1, + ), + Input_file->new('CaseFolding.txt', v3.0.1, + Pre_Handler => \&setup_case_folding, + Each_Line_Handler => + [ ($v_version lt v3.1.0) + ? \&filter_old_style_case_folding + : undef, + \&filter_case_folding_line + ], + Post_Handler => \&post_fold, + ), + Input_file->new('DCoreProperties.txt', v3.1.0, + # 5.2 changed this file + Has_Missings_Defaults => (($v_version ge v5.2.0) + ? $NOT_IGNORED + : $NO_DEFAULTS), + ), + Input_file->new('Scripts.txt', v3.1.0, + Property => 'Script', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new('DNormalizationProps.txt', v3.1.0, + Has_Missings_Defaults => $NOT_IGNORED, + Each_Line_Handler => (($v_version lt v4.0.1) + ? \&filter_old_style_normalization_lines + : undef), + ), + Input_file->new('HangulSyllableType.txt', v4.0.0, + Has_Missings_Defaults => $NOT_IGNORED, + Property => 'Hangul_Syllable_Type'), + Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0, + Property => 'Word_Break', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0, + Property => 'Grapheme_Cluster_Break', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0, + Handler => \&process_GCB_test, + ), + Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, + Skip => 1, + ), + Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, + Property => 'Sentence_Break', + Has_Missings_Defaults => $NOT_IGNORED, + ), + Input_file->new('NamedSequences.txt', v4.1.0, + Handler => \&process_NamedSequences + ), + Input_file->new('NameAliases.txt', v5.0.0, + Property => 'Name_Alias', + ), + Input_file->new("BidiTest.txt", v5.2.0, + Skip => 1, + ), + Input_file->new('UnihanIndicesDictionary.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanIRGSources.txt', v5.2.0, + Optional => 1, + Pre_Handler => \&setup_unihan, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanNumericValues.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanOtherMappings.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanReadings.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), + Input_file->new('UnihanVariants.txt', v5.2.0, + Optional => 1, + Each_Line_Handler => \&filter_unihan_line, + ), +); + +# End of all the preliminaries. +# Do it... + +if ($compare_versions) { + Carp::my_carp(<rel2abs( + internal_file_to_platform($_)) + } keys %ignored_files; +File::Find::find({ + wanted=>sub { + return unless /\.txt$/i; # Some platforms change the name's case + my $full = lc(File::Spec->rel2abs($_)); + $potential_files{$full} = 1 + if ! grep { $full eq lc($_) } @ignored_files_full_names; + return; + } +}, File::Spec->curdir()); + +my @mktables_list_output_files; + +if (! -e $file_list) { + print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; + $write_unchanged_files = 1; +} elsif ($write_unchanged_files) { + print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; +} +else { + print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; + my $file_handle; + if (! open $file_handle, "<", $file_list) { + Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); + $glob_list = 1; + } + else { + my @input; + + # Read and parse mktables.lst, placing the results from the first part + # into @input, and the second part into @mktables_list_output_files + for my $list ( \@input, \@mktables_list_output_files ) { + while (<$file_handle>) { + s/^ \s+ | \s+ $//xg; + next if /^ \s* (?: \# .* )? $/x; + last if /^ =+ $/x; + my ( $file ) = split /\t/; + push @$list, $file; + } + @$list = uniques(@$list); + next; + } + + # Look through all the input files + foreach my $input (@input) { + next if $input eq 'version'; # Already have checked this. + + # Ignore if doesn't exist. The checking about whether we care or + # not is done via the Input_file object. + next if ! file_exists($input); + + # The paths are stored with relative names, and with '/' as the + # delimiter; convert to absolute on this machine + my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input))); + $potential_files{$full} = 1 + if ! grep { lc($full) eq lc($_) } @ignored_files_full_names; + } + } + + close $file_handle; +} + +if ($glob_list) { + + # Here wants to process all .txt files in the directory structure. + # Convert them to full path names. They are stored in the platform's + # relative style + my @known_files; + foreach my $object (@input_file_objects) { + my $file = $object->file; + next unless defined $file; + push @known_files, File::Spec->rel2abs($file); + } + + my @unknown_input_files; + foreach my $file (keys %potential_files) { + next if grep { lc($file) eq lc($_) } @known_files; + + # Here, the file is unknown to us. Get relative path name + $file = File::Spec->abs2rel($file); + push @unknown_input_files, $file; + + # What will happen is we create a data structure for it, and add it to + # the list of input files to process. First get the subdirectories + # into an array + my (undef, $directories, undef) = File::Spec->splitpath($file); + $directories =~ s;/$;;; # Can have extraneous trailing '/' + my @directories = File::Spec->splitdir($directories); + + # If the file isn't extracted (meaning none of the directories is the + # extracted one), just add it to the end of the list of inputs. + if (! grep { $EXTRACTED_DIR eq $_ } @directories) { + push @input_file_objects, Input_file->new($file, v0); + } + else { + + # Here, the file is extracted. It needs to go ahead of most other + # processing. Search for the first input file that isn't a + # special required property (that is, find one whose first_release + # is non-0), and isn't extracted. Also, the Age property file is + # processed before the extracted ones, just in case + # $compare_versions is set. + for (my $i = 0; $i < @input_file_objects; $i++) { + if ($input_file_objects[$i]->first_released ne v0 + && lc($input_file_objects[$i]->file) ne 'dage.txt' + && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i) + { + splice @input_file_objects, $i, 0, + Input_file->new($file, v0); + last; + } + } + + } + } + if (@unknown_input_files) { + print STDERR simple_fold(join_lines(<file; + next if ! defined $file; # Not all objects have files + next if $object->optional && ! -e $file; + push @input_files, $file; +} + +if ( $verbosity >= $VERBOSE ) { + print "Expecting ".scalar( @input_files )." input files. ", + "Checking ".scalar( @mktables_list_output_files )." output files.\n"; +} + +# We set $youngest to be the most recently changed input file, including this +# program itself (done much earlier in this file) +foreach my $in (@input_files) { + my $age = -M $in; + next unless defined $age; # Keep going even if missing a file + $youngest = $age if $age < $youngest; + + # See that the input files have distinct names, to warn someone if they + # are adding a new one + if ($make_list) { + my ($volume, $directories, $file ) = File::Spec->splitpath($in); + $directories =~ s;/$;;; # Can have extraneous trailing '/' + my @directories = File::Spec->splitdir($directories); + my $base = $file =~ s/\.txt$//; + construct_filename($file, 'mutable', \@directories); + } +} + +my $ok = ! $write_unchanged_files + && scalar @mktables_list_output_files; # If none known, rebuild + +# Now we check to see if any output files are older than youngest, if +# they are, we need to continue on, otherwise we can presumably bail. +if ($ok) { + foreach my $out (@mktables_list_output_files) { + if ( ! file_exists($out)) { + print "'$out' is missing.\n" if $verbosity >= $VERBOSE; + $ok = 0; + last; + } + #local $to_trace = 1 if main::DEBUG; + trace $youngest, -M $out if main::DEBUG && $to_trace; + if ( -M $out > $youngest ) { + #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace; + print "'$out' is too old.\n" if $verbosity >= $VERBOSE; + $ok = 0; + last; + } + } +} +if ($ok) { + print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; + exit(0); +} +print "Must rebuild tables.\n" if $verbosity >= $VERBOSE; + +# Ready to do the major processing. First create the perl pseudo-property. +$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1); + +# Process each input file +foreach my $file (@input_file_objects) { + $file->run; +} + +# Finish the table generation. + +print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; +finish_Unicode(); + +print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; +compile_perl(); + +print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; +add_perl_synonyms(); + +print "Writing tables\n" if $verbosity >= $PROGRESS; +write_all_tables(); + +# Write mktables.lst +if ( $file_list and $make_list ) { + + print "Updating '$file_list'\n" if $verbosity >= $PROGRESS; + foreach my $file (@input_files, @files_actually_output) { + my (undef, $directories, $file) = File::Spec->splitpath($file); + my @directories = File::Spec->splitdir($directories); + $file = join '/', @directories, $file; + } + + my $ofh; + if (! open $ofh,">",$file_list) { + Carp::my_carp("Can't write to '$file_list'. Skipping: $!"); + return + } + else { + print $ofh <<"END"; +# +# $file_list -- File list for $0. +# +# Autogenerated on @{[scalar localtime]} +# +# - First section is input files +# ($0 itself is not listed but is automatically considered an input) +# - Section seperator is /^=+\$/ +# - Second section is a list of output files. +# - Lines matching /^\\s*#/ are treated as comments +# which along with blank lines are ignored. +# + +# Input files: + +END + print $ofh "$_\n" for sort(@input_files); + print $ofh "\n=================================\n# Output files:\n\n"; + print $ofh "$_\n" for sort @files_actually_output; + print $ofh "\n# ",scalar(@input_files)," input files\n", + "# ",scalar(@files_actually_output)+1," output files\n\n", + "# End list\n"; + close $ofh + or Carp::my_carp("Failed to close $ofh: $!"); + + print "Filelist has ",scalar(@input_files)," input files and ", + scalar(@files_actually_output)+1," output files\n" + if $verbosity >= $VERBOSE; + } +} + +# Output these warnings unless -q explicitly specified. +if ($verbosity >= $NORMAL_VERBOSITY) { + if (@unhandled_properties) { + print "\nProperties and tables that unexpectedly have no code points\n"; + foreach my $property (sort @unhandled_properties) { + print $property, "\n"; + } + } + + if (%potential_files) { + print "\nInput files that are not considered:\n"; + foreach my $file (sort keys %potential_files) { + print File::Spec->abs2rel($file), "\n"; + } + } + print "\nAll done\n" if $verbosity >= $VERBOSE; +} +exit(0); + +# TRAILING CODE IS USED BY make_property_test_script() +__DATA__ + +use strict; +use warnings; + +# If run outside the normal test suite on an ASCII platform, you can +# just create a latin1_to_native() function that just returns its +# inputs, because that's the only function used from test.pl +require "test.pl"; + +# Test qr/\X/ and the \p{} regular expression constructs. This file is +# constructed by mktables from the tables it generates, so if mktables is +# buggy, this won't necessarily catch those bugs. Tests are generated for all +# feasible properties; a few aren't currently feasible; see +# is_code_point_usable() in mktables for details. + +# Standard test packages are not used because this manipulates SIG_WARN. It +# exits 0 if every non-skipped test succeeded; -1 if any failed. + +my $Tests = 0; +my $Fails = 0; + +sub Expect($$$$) { + my $expected = shift; + my $ord = shift; + my $regex = shift; + my $warning_type = shift; # Type of warning message, like 'deprecated' + # or empty if none + my $line = (caller)[2]; + $ord = ord(latin1_to_native(chr($ord))); + + # Convert the code point to hex form + my $string = sprintf "\"\\x{%04X}\"", $ord; + + my @tests = ""; + + # The first time through, use all warnings. If the input should generate + # a warning, add another time through with them turned off + push @tests, "no warnings '$warning_type';" if $warning_type; + + foreach my $no_warnings (@tests) { + + # Store any warning messages instead of outputting them + local $SIG{__WARN__} = $SIG{__WARN__}; + my $warning_message; + $SIG{__WARN__} = sub { $warning_message = $_[0] }; + + $Tests++; + + # A string eval is needed because of the 'no warnings'. + # Assumes no parens in the regular expression + my $result = eval "$no_warnings + my \$RegObj = qr($regex); + $string =~ \$RegObj ? 1 : 0"; + if (not defined $result) { + print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n"; + $Fails++; + } + elsif ($result ^ $expected) { + print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n"; + $Fails++; + } + elsif ($warning_message) { + if (! $warning_type || ($warning_type && $no_warnings)) { + print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n"; + $Fails++; + } + else { + print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n"; + } + } + elsif ($warning_type && ! $no_warnings) { + print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n"; + $Fails++; + } + else { + print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n"; + } + } + return; +} + +sub Error($) { + my $regex = shift; + $Tests++; + if (eval { 'x' =~ qr/$regex/; 1 }) { + $Fails++; + my $line = (caller)[2]; + print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; + } + else { + my $line = (caller)[2]; + print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; + } + return; +} + +# GCBTest.txt character that separates grapheme clusters +my $breakable_utf8 = my $breakable = chr(0xF7); +utf8::upgrade($breakable_utf8); + +# GCBTest.txt character that indicates that the adjoining code points are part +# of the same grapheme cluster +my $nobreak_utf8 = my $nobreak = chr(0xD7); +utf8::upgrade($nobreak_utf8); + +sub Test_X($) { + # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt + # Each such line is a sequence of code points given by their hex numbers, + # separated by the two characters defined just before this subroutine that + # indicate that either there can or cannot be a break between the adjacent + # code points. If there isn't a break, that means the sequence forms an + # extended grapheme cluster, which means that \X should match the whole + # thing. If there is a break, \X should stop there. This is all + # converted by this routine into a match: + # $string =~ /(\X)/, + # Each \X should match the next cluster; and that is what is checked. + + my $template = shift; + + my $line = (caller)[2]; + + # The line contains characters above the ASCII range, but in Latin1. It + # may or may not be in utf8, and if it is, it may or may not know it. So, + # convert these characters to 8 bits. If knows is in utf8, simply + # downgrade. + if (utf8::is_utf8($template)) { + utf8::downgrade($template); + } else { + + # Otherwise, if it is in utf8, but doesn't know it, the next lines + # convert the two problematic characters to their 8-bit equivalents. + # If it isn't in utf8, they don't harm anything. + use bytes; + $template =~ s/$nobreak_utf8/$nobreak/g; + $template =~ s/$breakable_utf8/$breakable/g; + } + + # Get rid of the leading and trailing breakables + $template =~ s/^ \s* $breakable \s* //x; + $template =~ s/ \s* $breakable \s* $ //x; + + # And no-breaks become just a space. + $template =~ s/ \s* $nobreak \s* / /xg; + + # Split the input into segments that are breakable between them. + my @segments = split /\s*$breakable\s*/, $template; + + my $string = ""; + my $display_string = ""; + my @should_match; + my @should_display; + + # Convert the code point sequence in each segment into a Perl string of + # characters + foreach my $segment (@segments) { + my @code_points = split /\s+/, $segment; + my $this_string = ""; + my $this_display = ""; + foreach my $code_point (@code_points) { + $this_string .= latin1_to_native(chr(hex $code_point)); + $this_display .= "\\x{$code_point}"; + } + + # The next cluster should match the string in this segment. + push @should_match, $this_string; + push @should_display, $this_display; + $string .= $this_string; + $display_string .= $this_display; + } + + # If a string can be represented in both non-ut8 and utf8, test both cases + UPGRADE: + for my $to_upgrade (0 .. 1) { + + if ($to_upgrade) { + + # If already in utf8, would just be a repeat + next UPGRADE if utf8::is_utf8($string); + + utf8::upgrade($string); + } + + # Finally, do the \X match. + my @matches = $string =~ /(\X)/g; + + # Look through each matched cluster to verify that it matches what we + # expect. + my $min = (@matches < @should_match) ? @matches : @should_match; + for my $i (0 .. $min - 1) { + $Tests++; + if ($matches[$i] eq $should_match[$i]) { + print "ok $Tests - "; + if ($i == 0) { + print "In \"$display_string\" =~ /(\\X)/g, \\X #1"; + } else { + print "And \\X #", $i + 1, + } + print " correctly matched $should_display[$i]; line $line\n"; + } else { + $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ } + unpack("U*", $matches[$i])); + print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #", + $i + 1, + " should have matched $should_display[$i]", + " but instead matched $matches[$i]", + ". Abandoning rest of line $line\n"; + next UPGRADE; + } + } + + # And the number of matches should equal the number of expected matches. + $Tests++; + if (@matches == @should_match) { + print "ok $Tests - Nothing was left over; line $line\n"; + } else { + print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n"; + } + } + + return; +} + +sub Finished() { + print "1..$Tests\n"; + exit($Fails ? -1 : 0); +} +Error('\p{Script=InGreek}'); # Bug #69018 +Test_X("1100 $nobreak 1161"); # Bug #70940 +Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722 +Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722 +Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726