lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # Needs 'no overloading' to run faster on miniperl.  Code commented out at the
   8 # subroutine objaddr can be used instead to work as far back (untested) as
   9 # 5.8: needs pack "U".  But almost all occurrences of objaddr have been
  10 # removed in favor of using 'no overloading'.  You also would have to go
  11 # through and replace occurrences like:
  12 #       my $addr = do { no overloading; pack 'J', $self; }
  13 # with
  14 #       my $addr = main::objaddr $self;
  15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
  16 # that instituted the change to main::objaddr, and subsequent commits that
  17 # changed 0+$self to pack 'J', $self.)
  18
  19 my $start_time;
  20 BEGIN { # Get the time the script started running; do it at compilation to
  21         # get it as close as possible
  22     $start_time= time;
  23 }
  24
  25 require 5.010_001;
  26 use strict;
  27 use warnings;
  28 use Carp;
  29 use Config;
  30 use File::Find;
  31 use File::Path;
  32 use File::Spec;
  33 use Text::Tabs;
  34 use re "/aa";
  35
  36 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  38
  39 ##########################################################################
  40 #
  41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  42 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  43 # a pod file and a .t file
  44 #
  45 # The structure of this file is:
  46 #   First these introductory comments; then
  47 #   code needed for everywhere, such as debugging stuff; then
  48 #   code to handle input parameters; then
  49 #   data structures likely to be of external interest (some of which depend on
  50 #       the input parameters, so follows them; then
  51 #   more data structures and subroutine and package (class) definitions; then
  52 #   the small actual loop to process the input files and finish up; then
  53 #   a __DATA__ section, for the .t tests
  54 #
  55 # This program works on all releases of Unicode through at least 6.0.  The
  56 # outputs have been scrutinized most intently for release 5.1.  The others
  57 # have been checked for somewhat more than just sanity.  It can handle all
  58 # existing Unicode character properties in those releases.
  59 #
  60 # This program is mostly about Unicode character (or code point) properties.
  61 # A property describes some attribute or quality of a code point, like if it
  62 # is lowercase or not, its name, what version of Unicode it was first defined
  63 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  64 # possibilities by making all properties into mappings from each code point
  65 # into some corresponding value.  In the case of it being lowercase or not,
  66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  67 # property maps each Unicode code point to a single value, called a "property
  68 # value".  (Hence each Unicode property is a true mathematical function with
  69 # exactly one value per code point.)
  70 #
  71 # When using a property in a regular expression, what is desired isn't the
  72 # mapping of the code point to its property's value, but the reverse (or the
  73 # mathematical "inverse relation"): starting with the property value, "Does a
  74 # code point map to it?"  These are written in a "compound" form:
  75 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  76 # files containing the lists of code points that map to each such regular
  77 # expression property value, one file per list
  78 #
  79 # There is also a single form shortcut that Perl adds for many of the commonly
  80 # used properties.  This happens for all binary properties, plus script,
  81 # general_category, and block properties.
  82 #
  83 # Thus the outputs of this program are files.  There are map files, mostly in
  84 # the 'To' directory; and there are list files for use in regular expression
  85 # matching, all in subdirectories of the 'lib' directory, with each
  86 # subdirectory being named for the property that the lists in it are for.
  87 # Bookkeeping, test, and documentation files are also generated.
  88
  89 my $matches_directory = 'lib';   # Where match (\p{}) files go.
  90 my $map_directory = 'To';        # Where map files go.
  91
  92 # DATA STRUCTURES
  93 #
  94 # The major data structures of this program are Property, of course, but also
  95 # Table.  There are two kinds of tables, very similar to each other.
  96 # "Match_Table" is the data structure giving the list of code points that have
  97 # a particular property value, mentioned above.  There is also a "Map_Table"
  98 # data structure which gives the property's mapping from code point to value.
  99 # There are two structures because the match tables need to be combined in
 100 # various ways, such as constructing unions, intersections, complements, etc.,
 101 # and the map ones don't.  And there would be problems, perhaps subtle, if
 102 # a map table were inadvertently operated on in some of those ways.
 103 # The use of separate classes with operations defined on one but not the other
 104 # prevents accidentally confusing the two.
 105 #
 106 # At the heart of each table's data structure is a "Range_List", which is just
 107 # an ordered list of "Ranges", plus ancillary information, and methods to
 108 # operate on them.  A Range is a compact way to store property information.
 109 # Each range has a starting code point, an ending code point, and a value that
 110 # is meant to apply to all the code points between the two end points,
 111 # inclusive.  For a map table, this value is the property value for those
 112 # code points.  Two such ranges could be written like this:
 113 #   0x41 .. 0x5A, 'Upper',
 114 #   0x61 .. 0x7A, 'Lower'
 115 #
 116 # Each range also has a type used as a convenience to classify the values.
 117 # Most ranges in this program will be Type 0, or normal, but there are some
 118 # ranges that have a non-zero type.  These are used only in map tables, and
 119 # are for mappings that don't fit into the normal scheme of things.  Mappings
 120 # that require a hash entry to communicate with utf8.c are one example;
 121 # another example is mappings for charnames.pm to use which indicate a name
 122 # that is algorithmically determinable from its code point (and vice-versa).
 123 # These are used to significantly compact these tables, instead of listing
 124 # each one of the tens of thousands individually.
 125 #
 126 # In a match table, the value of a range is irrelevant (and hence the type as
 127 # well, which will always be 0), and arbitrarily set to the null string.
 128 # Using the example above, there would be two match tables for those two
 129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 130 # named Lower would contain 0x61..0x7A.
 131 #
 132 # Actually, there are two types of range lists, "Range_Map" is the one
 133 # associated with map tables, and "Range_List" with match tables.
 134 # Again, this is so that methods can be defined on one and not the other so as
 135 # to prevent operating on them in incorrect ways.
 136 #
 137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 138 # in the perl core.  All tables could in theory be written, but some are
 139 # suppressed because there is no current practical use for them.  It is easy
 140 # to change which get written by changing various lists that are near the top
 141 # of the actual code in this file.  The table data structures contain enough
 142 # ancillary information to allow them to be treated as separate entities for
 143 # writing, such as the path to each one's file.  There is a heading in each
 144 # map table that gives the format of its entries, and what the map is for all
 145 # the code points missing from it.  (This allows tables to be more compact.)
 146 #
 147 # The Property data structure contains one or more tables.  All properties
 148 # contain a map table (except the $perl property which is a
 149 # pseudo-property containing only match tables), and any properties that
 150 # are usable in regular expression matches also contain various matching
 151 # tables, one for each value the property can have.  A binary property can
 152 # have two values, True and False (or Y and N, which are preferred by Unicode
 153 # terminology).  Thus each of these properties will have a map table that
 154 # takes every code point and maps it to Y or N (but having ranges cuts the
 155 # number of entries in that table way down), and two match tables, one
 156 # which has a list of all the code points that map to Y, and one for all the
 157 # code points that map to N.  (For each of these, a third table is also
 158 # generated for the pseudo Perl property.  It contains the identical code
 159 # points as the Y table, but can be written, not in the compound form, but in
 160 # a "single" form like \p{IsUppercase}.)  Many properties are binary, but some
 161 # properties have several possible values, some have many, and properties like
 162 # Name have a different value for every named code point.  Those will not,
 163 # unless the controlling lists are changed, have their match tables written
 164 # out.  But all the ones which can be used in regular expression \p{} and \P{}
 165 # constructs will.  Prior to 5.14, generally a property would have either its
 166 # map table or its match tables written but not both.  Again, what gets
 167 # written is controlled by lists which can easily be changed.  Starting in
 168 # 5.14, advantage was taken of this, and all the map tables needed to
 169 # reconstruct the Unicode db are now written out, while suppressing the
 170 # Unicode .txt files that contain the data.  Our tables are much more compact
 171 # than the .txt files, so a significant space savings was achieved.
 172
 173 # Properties have a 'Type', like binary, or string, or enum depending on how
 174 # many match tables there are and the content of the maps.  This 'Type' is
 175 # different than a range 'Type', so don't get confused by the two concepts
 176 # having the same name.
 177 #
 178 # For information about the Unicode properties, see Unicode's UAX44 document:
 179
 180 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 181
 182 # As stated earlier, this program will work on any release of Unicode so far.
 183 # Most obvious problems in earlier data have NOT been corrected except when
 184 # necessary to make Perl or this program work reasonably.  For example, no
 185 # folding information was given in early releases, so this program substitutes
 186 # lower case instead, just so that a regular expression with the /i option
 187 # will do something that actually gives the right results in many cases.
 188 # There are also a couple other corrections for version 1.1.5, commented at
 189 # the point they are made.  As an example of corrections that weren't made
 190 # (but could be) is this statement from DerivedAge.txt: "The supplementary
 191 # private use code points and the non-character code points were assigned in
 192 # version 2.0, but not specifically listed in the UCD until versions 3.0 and
 193 # 3.1 respectively."  (To be precise it was 3.0.1 not 3.0.0) More information
 194 # on Unicode version glitches is further down in these introductory comments.
 195 #
 196 # This program works on all non-provisional properties as of 6.0, though the
 197 # files for some are suppressed from apparent lack of demand for them.  You
 198 # can change which are output by changing lists in this program.
 199 #
 200 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 201 # loose matchings rules (from Unicode TR18):
 202 #
 203 #    The recommended names for UCD properties and property values are in
 204 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 205 #    [PropValue]. There are both abbreviated names and longer, more
 206 #    descriptive names. It is strongly recommended that both names be
 207 #    recognized, and that loose matching of property names be used,
 208 #    whereby the case distinctions, whitespace, hyphens, and underbar
 209 #    are ignored.
 210 # The program still allows Fuzzy to override its determination of if loose
 211 # matching should be used, but it isn't currently used, as it is no longer
 212 # needed; the calculations it makes are good enough.
 213 #
 214 # SUMMARY OF HOW IT WORKS:
 215 #
 216 #   Process arguments
 217 #
 218 #   A list is constructed containing each input file that is to be processed
 219 #
 220 #   Each file on the list is processed in a loop, using the associated handler
 221 #   code for each:
 222 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 223 #            first.  These files name the properties and property values.
 224 #            Objects are created of all the property and property value names
 225 #            that the rest of the input should expect, including all synonyms.
 226 #        The other input files give mappings from properties to property
 227 #           values.  That is, they list code points and say what the mapping
 228 #           is under the given property.  Some files give the mappings for
 229 #           just one property; and some for many.  This program goes through
 230 #           each file and populates the properties from them.  Some properties
 231 #           are listed in more than one file, and Unicode has set up a
 232 #           precedence as to which has priority if there is a conflict.  Thus
 233 #           the order of processing matters, and this program handles the
 234 #           conflict possibility by processing the overriding input files
 235 #           last, so that if necessary they replace earlier values.
 236 #        After this is all done, the program creates the property mappings not
 237 #            furnished by Unicode, but derivable from what it does give.
 238 #        The tables of code points that match each property value in each
 239 #            property that is accessible by regular expressions are created.
 240 #        The Perl-defined properties are created and populated.  Many of these
 241 #            require data determined from the earlier steps
 242 #        Any Perl-defined synonyms are created, and name clashes between Perl
 243 #            and Unicode are reconciled and warned about.
 244 #        All the properties are written to files
 245 #        Any other files are written, and final warnings issued.
 246 #
 247 # For clarity, a number of operators have been overloaded to work on tables:
 248 #   ~ means invert (take all characters not in the set).  The more
 249 #       conventional '!' is not used because of the possibility of confusing
 250 #       it with the actual boolean operation.
 251 #   + means union
 252 #   - means subtraction
 253 #   & means intersection
 254 # The precedence of these is the order listed.  Parentheses should be
 255 # copiously used.  These are not a general scheme.  The operations aren't
 256 # defined for a number of things, deliberately, to avoid getting into trouble.
 257 # Operations are done on references and affect the underlying structures, so
 258 # that the copy constructors for them have been overloaded to not return a new
 259 # clone, but the input object itself.
 260 #
 261 # The bool operator is deliberately not overloaded to avoid confusion with
 262 # "should it mean if the object merely exists, or also is non-empty?".
 263 #
 264 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 265 #
 266 # This program needs to be able to run under miniperl.  Therefore, it uses a
 267 # minimum of other modules, and hence implements some things itself that could
 268 # be gotten from CPAN
 269 #
 270 # This program uses inputs published by the Unicode Consortium.  These can
 271 # change incompatibly between releases without the Perl maintainers realizing
 272 # it.  Therefore this program is now designed to try to flag these.  It looks
 273 # at the directories where the inputs are, and flags any unrecognized files.
 274 # It keeps track of all the properties in the files it handles, and flags any
 275 # that it doesn't know how to handle.  It also flags any input lines that
 276 # don't match the expected syntax, among other checks.
 277 #
 278 # It is also designed so if a new input file matches one of the known
 279 # templates, one hopefully just needs to add it to a list to have it
 280 # processed.
 281 #
 282 # As mentioned earlier, some properties are given in more than one file.  In
 283 # particular, the files in the extracted directory are supposedly just
 284 # reformattings of the others.  But they contain information not easily
 285 # derivable from the other files, including results for Unihan, which this
 286 # program doesn't ordinarily look at, and for unassigned code points.  They
 287 # also have historically had errors or been incomplete.  In an attempt to
 288 # create the best possible data, this program thus processes them first to
 289 # glean information missing from the other files; then processes those other
 290 # files to override any errors in the extracted ones.  Much of the design was
 291 # driven by this need to store things and then possibly override them.
 292 #
 293 # It tries to keep fatal errors to a minimum, to generate something usable for
 294 # testing purposes.  It always looks for files that could be inputs, and will
 295 # warn about any that it doesn't know how to handle (the -q option suppresses
 296 # the warning).
 297 #
 298 # Why is there more than one type of range?
 299 #   This simplified things.  There are some very specialized code points that
 300 #   have to be handled specially for output, such as Hangul syllable names.
 301 #   By creating a range type (done late in the development process), it
 302 #   allowed this to be stored with the range, and overridden by other input.
 303 #   Originally these were stored in another data structure, and it became a
 304 #   mess trying to decide if a second file that was for the same property was
 305 #   overriding the earlier one or not.
 306 #
 307 # Why are there two kinds of tables, match and map?
 308 #   (And there is a base class shared by the two as well.)  As stated above,
 309 #   they actually are for different things.  Development proceeded much more
 310 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 311 #   give the property value for every code point (actually every code point
 312 #   that doesn't map to a default value).  Match tables are used for regular
 313 #   expression matches, and are essentially the inverse mapping.  Separating
 314 #   the two allows more specialized methods, and error checks so that one
 315 #   can't just take the intersection of two map tables, for example, as that
 316 #   is nonsensical.
 317 #
 318 # DEBUGGING
 319 #
 320 # This program is written so it will run under miniperl.  Occasionally changes
 321 # will cause an error where the backtrace doesn't work well under miniperl.
 322 # To diagnose the problem, you can instead run it under regular perl, if you
 323 # have one compiled.
 324 #
 325 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 326 # to return true.  Then a line like
 327 #
 328 # local $to_trace = 1 if main::DEBUG;
 329 #
 330 # can be added to enable tracing in its lexical scope or until you insert
 331 # another line:
 332 #
 333 # local $to_trace = 0 if main::DEBUG;
 334 #
 335 # then use a line like "trace $a, @b, %c, ...;
 336 #
 337 # Some of the more complex subroutines already have trace statements in them.
 338 # Permanent trace statements should be like:
 339 #
 340 # trace ... if main::DEBUG && $to_trace;
 341 #
 342 # If there is just one or a few files that you're debugging, you can easily
 343 # cause most everything else to be skipped.  Change the line
 344 #
 345 # my $debug_skip = 0;
 346 #
 347 # to 1, and every file whose object is in @input_file_objects and doesn't have
 348 # a, 'non_skip => 1,' in its constructor will be skipped.
 349 #
 350 # To compare the output tables, it may be useful to specify the -annotate
 351 # flag.  This causes the tables to expand so there is one entry for each
 352 # non-algorithmically named code point giving, currently its name, and its
 353 # graphic representation if printable (and you have a font that knows about
 354 # it).  This makes it easier to see what the particular code points are in
 355 # each output table.  The tables are usable, but because they don't have
 356 # ranges (for the most part), a Perl using them will run slower.  Non-named
 357 # code points are annotated with a description of their status, and contiguous
 358 # ones with the same description will be output as a range rather than
 359 # individually.  Algorithmically named characters are also output as ranges,
 360 # except when there are just a few contiguous ones.
 361 #
 362 # FUTURE ISSUES
 363 #
 364 # The program would break if Unicode were to change its names so that
 365 # interior white space, underscores, or dashes differences were significant
 366 # within property and property value names.
 367 #
 368 # It might be easier to use the xml versions of the UCD if this program ever
 369 # would need heavy revision, and the ability to handle old versions was not
 370 # required.
 371 #
 372 # There is the potential for name collisions, in that Perl has chosen names
 373 # that Unicode could decide it also likes.  There have been such collisions in
 374 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 375 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 376 # collisions, which were withdrawn before the final release, because of Perl's
 377 # and other's protests.  These all involved new properties which began with
 378 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 379 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 380 # Unicode document, so they are unlikely to be used by Unicode for another
 381 # purpose.  However, they might try something beginning with 'In', or use any
 382 # of the other Perl-defined properties.  This program will warn you of name
 383 # collisions, and refuse to generate tables with them, but manual intervention
 384 # will be required in this event.  One scheme that could be implemented, if
 385 # necessary, would be to have this program generate another file, or add a
 386 # field to mktables.lst that gives the date of first definition of a property.
 387 # Each new release of Unicode would use that file as a basis for the next
 388 # iteration.  And the Perl synonym addition code could sort based on the age
 389 # of the property, so older properties get priority, and newer ones that clash
 390 # would be refused; hence existing code would not be impacted, and some other
 391 # synonym would have to be used for the new property.  This is ugly, and
 392 # manual intervention would certainly be easier to do in the short run; lets
 393 # hope it never comes to this.
 394 #
 395 # A NOTE ON UNIHAN
 396 #
 397 # This program can generate tables from the Unihan database.  But it doesn't
 398 # by default, letting the CPAN module Unicode::Unihan handle them.  Prior to
 399 # version 5.2, this database was in a single file, Unihan.txt.  In 5.2 the
 400 # database was split into 8 different files, all beginning with the letters
 401 # 'Unihan'.  This program will read those file(s) if present, but it needs to
 402 # know which of the many properties in the file(s) should have tables created
 403 # for them.  It will create tables for any properties listed in
 404 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
 405 # @cjk_properties array and the @cjk_property_values array.  Thus, if a
 406 # property you want is not in those files of the release you are building
 407 # against, you must add it to those two arrays.  Starting in 4.0, the
 408 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
 409 # is present in the directory, a table will be generated for that property.
 410 # In 5.2, several more properties were added.  For your convenience, the two
 411 # arrays are initialized with all the 6.0 listed properties that are also in
 412 # earlier releases.  But these are commented out.  You can just uncomment the
 413 # ones you want, or use them as a template for adding entries for other
 414 # properties.
 415 #
 416 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 417 # and filter_unihan_line() are the functions where this is done.  This program
 418 # already does some adjusting to make the lines look more like the rest of the
 419 # Unicode DB;  You can see what that is in filter_unihan_line()
 420 #
 421 # There is a bug in the 3.2 data file in which some values for the
 422 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 423 # could be added for these; or for a particular installation, the Unihan.txt
 424 # file could be edited to fix them.
 425 #
 426 # HOW TO ADD A FILE TO BE PROCESSED
 427 #
 428 # A new file from Unicode needs to have an object constructed for it in
 429 # @input_file_objects, probably at the end or at the end of the extracted
 430 # ones.  The program should warn you if its name will clash with others on
 431 # restrictive file systems, like DOS.  If so, figure out a better name, and
 432 # add lines to the README.perl file giving that.  If the file is a character
 433 # property, it should be in the format that Unicode has by default
 434 # standardized for such files for the more recently introduced ones.
 435 # If so, the Input_file constructor for @input_file_objects can just be the
 436 # file name and release it first appeared in.  If not, then it should be
 437 # possible to construct an each_line_handler() to massage the line into the
 438 # standardized form.
 439 #
 440 # For non-character properties, more code will be needed.  You can look at
 441 # the existing entries for clues.
 442 #
 443 # UNICODE VERSIONS NOTES
 444 #
 445 # The Unicode UCD has had a number of errors in it over the versions.  And
 446 # these remain, by policy, in the standard for that version.  Therefore it is
 447 # risky to correct them, because code may be expecting the error.  So this
 448 # program doesn't generally make changes, unless the error breaks the Perl
 449 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 450 # for U+1105, which causes real problems for the algorithms for Jamo
 451 # calculations, so it is changed here.
 452 #
 453 # But it isn't so clear cut as to what to do about concepts that are
 454 # introduced in a later release; should they extend back to earlier releases
 455 # where the concept just didn't exist?  It was easier to do this than to not,
 456 # so that's what was done.  For example, the default value for code points not
 457 # in the files for various properties was probably undefined until changed by
 458 # some version.  No_Block for blocks is such an example.  This program will
 459 # assign No_Block even in Unicode versions that didn't have it.  This has the
 460 # benefit that code being written doesn't have to special case earlier
 461 # versions; and the detriment that it doesn't match the Standard precisely for
 462 # the affected versions.
 463 #
 464 # Here are some observations about some of the issues in early versions:
 465 #
 466 # The number of code points in \p{alpha} halved in 2.1.9.  It turns out that
 467 # the reason is that the CJK block starting at 4E00 was removed from PropList,
 468 # and was not put back in until 3.1.0
 469 #
 470 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 471 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 472 # reason is that 3.2 introduced U+205F=medium math space, which was not
 473 # classed as white space, but Perl figured out that it should have been. 4.0
 474 # reclassified it correctly.
 475 #
 476 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 477 # this was erroneously a synonym for 202.  In 4.0, ATB became 202, and ATBL
 478 # was left with no code points, as all the ones that mapped to 202 stayed
 479 # mapped to 202.  Thus if your program used the numeric name for the class,
 480 # it would not have been affected, but if it used the mnemonic, it would have
 481 # been.
 482 #
 483 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that code
 484 # points which eventually came to have this script property value, instead
 485 # mapped to "Unknown".  But in the next release all these code points were
 486 # moved to \p{sc=common} instead.
 487 #
 488 # The default for missing code points for BidiClass is complicated.  Starting
 489 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 490 # tries to do the best it can for earlier releases.  It is done in
 491 # process_PropertyAliases()
 492 #
 493 ##############################################################################
 494
 495 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 496                         # and errors
 497 my $MAX_LINE_WIDTH = 78;
 498
 499 # Debugging aid to skip most files so as to not be distracted by them when
 500 # concentrating on the ones being debugged.  Add
 501 # non_skip => 1,
 502 # to the constructor for those files you want processed when you set this.
 503 # Files with a first version number of 0 are special: they are always
 504 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 505 # UnicodeData.txt must not be skipped if you want this program to not die
 506 # before normal completion.
 507 my $debug_skip = 0;
 508
 509 # Set to 1 to enable tracing.
 510 our $to_trace = 0;
 511
 512 { # Closure for trace: debugging aid
 513     my $print_caller = 1;        # ? Include calling subroutine name
 514     my $main_with_colon = 'main::';
 515     my $main_colon_length = length($main_with_colon);
 516
 517     sub trace {
 518         return unless $to_trace;        # Do nothing if global flag not set
 519
 520         my @input = @_;
 521
 522         local $DB::trace = 0;
 523         $DB::trace = 0;          # Quiet 'used only once' message
 524
 525         my $line_number;
 526
 527         # Loop looking up the stack to get the first non-trace caller
 528         my $caller_line;
 529         my $caller_name;
 530         my $i = 0;
 531         do {
 532             $line_number = $caller_line;
 533             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 534             $caller = $main_with_colon unless defined $caller;
 535
 536             $caller_name = $caller;
 537
 538             # get rid of pkg
 539             $caller_name =~ s/.*:://;
 540             if (substr($caller_name, 0, $main_colon_length)
 541                 eq $main_with_colon)
 542             {
 543                 $caller_name = substr($caller_name, $main_colon_length);
 544             }
 545
 546         } until ($caller_name ne 'trace');
 547
 548         # If the stack was empty, we were called from the top level
 549         $caller_name = 'main' if ($caller_name eq ""
 550                                     || $caller_name eq 'trace');
 551
 552         my $output = "";
 553         foreach my $string (@input) {
 554             #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 555             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 556                 $output .= simple_dumper($string);
 557             }
 558             else {
 559                 $string = "$string" if ref $string;
 560                 $string = $UNDEF unless defined $string;
 561                 chomp $string;
 562                 $string = '""' if $string eq "";
 563                 $output .= " " if $output ne ""
 564                                 && $string ne ""
 565                                 && substr($output, -1, 1) ne " "
 566                                 && substr($string, 0, 1) ne " ";
 567                 $output .= $string;
 568             }
 569         }
 570
 571         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 572         print STDERR "$caller_name: " if $print_caller;
 573         print STDERR $output, "\n";
 574         return;
 575     }
 576 }
 577
 578 # This is for a rarely used development feature that allows you to compare two
 579 # versions of the Unicode standard without having to deal with changes caused
 580 # by the code points introduced in the later version.  Change the 0 to a
 581 # string containing a SINGLE dotted Unicode release number (e.g. "2.1").  Only
 582 # code points introduced in that release and earlier will be used; later ones
 583 # are thrown away.  You use the version number of the earliest one you want to
 584 # compare; then run this program on directory structures containing each
 585 # release, and compare the outputs.  These outputs will therefore include only
 586 # the code points common to both releases, and you can see the changes caused
 587 # just by the underlying release semantic changes.  For versions earlier than
 588 # 3.2, you must copy a version of DAge.txt into the directory.
 589 my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
 590 my $compare_versions = DEBUG
 591                        && $string_compare_versions
 592                        && pack "C*", split /\./, $string_compare_versions;
 593
 594 sub uniques {
 595     # Returns non-duplicated input values.  From "Perl Best Practices:
 596     # Encapsulated Cleverness".  p. 455 in first edition.
 597
 598     my %seen;
 599     # Arguably this breaks encapsulation, if the goal is to permit multiple
 600     # distinct objects to stringify to the same value, and be interchangeable.
 601     # However, for this program, no two objects stringify identically, and all
 602     # lists passed to this function are either objects or strings. So this
 603     # doesn't affect correctness, but it does give a couple of percent speedup.
 604     no overloading;
 605     return grep { ! $seen{$_}++ } @_;
 606 }
 607
 608 $0 = File::Spec->canonpath($0);
 609
 610 my $make_test_script = 0;      # ? Should we output a test script
 611 my $write_unchanged_files = 0; # ? Should we update the output files even if
 612                                #    we don't think they have changed
 613 my $use_directory = "";        # ? Should we chdir somewhere.
 614 my $pod_directory;             # input directory to store the pod file.
 615 my $pod_file = 'perluniprops';
 616 my $t_path;                     # Path to the .t test file
 617 my $file_list = 'mktables.lst'; # File to store input and output file names.
 618                                # This is used to speed up the build, by not
 619                                # executing the main body of the program if
 620                                # nothing on the list has changed since the
 621                                # previous build
 622 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 623                                # make a list so that when the pumpking is
 624                                # preparing a release, s/he won't have to do
 625                                # special things
 626 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 627                                # in the input.
 628 my $output_range_counts = $debugging_build;   # ? Should we include the number
 629                                               # of code points in ranges in
 630                                               # the output
 631 my $annotate = 0;              # ? Should character names be in the output
 632
 633 # Verbosity levels; 0 is quiet
 634 my $NORMAL_VERBOSITY = 1;
 635 my $PROGRESS = 2;
 636 my $VERBOSE = 3;
 637
 638 my $verbosity = $NORMAL_VERBOSITY;
 639
 640 # Process arguments
 641 while (@ARGV) {
 642     my $arg = shift @ARGV;
 643     if ($arg eq '-v') {
 644         $verbosity = $VERBOSE;
 645     }
 646     elsif ($arg eq '-p') {
 647         $verbosity = $PROGRESS;
 648         $| = 1;     # Flush buffers as we go.
 649     }
 650     elsif ($arg eq '-q') {
 651         $verbosity = 0;
 652     }
 653     elsif ($arg eq '-w') {
 654         $write_unchanged_files = 1; # update the files even if havent changed
 655     }
 656     elsif ($arg eq '-check') {
 657         my $this = shift @ARGV;
 658         my $ok = shift @ARGV;
 659         if ($this ne $ok) {
 660             print "Skipping as check params are not the same.\n";
 661             exit(0);
 662         }
 663     }
 664     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 665         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 666     }
 667     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 668     {
 669         $make_test_script = 1;
 670     }
 671     elsif ($arg eq '-makelist') {
 672         $make_list = 1;
 673     }
 674     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 675         -d $use_directory or croak "Unknown directory '$use_directory'";
 676     }
 677     elsif ($arg eq '-L') {
 678
 679         # Existence not tested until have chdir'd
 680         $file_list = shift;
 681     }
 682     elsif ($arg eq '-globlist') {
 683         $glob_list = 1;
 684     }
 685     elsif ($arg eq '-c') {
 686         $output_range_counts = ! $output_range_counts
 687     }
 688     elsif ($arg eq '-annotate') {
 689         $annotate = 1;
 690         $debugging_build = 1;
 691         $output_range_counts = 1;
 692     }
 693     else {
 694         my $with_c = 'with';
 695         $with_c .= 'out' if $output_range_counts;   # Complements the state
 696         croak <<END;
 697 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 698           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 699           [-check A B ]
 700   -c          : Output comments $with_c number of code points in ranges
 701   -q          : Quiet Mode: Only output serious warnings.
 702   -p          : Set verbosity level to normal plus show progress.
 703   -v          : Set Verbosity level high:  Show progress and non-serious
 704                 warnings
 705   -w          : Write files regardless
 706   -C dir      : Change to this directory before proceeding. All relative paths
 707                 except those specified by the -P and -T options will be done
 708                 with respect to this directory.
 709   -P dir      : Output $pod_file file to directory 'dir'.
 710   -T path     : Create a test script as 'path'; overrides -maketest
 711   -L filelist : Use alternate 'filelist' instead of standard one
 712   -globlist   : Take as input all non-Test *.txt files in current and sub
 713                 directories
 714   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 715                 overrides -T
 716   -makelist   : Rewrite the file list $file_list based on current setup
 717   -annotate   : Output an annotation for each character in the table files;
 718                 useful for debugging mktables, looking at diffs; but is slow,
 719                 memory intensive; resulting tables are usable but slow and
 720                 very large.
 721   -check A B  : Executes $0 only if A and B are the same
 722 END
 723     }
 724 }
 725
 726 # Stores the most-recently changed file.  If none have changed, can skip the
 727 # build
 728 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 729
 730 # Change directories now, because need to read 'version' early.
 731 if ($use_directory) {
 732     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 733         $pod_directory = File::Spec->rel2abs($pod_directory);
 734     }
 735     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 736         $t_path = File::Spec->rel2abs($t_path);
 737     }
 738     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 739     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 740         $pod_directory = File::Spec->abs2rel($pod_directory);
 741     }
 742     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 743         $t_path = File::Spec->abs2rel($t_path);
 744     }
 745 }
 746
 747 # Get Unicode version into regular and v-string.  This is done now because
 748 # various tables below get populated based on it.  These tables are populated
 749 # here to be near the top of the file, and so easily seeable by those needing
 750 # to modify things.
 751 open my $VERSION, "<", "version"
 752                     or croak "$0: can't open required file 'version': $!\n";
 753 my $string_version = <$VERSION>;
 754 close $VERSION;
 755 chomp $string_version;
 756 my $v_version = pack "C*", split /\./, $string_version;        # v string
 757
 758 # The following are the complete names of properties with property values that
 759 # are known to not match any code points in some versions of Unicode, but that
 760 # may change in the future so they should be matchable, hence an empty file is
 761 # generated for them.
 762 my @tables_that_may_be_empty = (
 763                                 'Joining_Type=Left_Joining',
 764                                 );
 765 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 766 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 767 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 768                                                     if $v_version ge v4.1.0;
 769 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 770                                                     if $v_version ge v6.0.0;
 771
 772 # The lists below are hashes, so the key is the item in the list, and the
 773 # value is the reason why it is in the list.  This makes generation of
 774 # documentation easier.
 775
 776 my %why_suppressed;  # No file generated for these.
 777
 778 # Files aren't generated for empty extraneous properties.  This is arguable.
 779 # Extraneous properties generally come about because a property is no longer
 780 # used in a newer version of Unicode.  If we generated a file without code
 781 # points, programs that used to work on that property will still execute
 782 # without errors.  It just won't ever match (or will always match, with \P{}).
 783 # This means that the logic is now likely wrong.  I (khw) think its better to
 784 # find this out by getting an error message.  Just move them to the table
 785 # above to change this behavior
 786 my %why_suppress_if_empty_warn_if_not = (
 787
 788    # It is the only property that has ever officially been removed from the
 789    # Standard.  The database never contained any code points for it.
 790    'Special_Case_Condition' => 'Obsolete',
 791
 792    # Apparently never official, but there were code points in some versions of
 793    # old-style PropList.txt
 794    'Non_Break' => 'Obsolete',
 795 );
 796
 797 # These would normally go in the warn table just above, but they were changed
 798 # a long time before this program was written, so warnings about them are
 799 # moot.
 800 if ($v_version gt v3.2.0) {
 801     push @tables_that_may_be_empty,
 802                                 'Canonical_Combining_Class=Attached_Below_Left'
 803 }
 804
 805 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
 806 # unless explicitly added.
 807 if ($v_version ge v5.2.0) {
 808     my $unihan = 'Unihan; remove from list if using Unihan';
 809     foreach my $table (qw (
 810                            kAccountingNumeric
 811                            kOtherNumeric
 812                            kPrimaryNumeric
 813                            kCompatibilityVariant
 814                            kIICore
 815                            kIRG_GSource
 816                            kIRG_HSource
 817                            kIRG_JSource
 818                            kIRG_KPSource
 819                            kIRG_MSource
 820                            kIRG_KSource
 821                            kIRG_TSource
 822                            kIRG_USource
 823                            kIRG_VSource
 824                            kRSUnicode
 825                         ))
 826     {
 827         $why_suppress_if_empty_warn_if_not{$table} = $unihan;
 828     }
 829 }
 830
 831 # Enum values for to_output_map() method in the Map_Table package.
 832 my $EXTERNAL_MAP = 1;
 833 my $INTERNAL_MAP = 2;
 834
 835 # To override computed values for writing the map tables for these properties.
 836 # The default for enum map tables is to write them out, so that the Unicode
 837 # .txt files can be removed, but all the data to compute any property value
 838 # for any code point is available in a more compact form.
 839 my %global_to_output_map = (
 840     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 841     # get stuck supporting it if things change.  Since it is a STRING
 842     # property, it normally would be listed in the pod, but INTERNAL_MAP
 843     # suppresses that.
 844     Unicode_1_Name => $INTERNAL_MAP,
 845
 846     Present_In => 0,                # Suppress, as easily computed from Age
 847     Block => 0,                     # Suppress, as Blocks.txt is retained.
 848
 849     # Suppress, as mapping can be found instead from the
 850     # Perl_Decomposition_Mapping file
 851     Decomposition_Type => 0,
 852 );
 853
 854 # Properties that this program ignores.
 855 my @unimplemented_properties;
 856
 857 # With this release, it is automatically handled if the Unihan db is
 858 # downloaded
 859 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
 860
 861 # There are several types of obsolete properties defined by Unicode.  These
 862 # must be hand-edited for every new Unicode release.
 863 my %why_deprecated;  # Generates a deprecated warning message if used.
 864 my %why_stabilized;  # Documentation only
 865 my %why_obsolete;    # Documentation only
 866
 867 {   # Closure
 868     my $simple = 'Perl uses the more complete version of this property';
 869     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 870
 871     my $other_properties = 'other properties';
 872     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 873     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 874
 875     %why_deprecated = (
 876         'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 877         'Jamo_Short_Name' => $contributory,
 878         'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 879         'Other_Alphabetic' => $contributory,
 880         'Other_Default_Ignorable_Code_Point' => $contributory,
 881         'Other_Grapheme_Extend' => $contributory,
 882         'Other_ID_Continue' => $contributory,
 883         'Other_ID_Start' => $contributory,
 884         'Other_Lowercase' => $contributory,
 885         'Other_Math' => $contributory,
 886         'Other_Uppercase' => $contributory,
 887         'Expands_On_NFC' => $why_no_expand,
 888         'Expands_On_NFD' => $why_no_expand,
 889         'Expands_On_NFKC' => $why_no_expand,
 890         'Expands_On_NFKD' => $why_no_expand,
 891     );
 892
 893     %why_suppressed = (
 894         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 895         # contains the same information, but without the algorithmically
 896         # determinable Hangul syllables'.  This file is not published, so it's
 897         # existence is not noted in the comment.
 898         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
 899
 900         'Indic_Matra_Category' => "Provisional",
 901         'Indic_Syllabic_Category' => "Provisional",
 902
 903         # Don't suppress ISO_Comment, as otherwise special handling is needed
 904         # to differentiate between it and gc=c, which can be written as 'isc',
 905         # which is the same characters as ISO_Comment's short name.
 906
 907         'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
 908
 909         'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
 910         'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 911         'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 912         'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
 913
 914         FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
 915     );
 916
 917     foreach my $property (
 918
 919             # The following are suppressed because they were made contributory
 920             # or deprecated by Unicode before Perl ever thought about
 921             # supporting them.
 922             'Jamo_Short_Name',
 923             'Grapheme_Link',
 924             'Expands_On_NFC',
 925             'Expands_On_NFD',
 926             'Expands_On_NFKC',
 927             'Expands_On_NFKD',
 928
 929             # The following are suppressed because they have been marked
 930             # as deprecated for a sufficient amount of time
 931             'Other_Alphabetic',
 932             'Other_Default_Ignorable_Code_Point',
 933             'Other_Grapheme_Extend',
 934             'Other_ID_Continue',
 935             'Other_ID_Start',
 936             'Other_Lowercase',
 937             'Other_Math',
 938             'Other_Uppercase',
 939     ) {
 940         $why_suppressed{$property} = $why_deprecated{$property};
 941     }
 942
 943     # Customize the message for all the 'Other_' properties
 944     foreach my $property (keys %why_deprecated) {
 945         next if (my $main_property = $property) !~ s/^Other_//;
 946         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
 947     }
 948 }
 949
 950 if ($v_version ge 4.0.0) {
 951     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
 952     if ($v_version ge 6.0.0) {
 953         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
 954     }
 955 }
 956 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
 957     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
 958     if ($v_version ge 6.0.0) {
 959         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
 960     }
 961 }
 962
 963 # Probably obsolete forever
 964 if ($v_version ge v4.1.0) {
 965     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
 966 }
 967 if ($v_version ge v6.0.0) {
 968     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
 969     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
 970 }
 971
 972 # This program can create files for enumerated-like properties, such as
 973 # 'Numeric_Type'.  This file would be the same format as for a string
 974 # property, with a mapping from code point to its value, so you could look up,
 975 # for example, the script a code point is in.  But no one so far wants this
 976 # mapping, or they have found another way to get it since this is a new
 977 # feature.  So no file is generated except if it is in this list.
 978 my @output_mapped_properties = split "\n", <<END;
 979 END
 980
 981 # If you are using the Unihan database in a Unicode version before 5.2, you
 982 # need to add the properties that you want to extract from it to this table.
 983 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
 984 # listed, commented out
 985 my @cjk_properties = split "\n", <<'END';
 986 #cjkAccountingNumeric; kAccountingNumeric
 987 #cjkOtherNumeric; kOtherNumeric
 988 #cjkPrimaryNumeric; kPrimaryNumeric
 989 #cjkCompatibilityVariant; kCompatibilityVariant
 990 #cjkIICore ; kIICore
 991 #cjkIRG_GSource; kIRG_GSource
 992 #cjkIRG_HSource; kIRG_HSource
 993 #cjkIRG_JSource; kIRG_JSource
 994 #cjkIRG_KPSource; kIRG_KPSource
 995 #cjkIRG_KSource; kIRG_KSource
 996 #cjkIRG_TSource; kIRG_TSource
 997 #cjkIRG_USource; kIRG_USource
 998 #cjkIRG_VSource; kIRG_VSource
 999 #cjkRSUnicode; kRSUnicode                ; Unicode_Radical_Stroke; URS
1000 END
1001
1002 # Similarly for the property values.  For your convenience, the lines in the
1003 # 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
1004 # '#' marks (for Unicode versions before 5.2)
1005 my @cjk_property_values = split "\n", <<'END';
1006 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1007 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1008 ## @missing: 0000..10FFFF; cjkIICore; <none>
1009 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1010 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1011 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1012 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1013 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1014 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1015 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1016 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1017 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1018 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1019 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1020 END
1021
1022 # The input files don't list every code point.  Those not listed are to be
1023 # defaulted to some value.  Below are hard-coded what those values are for
1024 # non-binary properties as of 5.1.  Starting in 5.0, there are
1025 # machine-parsable comment lines in the files the give the defaults; so this
1026 # list shouldn't have to be extended.  The claim is that all missing entries
1027 # for binary properties will default to 'N'.  Unicode tried to change that in
1028 # 5.2, but the beta period produced enough protest that they backed off.
1029 #
1030 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1031 # be in the form that it expects.  The others may be synonyms.
1032 my $CODE_POINT = '<code point>';
1033 my %default_mapping = (
1034     Age => "Unassigned",
1035     # Bidi_Class => Complicated; set in code
1036     Bidi_Mirroring_Glyph => "",
1037     Block => 'No_Block',
1038     Canonical_Combining_Class => 0,
1039     Case_Folding => $CODE_POINT,
1040     Decomposition_Mapping => $CODE_POINT,
1041     Decomposition_Type => 'None',
1042     East_Asian_Width => "Neutral",
1043     FC_NFKC_Closure => $CODE_POINT,
1044     General_Category => 'Cn',
1045     Grapheme_Cluster_Break => 'Other',
1046     Hangul_Syllable_Type => 'NA',
1047     ISO_Comment => "",
1048     Jamo_Short_Name => "",
1049     Joining_Group => "No_Joining_Group",
1050     # Joining_Type => Complicated; set in code
1051     kIICore => 'N',   #                       Is converted to binary
1052     #Line_Break => Complicated; set in code
1053     Lowercase_Mapping => $CODE_POINT,
1054     Name => "",
1055     Name_Alias => "",
1056     NFC_QC => 'Yes',
1057     NFD_QC => 'Yes',
1058     NFKC_QC => 'Yes',
1059     NFKD_QC => 'Yes',
1060     Numeric_Type => 'None',
1061     Numeric_Value => 'NaN',
1062     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1063     Sentence_Break => 'Other',
1064     Simple_Case_Folding => $CODE_POINT,
1065     Simple_Lowercase_Mapping => $CODE_POINT,
1066     Simple_Titlecase_Mapping => $CODE_POINT,
1067     Simple_Uppercase_Mapping => $CODE_POINT,
1068     Titlecase_Mapping => $CODE_POINT,
1069     Unicode_1_Name => "",
1070     Unicode_Radical_Stroke => "",
1071     Uppercase_Mapping => $CODE_POINT,
1072     Word_Break => 'Other',
1073 );
1074
1075 # Below are files that Unicode furnishes, but this program ignores, and why
1076 my %ignored_files = (
1077     'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1078     'Index.txt' => 'Alphabetical index of Unicode characters',
1079     'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1080     'NamesList.txt' => 'Annotated list of characters',
1081     'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1082     'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1083     'ReadMe.txt' => 'Documentation',
1084     'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized.  This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1085     'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1086     'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1087     'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1088     'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1089     'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1090 );
1091
1092 my %skipped_files;  # List of files that we skip
1093
1094 ### End of externally interesting definitions, except for @input_file_objects
1095
1096 my $HEADER=<<"EOF";
1097 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1098 # This file is machine-generated by $0 from the Unicode
1099 # database, Version $string_version.  Any changes made here will be lost!
1100 EOF
1101
1102 my $INTERNAL_ONLY_HEADER = <<"EOF";
1103
1104 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1105 # This file is for internal use by core Perl only.  The format and even the
1106 # name or existence of this file are subject to change without notice.  Don't
1107 # use it directly.
1108 EOF
1109
1110 my $DEVELOPMENT_ONLY=<<"EOF";
1111 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1112 # This file contains information artificially constrained to code points
1113 # present in Unicode release $string_compare_versions.
1114 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1115 # not be used for production.
1116
1117 EOF
1118
1119 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1120 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1121 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1122
1123 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1124 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1125 # to decrease backtracking.  The first regex allows the code point to be at
1126 # the end of a word, but to work properly, the word shouldn't end with a valid
1127 # hex character.  The second one won't match a code point at the end of a
1128 # word, and doesn't have the run-on issue
1129 my $run_on_code_point_re =
1130             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1131 my $code_point_re = qr/\b$run_on_code_point_re/;
1132
1133 # This matches the beginning of the line in the Unicode db files that give the
1134 # defaults for code points not listed (i.e., missing) in the file.  The code
1135 # depends on this ending with a semi-colon, so it can assume it is a valid
1136 # field when the line is split() by semi-colons
1137 my $missing_defaults_prefix =
1138             qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1139
1140 # Property types.  Unicode has more types, but these are sufficient for our
1141 # purposes.
1142 my $UNKNOWN = -1;   # initialized to illegal value
1143 my $NON_STRING = 1; # Either binary or enum
1144 my $BINARY = 2;
1145 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1146                        # tables, additional true and false tables are
1147                        # generated so that false is anything matching the
1148                        # default value, and true is everything else.
1149 my $ENUM = 4;       # Include catalog
1150 my $STRING = 5;     # Anything else: string or misc
1151
1152 # Some input files have lines that give default values for code points not
1153 # contained in the file.  Sometimes these should be ignored.
1154 my $NO_DEFAULTS = 0;        # Must evaluate to false
1155 my $NOT_IGNORED = 1;
1156 my $IGNORED = 2;
1157
1158 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1159 # and will appear in the main body of the tables in the output files, but
1160 # there are other types of ranges as well, listed below, that are specially
1161 # handled.   There are pseudo-types as well that will never be stored as a
1162 # type, but will affect the calculation of the type.
1163
1164 # 0 is for normal, non-specials
1165 my $MULTI_CP = 1;           # Sequence of more than code point
1166 my $HANGUL_SYLLABLE = 2;
1167 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1168 my $NULL = 4;               # The map is to the null string; utf8.c can't
1169                             # handle these, nor is there an accepted syntax
1170                             # for them in \p{} constructs
1171 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1172                              # otherwise be $MULTI_CP type are instead type 0
1173
1174 # process_generic_property_file() can accept certain overrides in its input.
1175 # Each of these must begin AND end with $CMD_DELIM.
1176 my $CMD_DELIM = "\a";
1177 my $REPLACE_CMD = 'replace';    # Override the Replace
1178 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1179
1180 my $NO = 0;
1181 my $YES = 1;
1182
1183 # Values for the Replace argument to add_range.
1184 # $NO                      # Don't replace; add only the code points not
1185                            # already present.
1186 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1187                            # the comments at the subroutine definition.
1188 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1189 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1190                            # already there
1191 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1192                            # already there
1193 my $CROAK = 6;             # Die with an error if is already there
1194
1195 # Flags to give property statuses.  The phrases are to remind maintainers that
1196 # if the flag is changed, the indefinite article referring to it in the
1197 # documentation may need to be as well.
1198 my $NORMAL = "";
1199 my $DEPRECATED = 'D';
1200 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1201 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1202 my $DISCOURAGED = 'X';
1203 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1204 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1205 my $STRICTER = 'T';
1206 my $a_bold_stricter = "a 'B<$STRICTER>'";
1207 my $A_bold_stricter = "A 'B<$STRICTER>'";
1208 my $STABILIZED = 'S';
1209 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1210 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1211 my $OBSOLETE = 'O';
1212 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1213 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1214
1215 my %status_past_participles = (
1216     $DISCOURAGED => 'discouraged',
1217     $STABILIZED => 'stabilized',
1218     $OBSOLETE => 'obsolete',
1219     $DEPRECATED => 'deprecated',
1220 );
1221
1222 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1223 # externally documented.
1224 my $ORDINARY = 0;       # The normal fate.
1225 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1226                         # but there is a file written that can be used to
1227                         # reconstruct this table
1228 my $SUPPRESSED = 3;     # The file for this table is not written out.
1229 my $INTERNAL_ONLY = 4;  # The file for this table is written out, but it is
1230                         # for Perl's internal use only
1231 my $PLACEHOLDER = 5;    # A property that is defined as a placeholder in a
1232                         # Unicode version that doesn't have it, but we need it
1233                         # to be defined, if empty, to have things work.
1234                         # Implies no pod entry generated
1235
1236 # The format of the values of the tables:
1237 my $EMPTY_FORMAT = "";
1238 my $BINARY_FORMAT = 'b';
1239 my $DECIMAL_FORMAT = 'd';
1240 my $FLOAT_FORMAT = 'f';
1241 my $INTEGER_FORMAT = 'i';
1242 my $HEX_FORMAT = 'x';
1243 my $RATIONAL_FORMAT = 'r';
1244 my $STRING_FORMAT = 's';
1245 my $DECOMP_STRING_FORMAT = 'c';
1246 my $STRING_WHITE_SPACE_LIST = 'sw';
1247
1248 my %map_table_formats = (
1249     $BINARY_FORMAT => 'binary',
1250     $DECIMAL_FORMAT => 'single decimal digit',
1251     $FLOAT_FORMAT => 'floating point number',
1252     $INTEGER_FORMAT => 'integer',
1253     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1254     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1255     $STRING_FORMAT => 'string',
1256     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1257     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1258 );
1259
1260 # Unicode didn't put such derived files in a separate directory at first.
1261 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1262 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1263 my $AUXILIARY = 'auxiliary';
1264
1265 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1266 # and into UCD.pl for the use of UCD.pm
1267 my %loose_to_file_of;       # loosely maps table names to their respective
1268                             # files
1269 my %stricter_to_file_of;    # same; but for stricter mapping.
1270 my %loose_property_to_file_of; # Maps a loose property name to its map file
1271 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1272                             # in the hash %utf8::SwashInfo
1273 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1274                              # their rational equivalent
1275 my %loose_property_name_of; # Loosely maps (non_string) property names to
1276                             # standard form
1277 my %string_property_loose_to_name; # Same, for string properties.
1278 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1279                             # the property name in standard loose form, and
1280                             # 'value' is the default value for that property,
1281                             # also in standard loose form.
1282 my %loose_to_standard_value; # loosely maps table names to the canonical
1283                             # alias for them
1284 my %ambiguous_names;        # keys are alias names (in standard form) that
1285                             # have more than one possible meaning.
1286 my %prop_aliases;           # Keys are standard property name; values are each
1287                             # one's aliases
1288 my %prop_value_aliases;     # Keys of top level are standard property name;
1289                             # values are keys to another hash,  Each one is
1290                             # one of the property's values, in standard form.
1291                             # The values are that prop-val's aliases.
1292 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1293
1294 # Most properties are immune to caseless matching, otherwise you would get
1295 # nonsensical results, as properties are a function of a code point, not
1296 # everything that is caselessly equivalent to that code point.  For example,
1297 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1298 # be true because 's' and 'S' are equivalent caselessly.  However,
1299 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1300 # extend that concept to those very few properties that are like this.  Each
1301 # such property will match the full range caselessly.  They are hard-coded in
1302 # the program; it's not worth trying to make it general as it's extremely
1303 # unlikely that they will ever change.
1304 my %caseless_equivalent_to;
1305
1306 # These constants names and values were taken from the Unicode standard,
1307 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1308 # syllables.  The '_string' versions are so generated tables can retain the
1309 # hex format, which is the more familiar value
1310 my $SBase_string = "0xAC00";
1311 my $SBase = CORE::hex $SBase_string;
1312 my $LBase_string = "0x1100";
1313 my $LBase = CORE::hex $LBase_string;
1314 my $VBase_string = "0x1161";
1315 my $VBase = CORE::hex $VBase_string;
1316 my $TBase_string = "0x11A7";
1317 my $TBase = CORE::hex $TBase_string;
1318 my $SCount = 11172;
1319 my $LCount = 19;
1320 my $VCount = 21;
1321 my $TCount = 28;
1322 my $NCount = $VCount * $TCount;
1323
1324 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1325 # with the above published constants.
1326 my %Jamo;
1327 my %Jamo_L;     # Leading consonants
1328 my %Jamo_V;     # Vowels
1329 my %Jamo_T;     # Trailing consonants
1330
1331 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1332 # The key is the base name of the code point, and the value is an
1333 # array giving all the ranges that use this base name.  Each range
1334 # is actually a hash giving the 'low' and 'high' values of it.
1335 my %names_ending_in_code_point;
1336 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1337                                         # removed from the names
1338 # Inverse mapping.  The list of ranges that have these kinds of
1339 # names.  Each element contains the low, high, and base names in an
1340 # anonymous hash.
1341 my @code_points_ending_in_code_point;
1342
1343 # Boolean: does this Unicode version have the hangul syllables, and are we
1344 # writing out a table for them?
1345 my $has_hangul_syllables = 0;
1346
1347 # Does this Unicode version have code points whose names end in their
1348 # respective code points, and are we writing out a table for them?  0 for no;
1349 # otherwise points to first property that a table is needed for them, so that
1350 # if multiple tables are needed, we don't create duplicates
1351 my $needing_code_points_ending_in_code_point = 0;
1352
1353 my @backslash_X_tests;     # List of tests read in for testing \X
1354 my @unhandled_properties;  # Will contain a list of properties found in
1355                            # the input that we didn't process.
1356 my @match_properties;      # Properties that have match tables, to be
1357                            # listed in the pod
1358 my @map_properties;        # Properties that get map files written
1359 my @named_sequences;       # NamedSequences.txt contents.
1360 my %potential_files;       # Generated list of all .txt files in the directory
1361                            # structure so we can warn if something is being
1362                            # ignored.
1363 my @files_actually_output; # List of files we generated.
1364 my @more_Names;            # Some code point names are compound; this is used
1365                            # to store the extra components of them.
1366 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1367                            # the minimum before we consider it equivalent to a
1368                            # candidate rational
1369 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1370
1371 # These store references to certain commonly used property objects
1372 my $gc;
1373 my $perl;
1374 my $block;
1375 my $perl_charname;
1376 my $print;
1377 my $Any;
1378 my $script;
1379
1380 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1381 my $has_In_conflicts = 0;
1382 my $has_Is_conflicts = 0;
1383
1384 sub internal_file_to_platform ($) {
1385     # Convert our file paths which have '/' separators to those of the
1386     # platform.
1387
1388     my $file = shift;
1389     return undef unless defined $file;
1390
1391     return File::Spec->join(split '/', $file);
1392 }
1393
1394 sub file_exists ($) {   # platform independent '-e'.  This program internally
1395                         # uses slash as a path separator.
1396     my $file = shift;
1397     return 0 if ! defined $file;
1398     return -e internal_file_to_platform($file);
1399 }
1400
1401 sub objaddr($) {
1402     # Returns the address of the blessed input object.
1403     # It doesn't check for blessedness because that would do a string eval
1404     # every call, and the program is structured so that this is never called
1405     # for a non-blessed object.
1406
1407     no overloading; # If overloaded, numifying below won't work.
1408
1409     # Numifying a ref gives its address.
1410     return pack 'J', $_[0];
1411 }
1412
1413 # These are used only if $annotate is true.
1414 # The entire range of Unicode characters is examined to populate these
1415 # after all the input has been processed.  But most can be skipped, as they
1416 # have the same descriptive phrases, such as being unassigned
1417 my @viacode;            # Contains the 1 million character names
1418 my @printable;          # boolean: And are those characters printable?
1419 my @annotate_char_type; # Contains a type of those characters, specifically
1420                         # for the purposes of annotation.
1421 my $annotate_ranges;    # A map of ranges of code points that have the same
1422                         # name for the purposes of annotation.  They map to the
1423                         # upper edge of the range, so that the end point can
1424                         # be immediately found.  This is used to skip ahead to
1425                         # the end of a range, and avoid processing each
1426                         # individual code point in it.
1427 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1428                                    # characters, but excluding those which are
1429                                    # also noncharacter code points
1430
1431 # The annotation types are an extension of the regular range types, though
1432 # some of the latter are folded into one.  Make the new types negative to
1433 # avoid conflicting with the regular types
1434 my $SURROGATE_TYPE = -1;
1435 my $UNASSIGNED_TYPE = -2;
1436 my $PRIVATE_USE_TYPE = -3;
1437 my $NONCHARACTER_TYPE = -4;
1438 my $CONTROL_TYPE = -5;
1439 my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
1440
1441 sub populate_char_info ($) {
1442     # Used only with the $annotate option.  Populates the arrays with the
1443     # input code point's info that are needed for outputting more detailed
1444     # comments.  If calling context wants a return, it is the end point of
1445     # any contiguous range of characters that share essentially the same info
1446
1447     my $i = shift;
1448     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1449
1450     $viacode[$i] = $perl_charname->value_of($i) || "";
1451
1452     # A character is generally printable if Unicode says it is,
1453     # but below we make sure that most Unicode general category 'C' types
1454     # aren't.
1455     $printable[$i] = $print->contains($i);
1456
1457     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1458
1459     # Only these two regular types are treated specially for annotations
1460     # purposes
1461     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1462                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1463
1464     # Give a generic name to all code points that don't have a real name.
1465     # We output ranges, if applicable, for these.  Also calculate the end
1466     # point of the range.
1467     my $end;
1468     if (! $viacode[$i]) {
1469         if ($gc-> table('Surrogate')->contains($i)) {
1470             $viacode[$i] = 'Surrogate';
1471             $annotate_char_type[$i] = $SURROGATE_TYPE;
1472             $printable[$i] = 0;
1473             $end = $gc->table('Surrogate')->containing_range($i)->end;
1474         }
1475         elsif ($gc-> table('Private_use')->contains($i)) {
1476             $viacode[$i] = 'Private Use';
1477             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1478             $printable[$i] = 0;
1479             $end = $gc->table('Private_Use')->containing_range($i)->end;
1480         }
1481         elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1482                                                                 contains($i))
1483         {
1484             $viacode[$i] = 'Noncharacter';
1485             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1486             $printable[$i] = 0;
1487             $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1488                                                     containing_range($i)->end;
1489         }
1490         elsif ($gc-> table('Control')->contains($i)) {
1491             $viacode[$i] = 'Control';
1492             $annotate_char_type[$i] = $CONTROL_TYPE;
1493             $printable[$i] = 0;
1494             $end = 0x81 if $i == 0x80;  # Hard-code this one known case
1495         }
1496         elsif ($gc-> table('Unassigned')->contains($i)) {
1497             $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1498             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1499             $printable[$i] = 0;
1500
1501             # Because we name the unassigned by the blocks they are in, it
1502             # can't go past the end of that block, and it also can't go past
1503             # the unassigned range it is in.  The special table makes sure
1504             # that the non-characters, which are unassigned, are separated
1505             # out.
1506             $end = min($block->containing_range($i)->end,
1507                        $unassigned_sans_noncharacters-> containing_range($i)->
1508                                                                          end);
1509         }
1510         else {
1511             Carp::my_carp_bug("Can't figure out how to annotate "
1512                               . sprintf("U+%04X", $i)
1513                               . ".  Proceeding anyway.");
1514             $viacode[$i] = 'UNKNOWN';
1515             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1516             $printable[$i] = 0;
1517         }
1518     }
1519
1520     # Here, has a name, but if it's one in which the code point number is
1521     # appended to the name, do that.
1522     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1523         $viacode[$i] .= sprintf("-%04X", $i);
1524         $end = $perl_charname->containing_range($i)->end;
1525     }
1526
1527     # And here, has a name, but if it's a hangul syllable one, replace it with
1528     # the correct name from the Unicode algorithm
1529     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1530         use integer;
1531         my $SIndex = $i - $SBase;
1532         my $L = $LBase + $SIndex / $NCount;
1533         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1534         my $T = $TBase + $SIndex % $TCount;
1535         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1536         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1537         $end = $perl_charname->containing_range($i)->end;
1538     }
1539
1540     return if ! defined wantarray;
1541     return $i if ! defined $end;    # If not a range, return the input
1542
1543     # Save this whole range so can find the end point quickly
1544     $annotate_ranges->add_map($i, $end, $end);
1545
1546     return $end;
1547 }
1548
1549 # Commented code below should work on Perl 5.8.
1550 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1551 ## the native perl version of it (which is what would operate under miniperl)
1552 ## is extremely slow, as it does a string eval every call.
1553 #my $has_fast_scalar_util = $\18 !~ /miniperl/
1554 #                            && defined eval "require Scalar::Util";
1555 #
1556 #sub objaddr($) {
1557 #    # Returns the address of the blessed input object.  Uses the XS version if
1558 #    # available.  It doesn't check for blessedness because that would do a
1559 #    # string eval every call, and the program is structured so that this is
1560 #    # never called for a non-blessed object.
1561 #
1562 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1563 #
1564 #    # Check at least that is a ref.
1565 #    my $pkg = ref($_[0]) or return undef;
1566 #
1567 #    # Change to a fake package to defeat any overloaded stringify
1568 #    bless $_[0], 'main::Fake';
1569 #
1570 #    # Numifying a ref gives its address.
1571 #    my $addr = pack 'J', $_[0];
1572 #
1573 #    # Return to original class
1574 #    bless $_[0], $pkg;
1575 #    return $addr;
1576 #}
1577
1578 sub max ($$) {
1579     my $a = shift;
1580     my $b = shift;
1581     return $a if $a >= $b;
1582     return $b;
1583 }
1584
1585 sub min ($$) {
1586     my $a = shift;
1587     my $b = shift;
1588     return $a if $a <= $b;
1589     return $b;
1590 }
1591
1592 sub clarify_number ($) {
1593     # This returns the input number with underscores inserted every 3 digits
1594     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1595     # checked.
1596
1597     my $number = shift;
1598     my $pos = length($number) - 3;
1599     return $number if $pos <= 1;
1600     while ($pos > 0) {
1601         substr($number, $pos, 0) = '_';
1602         $pos -= 3;
1603     }
1604     return $number;
1605 }
1606
1607
1608 package Carp;
1609
1610 # These routines give a uniform treatment of messages in this program.  They
1611 # are placed in the Carp package to cause the stack trace to not include them,
1612 # although an alternative would be to use another package and set @CARP_NOT
1613 # for it.
1614
1615 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1616
1617 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1618 # and overload trying to load Scalar:Util under miniperl.  See
1619 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1620 undef $overload::VERSION;
1621
1622 sub my_carp {
1623     my $message = shift || "";
1624     my $nofold = shift || 0;
1625
1626     if ($message) {
1627         $message = main::join_lines($message);
1628         $message =~ s/^$0: *//;     # Remove initial program name
1629         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1630         $message = "\n$0: $message;";
1631
1632         # Fold the message with program name, semi-colon end punctuation
1633         # (which looks good with the message that carp appends to it), and a
1634         # hanging indent for continuation lines.
1635         $message = main::simple_fold($message, "", 4) unless $nofold;
1636         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1637                                     # appends is to the same line
1638     }
1639
1640     return $message if defined wantarray;   # If a caller just wants the msg
1641
1642     carp $message;
1643     return;
1644 }
1645
1646 sub my_carp_bug {
1647     # This is called when it is clear that the problem is caused by a bug in
1648     # this program.
1649
1650     my $message = shift;
1651     $message =~ s/^$0: *//;
1652     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1653     carp $message;
1654     return;
1655 }
1656
1657 sub carp_too_few_args {
1658     if (@_ != 2) {
1659         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1660         return;
1661     }
1662
1663     my $args_ref = shift;
1664     my $count = shift;
1665
1666     my_carp_bug("Need at least $count arguments to "
1667         . (caller 1)[3]
1668         . ".  Instead got: '"
1669         . join ', ', @$args_ref
1670         . "'.  No action taken.");
1671     return;
1672 }
1673
1674 sub carp_extra_args {
1675     my $args_ref = shift;
1676     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1677
1678     unless (ref $args_ref) {
1679         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1680         return;
1681     }
1682     my ($package, $file, $line) = caller;
1683     my $subroutine = (caller 1)[3];
1684
1685     my $list;
1686     if (ref $args_ref eq 'HASH') {
1687         foreach my $key (keys %$args_ref) {
1688             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1689         }
1690         $list = join ', ', each %{$args_ref};
1691     }
1692     elsif (ref $args_ref eq 'ARRAY') {
1693         foreach my $arg (@$args_ref) {
1694             $arg = $UNDEF unless defined $arg;
1695         }
1696         $list = join ', ', @$args_ref;
1697     }
1698     else {
1699         my_carp_bug("Can't cope with ref "
1700                 . ref($args_ref)
1701                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1702         return;
1703     }
1704
1705     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1706     return;
1707 }
1708
1709 package main;
1710
1711 { # Closure
1712
1713     # This program uses the inside-out method for objects, as recommended in
1714     # "Perl Best Practices".  This closure aids in generating those.  There
1715     # are two routines.  setup_package() is called once per package to set
1716     # things up, and then set_access() is called for each hash representing a
1717     # field in the object.  These routines arrange for the object to be
1718     # properly destroyed when no longer used, and for standard accessor
1719     # functions to be generated.  If you need more complex accessors, just
1720     # write your own and leave those accesses out of the call to set_access().
1721     # More details below.
1722
1723     my %constructor_fields; # fields that are to be used in constructors; see
1724                             # below
1725
1726     # The values of this hash will be the package names as keys to other
1727     # hashes containing the name of each field in the package as keys, and
1728     # references to their respective hashes as values.
1729     my %package_fields;
1730
1731     sub setup_package {
1732         # Sets up the package, creating standard DESTROY and dump methods
1733         # (unless already defined).  The dump method is used in debugging by
1734         # simple_dumper().
1735         # The optional parameters are:
1736         #   a)  a reference to a hash, that gets populated by later
1737         #       set_access() calls with one of the accesses being
1738         #       'constructor'.  The caller can then refer to this, but it is
1739         #       not otherwise used by these two routines.
1740         #   b)  a reference to a callback routine to call during destruction
1741         #       of the object, before any fields are actually destroyed
1742
1743         my %args = @_;
1744         my $constructor_ref = delete $args{'Constructor_Fields'};
1745         my $destroy_callback = delete $args{'Destroy_Callback'};
1746         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1747
1748         my %fields;
1749         my $package = (caller)[0];
1750
1751         $package_fields{$package} = \%fields;
1752         $constructor_fields{$package} = $constructor_ref;
1753
1754         unless ($package->can('DESTROY')) {
1755             my $destroy_name = "${package}::DESTROY";
1756             no strict "refs";
1757
1758             # Use typeglob to give the anonymous subroutine the name we want
1759             *$destroy_name = sub {
1760                 my $self = shift;
1761                 my $addr = do { no overloading; pack 'J', $self; };
1762
1763                 $self->$destroy_callback if $destroy_callback;
1764                 foreach my $field (keys %{$package_fields{$package}}) {
1765                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1766                     delete $package_fields{$package}{$field}{$addr};
1767                 }
1768                 return;
1769             }
1770         }
1771
1772         unless ($package->can('dump')) {
1773             my $dump_name = "${package}::dump";
1774             no strict "refs";
1775             *$dump_name = sub {
1776                 my $self = shift;
1777                 return dump_inside_out($self, $package_fields{$package}, @_);
1778             }
1779         }
1780         return;
1781     }
1782
1783     sub set_access {
1784         # Arrange for the input field to be garbage collected when no longer
1785         # needed.  Also, creates standard accessor functions for the field
1786         # based on the optional parameters-- none if none of these parameters:
1787         #   'addable'    creates an 'add_NAME()' accessor function.
1788         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1789         #                function.
1790         #   'settable'   creates a 'set_NAME()' accessor function.
1791         #   'constructor' doesn't create an accessor function, but adds the
1792         #                field to the hash that was previously passed to
1793         #                setup_package();
1794         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1795         # 'add' etc. all mean 'addable'.
1796         # The read accessor function will work on both array and scalar
1797         # values.  If another accessor in the parameter list is 'a', the read
1798         # access assumes an array.  You can also force it to be array access
1799         # by specifying 'readable_array' instead of 'readable'
1800         #
1801         # A sort-of 'protected' access can be set-up by preceding the addable,
1802         # readable or settable with some initial portion of 'protected_' (but,
1803         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1804         # "protection" is only by convention.  All that happens is that the
1805         # accessor functions' names begin with an underscore.  So instead of
1806         # calling set_foo, the call is _set_foo.  (Real protection could be
1807         # accomplished by having a new subroutine, end_package, called at the
1808         # end of each package, and then storing the __LINE__ ranges and
1809         # checking them on every accessor.  But that is way overkill.)
1810
1811         # We create anonymous subroutines as the accessors and then use
1812         # typeglobs to assign them to the proper package and name
1813
1814         my $name = shift;   # Name of the field
1815         my $field = shift;  # Reference to the inside-out hash containing the
1816                             # field
1817
1818         my $package = (caller)[0];
1819
1820         if (! exists $package_fields{$package}) {
1821             croak "$0: Must call 'setup_package' before 'set_access'";
1822         }
1823
1824         # Stash the field so DESTROY can get it.
1825         $package_fields{$package}{$name} = $field;
1826
1827         # Remaining arguments are the accessors.  For each...
1828         foreach my $access (@_) {
1829             my $access = lc $access;
1830
1831             my $protected = "";
1832
1833             # Match the input as far as it goes.
1834             if ($access =~ /^(p[^_]*)_/) {
1835                 $protected = $1;
1836                 if (substr('protected_', 0, length $protected)
1837                     eq $protected)
1838                 {
1839
1840                     # Add 1 for the underscore not included in $protected
1841                     $access = substr($access, length($protected) + 1);
1842                     $protected = '_';
1843                 }
1844                 else {
1845                     $protected = "";
1846                 }
1847             }
1848
1849             if (substr('addable', 0, length $access) eq $access) {
1850                 my $subname = "${package}::${protected}add_$name";
1851                 no strict "refs";
1852
1853                 # add_ accessor.  Don't add if already there, which we
1854                 # determine using 'eq' for scalars and '==' otherwise.
1855                 *$subname = sub {
1856                     use strict "refs";
1857                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1858                     my $self = shift;
1859                     my $value = shift;
1860                     my $addr = do { no overloading; pack 'J', $self; };
1861                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1862                     if (ref $value) {
1863                         return if grep { $value == $_ } @{$field->{$addr}};
1864                     }
1865                     else {
1866                         return if grep { $value eq $_ } @{$field->{$addr}};
1867                     }
1868                     push @{$field->{$addr}}, $value;
1869                     return;
1870                 }
1871             }
1872             elsif (substr('constructor', 0, length $access) eq $access) {
1873                 if ($protected) {
1874                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
1875                 }
1876                 else {
1877                     $constructor_fields{$package}{$name} = $field;
1878                 }
1879             }
1880             elsif (substr('readable_array', 0, length $access) eq $access) {
1881
1882                 # Here has read access.  If one of the other parameters for
1883                 # access is array, or this one specifies array (by being more
1884                 # than just 'readable_'), then create a subroutine that
1885                 # assumes the data is an array.  Otherwise just a scalar
1886                 my $subname = "${package}::${protected}$name";
1887                 if (grep { /^a/i } @_
1888                     or length($access) > length('readable_'))
1889                 {
1890                     no strict "refs";
1891                     *$subname = sub {
1892                         use strict "refs";
1893                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1894                         my $addr = do { no overloading; pack 'J', $_[0]; };
1895                         if (ref $field->{$addr} ne 'ARRAY') {
1896                             my $type = ref $field->{$addr};
1897                             $type = 'scalar' unless $type;
1898                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
1899                             return;
1900                         }
1901                         return scalar @{$field->{$addr}} unless wantarray;
1902
1903                         # Make a copy; had problems with caller modifying the
1904                         # original otherwise
1905                         my @return = @{$field->{$addr}};
1906                         return @return;
1907                     }
1908                 }
1909                 else {
1910
1911                     # Here not an array value, a simpler function.
1912                     no strict "refs";
1913                     *$subname = sub {
1914                         use strict "refs";
1915                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1916                         no overloading;
1917                         return $field->{pack 'J', $_[0]};
1918                     }
1919                 }
1920             }
1921             elsif (substr('settable', 0, length $access) eq $access) {
1922                 my $subname = "${package}::${protected}set_$name";
1923                 no strict "refs";
1924                 *$subname = sub {
1925                     use strict "refs";
1926                     if (main::DEBUG) {
1927                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1928                         Carp::carp_extra_args(\@_) if @_ > 2;
1929                     }
1930                     # $self is $_[0]; $value is $_[1]
1931                     no overloading;
1932                     $field->{pack 'J', $_[0]} = $_[1];
1933                     return;
1934                 }
1935             }
1936             else {
1937                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
1938             }
1939         }
1940         return;
1941     }
1942 }
1943
1944 package Input_file;
1945
1946 # All input files use this object, which stores various attributes about them,
1947 # and provides for convenient, uniform handling.  The run method wraps the
1948 # processing.  It handles all the bookkeeping of opening, reading, and closing
1949 # the file, returning only significant input lines.
1950 #
1951 # Each object gets a handler which processes the body of the file, and is
1952 # called by run().  Most should use the generic, default handler, which has
1953 # code scrubbed to handle things you might not expect.  A handler should
1954 # basically be a while(next_line()) {...} loop.
1955 #
1956 # You can also set up handlers to
1957 #   1) call before the first line is read for pre processing
1958 #   2) call to adjust each line of the input before the main handler gets them
1959 #   3) call upon EOF before the main handler exits its loop
1960 #   4) call at the end for post processing
1961 #
1962 # $_ is used to store the input line, and is to be filtered by the
1963 # each_line_handler()s.  So, if the format of the line is not in the desired
1964 # format for the main handler, these are used to do that adjusting.  They can
1965 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1966 # so the $_ output of one is used as the input to the next.  None of the other
1967 # handlers are stackable, but could easily be changed to be so.
1968 #
1969 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
1970 # which insert the parameters as lines to be processed before the next input
1971 # file line is read.  This allows the EOF handler to flush buffers, for
1972 # example.  The difference between the two routines is that the lines inserted
1973 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
1974 # called it from such a handler, you would get infinite recursion.)  Lines
1975 # inserted by insert_adjusted_lines() go directly to the main handler without
1976 # any adjustments.  If the  post-processing handler calls any of these, there
1977 # will be no effect.  Some error checking for these conditions could be added,
1978 # but it hasn't been done.
1979 #
1980 # carp_bad_line() should be called to warn of bad input lines, which clears $_
1981 # to prevent further processing of the line.  This routine will output the
1982 # message as a warning once, and then keep a count of the lines that have the
1983 # same message, and output that count at the end of the file's processing.
1984 # This keeps the number of messages down to a manageable amount.
1985 #
1986 # get_missings() should be called to retrieve any @missing input lines.
1987 # Messages will be raised if this isn't done if the options aren't to ignore
1988 # missings.
1989
1990 sub trace { return main::trace(@_); }
1991
1992 { # Closure
1993     # Keep track of fields that are to be put into the constructor.
1994     my %constructor_fields;
1995
1996     main::setup_package(Constructor_Fields => \%constructor_fields);
1997
1998     my %file; # Input file name, required
1999     main::set_access('file', \%file, qw{ c r });
2000
2001     my %first_released; # Unicode version file was first released in, required
2002     main::set_access('first_released', \%first_released, qw{ c r });
2003
2004     my %handler;    # Subroutine to process the input file, defaults to
2005                     # 'process_generic_property_file'
2006     main::set_access('handler', \%handler, qw{ c });
2007
2008     my %property;
2009     # name of property this file is for.  defaults to none, meaning not
2010     # applicable, or is otherwise determinable, for example, from each line.
2011     main::set_access('property', \%property, qw{ c });
2012
2013     my %optional;
2014     # If this is true, the file is optional.  If not present, no warning is
2015     # output.  If it is present, the string given by this parameter is
2016     # evaluated, and if false the file is not processed.
2017     main::set_access('optional', \%optional, 'c', 'r');
2018
2019     my %non_skip;
2020     # This is used for debugging, to skip processing of all but a few input
2021     # files.  Add 'non_skip => 1' to the constructor for those files you want
2022     # processed when you set the $debug_skip global.
2023     main::set_access('non_skip', \%non_skip, 'c');
2024
2025     my %skip;
2026     # This is used to skip processing of this input file semi-permanently,
2027     # when it evaluates to true.  The value should be the reason the file is
2028     # being skipped.  It is used for files that we aren't planning to process
2029     # anytime soon, but want to allow to be in the directory and not raise a
2030     # message that we are not handling.  Mostly for test files.  This is in
2031     # contrast to the non_skip element, which is supposed to be used very
2032     # temporarily for debugging.  Sets 'optional' to 1.  Also, files that we
2033     # pretty much will never look at can be placed in the global
2034     # %ignored_files instead.  Ones used here will be added to %skipped files
2035     main::set_access('skip', \%skip, 'c');
2036
2037     my %each_line_handler;
2038     # list of subroutines to look at and filter each non-comment line in the
2039     # file.  defaults to none.  The subroutines are called in order, each is
2040     # to adjust $_ for the next one, and the final one adjusts it for
2041     # 'handler'
2042     main::set_access('each_line_handler', \%each_line_handler, 'c');
2043
2044     my %has_missings_defaults;
2045     # ? Are there lines in the file giving default values for code points
2046     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2047     # the norm, but IGNORED means it has such lines, but the handler doesn't
2048     # use them.  Having these three states allows us to catch changes to the
2049     # UCD that this program should track
2050     main::set_access('has_missings_defaults',
2051                                         \%has_missings_defaults, qw{ c r });
2052
2053     my %pre_handler;
2054     # Subroutine to call before doing anything else in the file.  If undef, no
2055     # such handler is called.
2056     main::set_access('pre_handler', \%pre_handler, qw{ c });
2057
2058     my %eof_handler;
2059     # Subroutine to call upon getting an EOF on the input file, but before
2060     # that is returned to the main handler.  This is to allow buffers to be
2061     # flushed.  The handler is expected to call insert_lines() or
2062     # insert_adjusted() with the buffered material
2063     main::set_access('eof_handler', \%eof_handler, qw{ c r });
2064
2065     my %post_handler;
2066     # Subroutine to call after all the lines of the file are read in and
2067     # processed.  If undef, no such handler is called.
2068     main::set_access('post_handler', \%post_handler, qw{ c });
2069
2070     my %progress_message;
2071     # Message to print to display progress in lieu of the standard one
2072     main::set_access('progress_message', \%progress_message, qw{ c });
2073
2074     my %handle;
2075     # cache open file handle, internal.  Is undef if file hasn't been
2076     # processed at all, empty if has;
2077     main::set_access('handle', \%handle);
2078
2079     my %added_lines;
2080     # cache of lines added virtually to the file, internal
2081     main::set_access('added_lines', \%added_lines);
2082
2083     my %errors;
2084     # cache of errors found, internal
2085     main::set_access('errors', \%errors);
2086
2087     my %missings;
2088     # storage of '@missing' defaults lines
2089     main::set_access('missings', \%missings);
2090
2091     sub new {
2092         my $class = shift;
2093
2094         my $self = bless \do{ my $anonymous_scalar }, $class;
2095         my $addr = do { no overloading; pack 'J', $self; };
2096
2097         # Set defaults
2098         $handler{$addr} = \&main::process_generic_property_file;
2099         $non_skip{$addr} = 0;
2100         $skip{$addr} = 0;
2101         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2102         $handle{$addr} = undef;
2103         $added_lines{$addr} = [ ];
2104         $each_line_handler{$addr} = [ ];
2105         $errors{$addr} = { };
2106         $missings{$addr} = [ ];
2107
2108         # Two positional parameters.
2109         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2110         $file{$addr} = main::internal_file_to_platform(shift);
2111         $first_released{$addr} = shift;
2112
2113         # The rest of the arguments are key => value pairs
2114         # %constructor_fields has been set up earlier to list all possible
2115         # ones.  Either set or push, depending on how the default has been set
2116         # up just above.
2117         my %args = @_;
2118         foreach my $key (keys %args) {
2119             my $argument = $args{$key};
2120
2121             # Note that the fields are the lower case of the constructor keys
2122             my $hash = $constructor_fields{lc $key};
2123             if (! defined $hash) {
2124                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2125                 next;
2126             }
2127             if (ref $hash->{$addr} eq 'ARRAY') {
2128                 if (ref $argument eq 'ARRAY') {
2129                     foreach my $argument (@{$argument}) {
2130                         next if ! defined $argument;
2131                         push @{$hash->{$addr}}, $argument;
2132                     }
2133                 }
2134                 else {
2135                     push @{$hash->{$addr}}, $argument if defined $argument;
2136                 }
2137             }
2138             else {
2139                 $hash->{$addr} = $argument;
2140             }
2141             delete $args{$key};
2142         };
2143
2144         # If the file has a property for it, it means that the property is not
2145         # listed in the file's entries.  So add a handler to the list of line
2146         # handlers to insert the property name into the lines, to provide a
2147         # uniform interface to the final processing subroutine.
2148         # the final code doesn't have to worry about that.
2149         if ($property{$addr}) {
2150             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2151         }
2152
2153         if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2154             print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2155         }
2156
2157         # If skipping, set to optional, and add to list of ignored files,
2158         # including its reason
2159         if ($skip{$addr}) {
2160             $optional{$addr} = 1;
2161             $skipped_files{$file{$addr}} = $skip{$addr}
2162         }
2163
2164         return $self;
2165     }
2166
2167
2168     use overload
2169         fallback => 0,
2170         qw("") => "_operator_stringify",
2171         "." => \&main::_operator_dot,
2172     ;
2173
2174     sub _operator_stringify {
2175         my $self = shift;
2176
2177         return __PACKAGE__ . " object for " . $self->file;
2178     }
2179
2180     # flag to make sure extracted files are processed early
2181     my $seen_non_extracted_non_age = 0;
2182
2183     sub run {
2184         # Process the input object $self.  This opens and closes the file and
2185         # calls all the handlers for it.  Currently,  this can only be called
2186         # once per file, as it destroy's the EOF handler
2187
2188         my $self = shift;
2189         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2190
2191         my $addr = do { no overloading; pack 'J', $self; };
2192
2193         my $file = $file{$addr};
2194
2195         # Don't process if not expecting this file (because released later
2196         # than this Unicode version), and isn't there.  This means if someone
2197         # copies it into an earlier version's directory, we will go ahead and
2198         # process it.
2199         return if $first_released{$addr} gt $v_version && ! -e $file;
2200
2201         # If in debugging mode and this file doesn't have the non-skip
2202         # flag set, and isn't one of the critical files, skip it.
2203         if ($debug_skip
2204             && $first_released{$addr} ne v0
2205             && ! $non_skip{$addr})
2206         {
2207             print "Skipping $file in debugging\n" if $verbosity;
2208             return;
2209         }
2210
2211         # File could be optional
2212         if ($optional{$addr}) {
2213             return unless -e $file;
2214             my $result = eval $optional{$addr};
2215             if (! defined $result) {
2216                 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}.  $file Skipped.");
2217                 return;
2218             }
2219             if (! $result) {
2220                 if ($verbosity) {
2221                     print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2222                 }
2223                 return;
2224             }
2225         }
2226
2227         if (! defined $file || ! -e $file) {
2228
2229             # If the file doesn't exist, see if have internal data for it
2230             # (based on first_released being 0).
2231             if ($first_released{$addr} eq v0) {
2232                 $handle{$addr} = 'pretend_is_open';
2233             }
2234             else {
2235                 if (! $optional{$addr}  # File could be optional
2236                     && $v_version ge $first_released{$addr})
2237                 {
2238                     print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2239                 }
2240                 return;
2241             }
2242         }
2243         else {
2244
2245             # Here, the file exists.  Some platforms may change the case of
2246             # its name
2247             if ($seen_non_extracted_non_age) {
2248                 if ($file =~ /$EXTRACTED/i) {
2249                     Carp::my_carp_bug(join_lines(<<END
2250 $file should be processed just after the 'Prop...Alias' files, and before
2251 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2252 have subtle problems
2253 END
2254                     ));
2255                 }
2256             }
2257             elsif ($EXTRACTED_DIR
2258                     && $first_released{$addr} ne v0
2259                     && $file !~ /$EXTRACTED/i
2260                     && lc($file) ne 'dage.txt')
2261             {
2262                 # We don't set this (by the 'if' above) if we have no
2263                 # extracted directory, so if running on an early version,
2264                 # this test won't work.  Not worth worrying about.
2265                 $seen_non_extracted_non_age = 1;
2266             }
2267
2268             # And mark the file as having being processed, and warn if it
2269             # isn't a file we are expecting.  As we process the files,
2270             # they are deleted from the hash, so any that remain at the
2271             # end of the program are files that we didn't process.
2272             my $fkey = File::Spec->rel2abs($file);
2273             my $expecting = delete $potential_files{lc($fkey)};
2274
2275             Carp::my_carp("Was not expecting '$file'.") if
2276                     ! $expecting
2277                     && ! defined $handle{$addr};
2278
2279             # Having deleted from expected files, we can quit if not to do
2280             # anything.  Don't print progress unless really want verbosity
2281             if ($skip{$addr}) {
2282                 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2283                 return;
2284             }
2285
2286             # Open the file, converting the slashes used in this program
2287             # into the proper form for the OS
2288             my $file_handle;
2289             if (not open $file_handle, "<", $file) {
2290                 Carp::my_carp("Can't open $file.  Skipping: $!");
2291                 return 0;
2292             }
2293             $handle{$addr} = $file_handle; # Cache the open file handle
2294         }
2295
2296         if ($verbosity >= $PROGRESS) {
2297             if ($progress_message{$addr}) {
2298                 print "$progress_message{$addr}\n";
2299             }
2300             else {
2301                 # If using a virtual file, say so.
2302                 print "Processing ", (-e $file)
2303                                        ? $file
2304                                        : "substitute $file",
2305                                      "\n";
2306             }
2307         }
2308
2309
2310         # Call any special handler for before the file.
2311         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2312
2313         # Then the main handler
2314         &{$handler{$addr}}($self);
2315
2316         # Then any special post-file handler.
2317         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2318
2319         # If any errors have been accumulated, output the counts (as the first
2320         # error message in each class was output when it was encountered).
2321         if ($errors{$addr}) {
2322             my $total = 0;
2323             my $types = 0;
2324             foreach my $error (keys %{$errors{$addr}}) {
2325                 $total += $errors{$addr}->{$error};
2326                 delete $errors{$addr}->{$error};
2327                 $types++;
2328             }
2329             if ($total > 1) {
2330                 my $message
2331                         = "A total of $total lines had errors in $file.  ";
2332
2333                 $message .= ($types == 1)
2334                             ? '(Only the first one was displayed.)'
2335                             : '(Only the first of each type was displayed.)';
2336                 Carp::my_carp($message);
2337             }
2338         }
2339
2340         if (@{$missings{$addr}}) {
2341             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2342         }
2343
2344         # If a real file handle, close it.
2345         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2346                                                         ref $handle{$addr};
2347         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2348                                # the file, as opposed to undef
2349         return;
2350     }
2351
2352     sub next_line {
2353         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2354         # if such a line exists.  'logical' means that any lines that have
2355         # been added via insert_lines() will be returned in $_ before the file
2356         # is read again.
2357
2358         my $self = shift;
2359         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2360
2361         my $addr = do { no overloading; pack 'J', $self; };
2362
2363         # Here the file is open (or if the handle is not a ref, is an open
2364         # 'virtual' file).  Get the next line; any inserted lines get priority
2365         # over the file itself.
2366         my $adjusted;
2367
2368         LINE:
2369         while (1) { # Loop until find non-comment, non-empty line
2370             #local $to_trace = 1 if main::DEBUG;
2371             my $inserted_ref = shift @{$added_lines{$addr}};
2372             if (defined $inserted_ref) {
2373                 ($adjusted, $_) = @{$inserted_ref};
2374                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2375                 return 1 if $adjusted;
2376             }
2377             else {
2378                 last if ! ref $handle{$addr}; # Don't read unless is real file
2379                 last if ! defined ($_ = readline $handle{$addr});
2380             }
2381             chomp;
2382             trace $_ if main::DEBUG && $to_trace;
2383
2384             # See if this line is the comment line that defines what property
2385             # value that code points that are not listed in the file should
2386             # have.  The format or existence of these lines is not guaranteed
2387             # by Unicode since they are comments, but the documentation says
2388             # that this was added for machine-readability, so probably won't
2389             # change.  This works starting in Unicode Version 5.0.  They look
2390             # like:
2391             #
2392             # @missing: 0000..10FFFF; Not_Reordered
2393             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2394             # @missing: 0000..10FFFF; ; NaN
2395             #
2396             # Save the line for a later get_missings() call.
2397             if (/$missing_defaults_prefix/) {
2398                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2399                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2400                 }
2401                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2402                     my @defaults = split /\s* ; \s*/x, $_;
2403
2404                     # The first field is the @missing, which ends in a
2405                     # semi-colon, so can safely shift.
2406                     shift @defaults;
2407
2408                     # Some of these lines may have empty field placeholders
2409                     # which get in the way.  An example is:
2410                     # @missing: 0000..10FFFF; ; NaN
2411                     # Remove them.  Process starting from the top so the
2412                     # splice doesn't affect things still to be looked at.
2413                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2414                         next if $defaults[$i] ne "";
2415                         splice @defaults, $i, 1;
2416                     }
2417
2418                     # What's left should be just the property (maybe) and the
2419                     # default.  Having only one element means it doesn't have
2420                     # the property.
2421                     my $default;
2422                     my $property;
2423                     if (@defaults >= 1) {
2424                         if (@defaults == 1) {
2425                             $default = $defaults[0];
2426                         }
2427                         else {
2428                             $property = $defaults[0];
2429                             $default = $defaults[1];
2430                         }
2431                     }
2432
2433                     if (@defaults < 1
2434                         || @defaults > 2
2435                         || ($default =~ /^</
2436                             && $default !~ /^<code *point>$/i
2437                             && $default !~ /^<none>$/i
2438                             && $default !~ /^<script>$/i))
2439                     {
2440                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
2441                     }
2442                     else {
2443
2444                         # If the property is missing from the line, it should
2445                         # be the one for the whole file
2446                         $property = $property{$addr} if ! defined $property;
2447
2448                         # Change <none> to the null string, which is what it
2449                         # really means.  If the default is the code point
2450                         # itself, set it to <code point>, which is what
2451                         # Unicode uses (but sometimes they've forgotten the
2452                         # space)
2453                         if ($default =~ /^<none>$/i) {
2454                             $default = "";
2455                         }
2456                         elsif ($default =~ /^<code *point>$/i) {
2457                             $default = $CODE_POINT;
2458                         }
2459                         elsif ($default =~ /^<script>$/i) {
2460
2461                             # Special case this one.  Currently is from
2462                             # ScriptExtensions.txt, and means for all unlisted
2463                             # code points, use their Script property values.
2464                             # For the code points not listed in that file, the
2465                             # default value is 'Unknown'.
2466                             $default = "Unknown";
2467                         }
2468
2469                         # Store them as a sub-arrays with both components.
2470                         push @{$missings{$addr}}, [ $default, $property ];
2471                     }
2472                 }
2473
2474                 # There is nothing for the caller to process on this comment
2475                 # line.
2476                 next;
2477             }
2478
2479             # Remove comments and trailing space, and skip this line if the
2480             # result is empty
2481             s/#.*//;
2482             s/\s+$//;
2483             next if /^$/;
2484
2485             # Call any handlers for this line, and skip further processing of
2486             # the line if the handler sets the line to null.
2487             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2488                 &{$sub_ref}($self);
2489                 next LINE if /^$/;
2490             }
2491
2492             # Here the line is ok.  return success.
2493             return 1;
2494         } # End of looping through lines.
2495
2496         # If there is an EOF handler, call it (only once) and if it generates
2497         # more lines to process go back in the loop to handle them.
2498         if ($eof_handler{$addr}) {
2499             &{$eof_handler{$addr}}($self);
2500             $eof_handler{$addr} = "";   # Currently only get one shot at it.
2501             goto LINE if $added_lines{$addr};
2502         }
2503
2504         # Return failure -- no more lines.
2505         return 0;
2506
2507     }
2508
2509 #   Not currently used, not fully tested.
2510 #    sub peek {
2511 #        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2512 #        # record.  Not callable from an each_line_handler(), nor does it call
2513 #        # an each_line_handler() on the line.
2514 #
2515 #        my $self = shift;
2516 #        my $addr = do { no overloading; pack 'J', $self; };
2517 #
2518 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
2519 #            my ($adjusted, $line) = @{$inserted_ref};
2520 #            next if $adjusted;
2521 #
2522 #            # Remove comments and trailing space, and return a non-empty
2523 #            # resulting line
2524 #            $line =~ s/#.*//;
2525 #            $line =~ s/\s+$//;
2526 #            return $line if $line ne "";
2527 #        }
2528 #
2529 #        return if ! ref $handle{$addr}; # Don't read unless is real file
2530 #        while (1) { # Loop until find non-comment, non-empty line
2531 #            local $to_trace = 1 if main::DEBUG;
2532 #            trace $_ if main::DEBUG && $to_trace;
2533 #            return if ! defined (my $line = readline $handle{$addr});
2534 #            chomp $line;
2535 #            push @{$added_lines{$addr}}, [ 0, $line ];
2536 #
2537 #            $line =~ s/#.*//;
2538 #            $line =~ s/\s+$//;
2539 #            return $line if $line ne "";
2540 #        }
2541 #
2542 #        return;
2543 #    }
2544
2545
2546     sub insert_lines {
2547         # Lines can be inserted so that it looks like they were in the input
2548         # file at the place it was when this routine is called.  See also
2549         # insert_adjusted_lines().  Lines inserted via this routine go through
2550         # any each_line_handler()
2551
2552         my $self = shift;
2553
2554         # Each inserted line is an array, with the first element being 0 to
2555         # indicate that this line hasn't been adjusted, and needs to be
2556         # processed.
2557         no overloading;
2558         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2559         return;
2560     }
2561
2562     sub insert_adjusted_lines {
2563         # Lines can be inserted so that it looks like they were in the input
2564         # file at the place it was when this routine is called.  See also
2565         # insert_lines().  Lines inserted via this routine are already fully
2566         # adjusted, ready to be processed; each_line_handler()s handlers will
2567         # not be called.  This means this is not a completely general
2568         # facility, as only the last each_line_handler on the stack should
2569         # call this.  It could be made more general, by passing to each of the
2570         # line_handlers their position on the stack, which they would pass on
2571         # to this routine, and that would replace the boolean first element in
2572         # the anonymous array pushed here, so that the next_line routine could
2573         # use that to call only those handlers whose index is after it on the
2574         # stack.  But this is overkill for what is needed now.
2575
2576         my $self = shift;
2577         trace $_[0] if main::DEBUG && $to_trace;
2578
2579         # Each inserted line is an array, with the first element being 1 to
2580         # indicate that this line has been adjusted
2581         no overloading;
2582         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2583         return;
2584     }
2585
2586     sub get_missings {
2587         # Returns the stored up @missings lines' values, and clears the list.
2588         # The values are in an array, consisting of the default in the first
2589         # element, and the property in the 2nd.  However, since these lines
2590         # can be stacked up, the return is an array of all these arrays.
2591
2592         my $self = shift;
2593         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2594
2595         my $addr = do { no overloading; pack 'J', $self; };
2596
2597         # If not accepting a list return, just return the first one.
2598         return shift @{$missings{$addr}} unless wantarray;
2599
2600         my @return = @{$missings{$addr}};
2601         undef @{$missings{$addr}};
2602         return @return;
2603     }
2604
2605     sub _insert_property_into_line {
2606         # Add a property field to $_, if this file requires it.
2607
2608         my $self = shift;
2609         my $addr = do { no overloading; pack 'J', $self; };
2610         my $property = $property{$addr};
2611         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2612
2613         $_ =~ s/(;|$)/; $property$1/;
2614         return;
2615     }
2616
2617     sub carp_bad_line {
2618         # Output consistent error messages, using either a generic one, or the
2619         # one given by the optional parameter.  To avoid gazillions of the
2620         # same message in case the syntax of a  file is way off, this routine
2621         # only outputs the first instance of each message, incrementing a
2622         # count so the totals can be output at the end of the file.
2623
2624         my $self = shift;
2625         my $message = shift;
2626         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2627
2628         my $addr = do { no overloading; pack 'J', $self; };
2629
2630         $message = 'Unexpected line' unless $message;
2631
2632         # No trailing punctuation so as to fit with our addenda.
2633         $message =~ s/[.:;,]$//;
2634
2635         # If haven't seen this exact message before, output it now.  Otherwise
2636         # increment the count of how many times it has occurred
2637         unless ($errors{$addr}->{$message}) {
2638             Carp::my_carp("$message in '$_' in "
2639                             . $file{$addr}
2640                             . " at line $..  Skipping this line;");
2641             $errors{$addr}->{$message} = 1;
2642         }
2643         else {
2644             $errors{$addr}->{$message}++;
2645         }
2646
2647         # Clear the line to prevent any further (meaningful) processing of it.
2648         $_ = "";
2649
2650         return;
2651     }
2652 } # End closure
2653
2654 package Multi_Default;
2655
2656 # Certain properties in early versions of Unicode had more than one possible
2657 # default for code points missing from the files.  In these cases, one
2658 # default applies to everything left over after all the others are applied,
2659 # and for each of the others, there is a description of which class of code
2660 # points applies to it.  This object helps implement this by storing the
2661 # defaults, and for all but that final default, an eval string that generates
2662 # the class that it applies to.
2663
2664
2665 {   # Closure
2666
2667     main::setup_package();
2668
2669     my %class_defaults;
2670     # The defaults structure for the classes
2671     main::set_access('class_defaults', \%class_defaults);
2672
2673     my %other_default;
2674     # The default that applies to everything left over.
2675     main::set_access('other_default', \%other_default, 'r');
2676
2677
2678     sub new {
2679         # The constructor is called with default => eval pairs, terminated by
2680         # the left-over default. e.g.
2681         # Multi_Default->new(
2682         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2683         #               -  0x200D',
2684         #        'R' => 'some other expression that evaluates to code points',
2685         #        .
2686         #        .
2687         #        .
2688         #        'U'));
2689
2690         my $class = shift;
2691
2692         my $self = bless \do{my $anonymous_scalar}, $class;
2693         my $addr = do { no overloading; pack 'J', $self; };
2694
2695         while (@_ > 1) {
2696             my $default = shift;
2697             my $eval = shift;
2698             $class_defaults{$addr}->{$default} = $eval;
2699         }
2700
2701         $other_default{$addr} = shift;
2702
2703         return $self;
2704     }
2705
2706     sub get_next_defaults {
2707         # Iterates and returns the next class of defaults.
2708         my $self = shift;
2709         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2710
2711         my $addr = do { no overloading; pack 'J', $self; };
2712
2713         return each %{$class_defaults{$addr}};
2714     }
2715 }
2716
2717 package Alias;
2718
2719 # An alias is one of the names that a table goes by.  This class defines them
2720 # including some attributes.  Everything is currently setup in the
2721 # constructor.
2722
2723
2724 {   # Closure
2725
2726     main::setup_package();
2727
2728     my %name;
2729     main::set_access('name', \%name, 'r');
2730
2731     my %loose_match;
2732     # Should this name match loosely or not.
2733     main::set_access('loose_match', \%loose_match, 'r');
2734
2735     my %make_re_pod_entry;
2736     # Some aliases should not get their own entries in the re section of the
2737     # pod, because they are covered by a wild-card, and some we want to
2738     # discourage use of.  Binary
2739     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2740
2741     my %ucd;
2742     # Is this documented to be accessible via Unicode::UCD
2743     main::set_access('ucd', \%ucd, 'r', 's');
2744
2745     my %status;
2746     # Aliases have a status, like deprecated, or even suppressed (which means
2747     # they don't appear in documentation).  Enum
2748     main::set_access('status', \%status, 'r');
2749
2750     my %ok_as_filename;
2751     # Similarly, some aliases should not be considered as usable ones for
2752     # external use, such as file names, or we don't want documentation to
2753     # recommend them.  Boolean
2754     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2755
2756     sub new {
2757         my $class = shift;
2758
2759         my $self = bless \do { my $anonymous_scalar }, $class;
2760         my $addr = do { no overloading; pack 'J', $self; };
2761
2762         $name{$addr} = shift;
2763         $loose_match{$addr} = shift;
2764         $make_re_pod_entry{$addr} = shift;
2765         $ok_as_filename{$addr} = shift;
2766         $status{$addr} = shift;
2767         $ucd{$addr} = shift;
2768
2769         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2770
2771         # Null names are never ok externally
2772         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2773
2774         return $self;
2775     }
2776 }
2777
2778 package Range;
2779
2780 # A range is the basic unit for storing code points, and is described in the
2781 # comments at the beginning of the program.  Each range has a starting code
2782 # point; an ending code point (not less than the starting one); a value
2783 # that applies to every code point in between the two end-points, inclusive;
2784 # and an enum type that applies to the value.  The type is for the user's
2785 # convenience, and has no meaning here, except that a non-zero type is
2786 # considered to not obey the normal Unicode rules for having standard forms.
2787 #
2788 # The same structure is used for both map and match tables, even though in the
2789 # latter, the value (and hence type) is irrelevant and could be used as a
2790 # comment.  In map tables, the value is what all the code points in the range
2791 # map to.  Type 0 values have the standardized version of the value stored as
2792 # well, so as to not have to recalculate it a lot.
2793
2794 sub trace { return main::trace(@_); }
2795
2796 {   # Closure
2797
2798     main::setup_package();
2799
2800     my %start;
2801     main::set_access('start', \%start, 'r', 's');
2802
2803     my %end;
2804     main::set_access('end', \%end, 'r', 's');
2805
2806     my %value;
2807     main::set_access('value', \%value, 'r');
2808
2809     my %type;
2810     main::set_access('type', \%type, 'r');
2811
2812     my %standard_form;
2813     # The value in internal standard form.  Defined only if the type is 0.
2814     main::set_access('standard_form', \%standard_form);
2815
2816     # Note that if these fields change, the dump() method should as well
2817
2818     sub new {
2819         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2820         my $class = shift;
2821
2822         my $self = bless \do { my $anonymous_scalar }, $class;
2823         my $addr = do { no overloading; pack 'J', $self; };
2824
2825         $start{$addr} = shift;
2826         $end{$addr} = shift;
2827
2828         my %args = @_;
2829
2830         my $value = delete $args{'Value'};  # Can be 0
2831         $value = "" unless defined $value;
2832         $value{$addr} = $value;
2833
2834         $type{$addr} = delete $args{'Type'} || 0;
2835
2836         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2837
2838         if (! $type{$addr}) {
2839             $standard_form{$addr} = main::standardize($value);
2840         }
2841
2842         return $self;
2843     }
2844
2845     use overload
2846         fallback => 0,
2847         qw("") => "_operator_stringify",
2848         "." => \&main::_operator_dot,
2849     ;
2850
2851     sub _operator_stringify {
2852         my $self = shift;
2853         my $addr = do { no overloading; pack 'J', $self; };
2854
2855         # Output it like '0041..0065 (value)'
2856         my $return = sprintf("%04X", $start{$addr})
2857                         .  '..'
2858                         . sprintf("%04X", $end{$addr});
2859         my $value = $value{$addr};
2860         my $type = $type{$addr};
2861         $return .= ' (';
2862         $return .= "$value";
2863         $return .= ", Type=$type" if $type != 0;
2864         $return .= ')';
2865
2866         return $return;
2867     }
2868
2869     sub standard_form {
2870         # The standard form is the value itself if the standard form is
2871         # undefined (that is if the value is special)
2872
2873         my $self = shift;
2874         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2875
2876         my $addr = do { no overloading; pack 'J', $self; };
2877
2878         return $standard_form{$addr} if defined $standard_form{$addr};
2879         return $value{$addr};
2880     }
2881
2882     sub dump {
2883         # Human, not machine readable.  For machine readable, comment out this
2884         # entire routine and let the standard one take effect.
2885         my $self = shift;
2886         my $indent = shift;
2887         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2888
2889         my $addr = do { no overloading; pack 'J', $self; };
2890
2891         my $return = $indent
2892                     . sprintf("%04X", $start{$addr})
2893                     . '..'
2894                     . sprintf("%04X", $end{$addr})
2895                     . " '$value{$addr}';";
2896         if (! defined $standard_form{$addr}) {
2897             $return .= "(type=$type{$addr})";
2898         }
2899         elsif ($standard_form{$addr} ne $value{$addr}) {
2900             $return .= "(standard '$standard_form{$addr}')";
2901         }
2902         return $return;
2903     }
2904 } # End closure
2905
2906 package _Range_List_Base;
2907
2908 # Base class for range lists.  A range list is simply an ordered list of
2909 # ranges, so that the ranges with the lowest starting numbers are first in it.
2910 #
2911 # When a new range is added that is adjacent to an existing range that has the
2912 # same value and type, it merges with it to form a larger range.
2913 #
2914 # Ranges generally do not overlap, except that there can be multiple entries
2915 # of single code point ranges.  This is because of NameAliases.txt.
2916 #
2917 # In this program, there is a standard value such that if two different
2918 # values, have the same standard value, they are considered equivalent.  This
2919 # value was chosen so that it gives correct results on Unicode data
2920
2921 # There are a number of methods to manipulate range lists, and some operators
2922 # are overloaded to handle them.
2923
2924 sub trace { return main::trace(@_); }
2925
2926 { # Closure
2927
2928     our $addr;
2929
2930     main::setup_package();
2931
2932     my %ranges;
2933     # The list of ranges
2934     main::set_access('ranges', \%ranges, 'readable_array');
2935
2936     my %max;
2937     # The highest code point in the list.  This was originally a method, but
2938     # actual measurements said it was used a lot.
2939     main::set_access('max', \%max, 'r');
2940
2941     my %each_range_iterator;
2942     # Iterator position for each_range()
2943     main::set_access('each_range_iterator', \%each_range_iterator);
2944
2945     my %owner_name_of;
2946     # Name of parent this is attached to, if any.  Solely for better error
2947     # messages.
2948     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2949
2950     my %_search_ranges_cache;
2951     # A cache of the previous result from _search_ranges(), for better
2952     # performance
2953     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2954
2955     sub new {
2956         my $class = shift;
2957         my %args = @_;
2958
2959         # Optional initialization data for the range list.
2960         my $initialize = delete $args{'Initialize'};
2961
2962         my $self;
2963
2964         # Use _union() to initialize.  _union() returns an object of this
2965         # class, which means that it will call this constructor recursively.
2966         # But it won't have this $initialize parameter so that it won't
2967         # infinitely loop on this.
2968         return _union($class, $initialize, %args) if defined $initialize;
2969
2970         $self = bless \do { my $anonymous_scalar }, $class;
2971         my $addr = do { no overloading; pack 'J', $self; };
2972
2973         # Optional parent object, only for debug info.
2974         $owner_name_of{$addr} = delete $args{'Owner'};
2975         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2976
2977         # Stringify, in case it is an object.
2978         $owner_name_of{$addr} = "$owner_name_of{$addr}";
2979
2980         # This is used only for error messages, and so a colon is added
2981         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2982
2983         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2984
2985         # Max is initialized to a negative value that isn't adjacent to 0,
2986         # for simpler tests
2987         $max{$addr} = -2;
2988
2989         $_search_ranges_cache{$addr} = 0;
2990         $ranges{$addr} = [];
2991
2992         return $self;
2993     }
2994
2995     use overload
2996         fallback => 0,
2997         qw("") => "_operator_stringify",
2998         "." => \&main::_operator_dot,
2999     ;
3000
3001     sub _operator_stringify {
3002         my $self = shift;
3003         my $addr = do { no overloading; pack 'J', $self; };
3004
3005         return "Range_List attached to '$owner_name_of{$addr}'"
3006                                                 if $owner_name_of{$addr};
3007         return "anonymous Range_List " . \$self;
3008     }
3009
3010     sub _union {
3011         # Returns the union of the input code points.  It can be called as
3012         # either a constructor or a method.  If called as a method, the result
3013         # will be a new() instance of the calling object, containing the union
3014         # of that object with the other parameter's code points;  if called as
3015         # a constructor, the first parameter gives the class the new object
3016         # should be, and the second parameter gives the code points to go into
3017         # it.
3018         # In either case, there are two parameters looked at by this routine;
3019         # any additional parameters are passed to the new() constructor.
3020         #
3021         # The code points can come in the form of some object that contains
3022         # ranges, and has a conventionally named method to access them; or
3023         # they can be an array of individual code points (as integers); or
3024         # just a single code point.
3025         #
3026         # If they are ranges, this routine doesn't make any effort to preserve
3027         # the range values of one input over the other.  Therefore this base
3028         # class should not allow _union to be called from other than
3029         # initialization code, so as to prevent two tables from being added
3030         # together where the range values matter.  The general form of this
3031         # routine therefore belongs in a derived class, but it was moved here
3032         # to avoid duplication of code.  The failure to overload this in this
3033         # class keeps it safe.
3034         #
3035
3036         my $self;
3037         my @args;   # Arguments to pass to the constructor
3038
3039         my $class = shift;
3040
3041         # If a method call, will start the union with the object itself, and
3042         # the class of the new object will be the same as self.
3043         if (ref $class) {
3044             $self = $class;
3045             $class = ref $self;
3046             push @args, $self;
3047         }
3048
3049         # Add the other required parameter.
3050         push @args, shift;
3051         # Rest of parameters are passed on to the constructor
3052
3053         # Accumulate all records from both lists.
3054         my @records;
3055         for my $arg (@args) {
3056             #local $to_trace = 0 if main::DEBUG;
3057             trace "argument = $arg" if main::DEBUG && $to_trace;
3058             if (! defined $arg) {
3059                 my $message = "";
3060                 if (defined $self) {
3061                     no overloading;
3062                     $message .= $owner_name_of{pack 'J', $self};
3063                 }
3064                 Carp::my_carp_bug($message .= "Undefined argument to _union.  No union done.");
3065                 return;
3066             }
3067             $arg = [ $arg ] if ! ref $arg;
3068             my $type = ref $arg;
3069             if ($type eq 'ARRAY') {
3070                 foreach my $element (@$arg) {
3071                     push @records, Range->new($element, $element);
3072                 }
3073             }
3074             elsif ($arg->isa('Range')) {
3075                 push @records, $arg;
3076             }
3077             elsif ($arg->can('ranges')) {
3078                 push @records, $arg->ranges;
3079             }
3080             else {
3081                 my $message = "";
3082                 if (defined $self) {
3083                     no overloading;
3084                     $message .= $owner_name_of{pack 'J', $self};
3085                 }
3086                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3087                 return;
3088             }
3089         }
3090
3091         # Sort with the range containing the lowest ordinal first, but if
3092         # two ranges start at the same code point, sort with the bigger range
3093         # of the two first, because it takes fewer cycles.
3094         @records = sort { ($a->start <=> $b->start)
3095                                       or
3096                                     # if b is shorter than a, b->end will be
3097                                     # less than a->end, and we want to select
3098                                     # a, so want to return -1
3099                                     ($b->end <=> $a->end)
3100                                    } @records;
3101
3102         my $new = $class->new(@_);
3103
3104         # Fold in records so long as they add new information.
3105         for my $set (@records) {
3106             my $start = $set->start;
3107             my $end   = $set->end;
3108             my $value   = $set->value;
3109             if ($start > $new->max) {
3110                 $new->_add_delete('+', $start, $end, $value);
3111             }
3112             elsif ($end > $new->max) {
3113                 $new->_add_delete('+', $new->max +1, $end, $value);
3114             }
3115         }
3116
3117         return $new;
3118     }
3119
3120     sub range_count {        # Return the number of ranges in the range list
3121         my $self = shift;
3122         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3123
3124         no overloading;
3125         return scalar @{$ranges{pack 'J', $self}};
3126     }
3127
3128     sub min {
3129         # Returns the minimum code point currently in the range list, or if
3130         # the range list is empty, 2 beyond the max possible.  This is a
3131         # method because used so rarely, that not worth saving between calls,
3132         # and having to worry about changing it as ranges are added and
3133         # deleted.
3134
3135         my $self = shift;
3136         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3137
3138         my $addr = do { no overloading; pack 'J', $self; };
3139
3140         # If the range list is empty, return a large value that isn't adjacent
3141         # to any that could be in the range list, for simpler tests
3142         return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3143         return $ranges{$addr}->[0]->start;
3144     }
3145
3146     sub contains {
3147         # Boolean: Is argument in the range list?  If so returns $i such that:
3148         #   range[$i]->end < $codepoint <= range[$i+1]->end
3149         # which is one beyond what you want; this is so that the 0th range
3150         # doesn't return false
3151         my $self = shift;
3152         my $codepoint = shift;
3153         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3154
3155         my $i = $self->_search_ranges($codepoint);
3156         return 0 unless defined $i;
3157
3158         # The search returns $i, such that
3159         #   range[$i-1]->end < $codepoint <= range[$i]->end
3160         # So is in the table if and only iff it is at least the start position
3161         # of range $i.
3162         no overloading;
3163         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3164         return $i + 1;
3165     }
3166
3167     sub containing_range {
3168         # Returns the range object that contains the code point, undef if none
3169
3170         my $self = shift;
3171         my $codepoint = shift;
3172         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3173
3174         my $i = $self->contains($codepoint);
3175         return unless $i;
3176
3177         # contains() returns 1 beyond where we should look
3178         no overloading;
3179         return $ranges{pack 'J', $self}->[$i-1];
3180     }
3181
3182     sub value_of {
3183         # Returns the value associated with the code point, undef if none
3184
3185         my $self = shift;
3186         my $codepoint = shift;
3187         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3188
3189         my $range = $self->containing_range($codepoint);
3190         return unless defined $range;
3191
3192         return $range->value;
3193     }
3194
3195     sub type_of {
3196         # Returns the type of the range containing the code point, undef if
3197         # the code point is not in the table
3198
3199         my $self = shift;
3200         my $codepoint = shift;
3201         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3202
3203         my $range = $self->containing_range($codepoint);
3204         return unless defined $range;
3205
3206         return $range->type;
3207     }
3208
3209     sub _search_ranges {
3210         # Find the range in the list which contains a code point, or where it
3211         # should go if were to add it.  That is, it returns $i, such that:
3212         #   range[$i-1]->end < $codepoint <= range[$i]->end
3213         # Returns undef if no such $i is possible (e.g. at end of table), or
3214         # if there is an error.
3215
3216         my $self = shift;
3217         my $code_point = shift;
3218         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3219
3220         my $addr = do { no overloading; pack 'J', $self; };
3221
3222         return if $code_point > $max{$addr};
3223         my $r = $ranges{$addr};                # The current list of ranges
3224         my $range_list_size = scalar @$r;
3225         my $i;
3226
3227         use integer;        # want integer division
3228
3229         # Use the cached result as the starting guess for this one, because,
3230         # an experiment on 5.1 showed that 90% of the time the cache was the
3231         # same as the result on the next call (and 7% it was one less).
3232         $i = $_search_ranges_cache{$addr};
3233         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
3234                                             # from an intervening deletion
3235         #local $to_trace = 1 if main::DEBUG;
3236         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3237         return $i if $code_point <= $r->[$i]->end
3238                      && ($i == 0 || $r->[$i-1]->end < $code_point);
3239
3240         # Here the cache doesn't yield the correct $i.  Try adding 1.
3241         if ($i < $range_list_size - 1
3242             && $r->[$i]->end < $code_point &&
3243             $code_point <= $r->[$i+1]->end)
3244         {
3245             $i++;
3246             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3247             $_search_ranges_cache{$addr} = $i;
3248             return $i;
3249         }
3250
3251         # Here, adding 1 also didn't work.  We do a binary search to
3252         # find the correct position, starting with current $i
3253         my $lower = 0;
3254         my $upper = $range_list_size - 1;
3255         while (1) {
3256             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3257
3258             if ($code_point <= $r->[$i]->end) {
3259
3260                 # Here we have met the upper constraint.  We can quit if we
3261                 # also meet the lower one.
3262                 last if $i == 0 || $r->[$i-1]->end < $code_point;
3263
3264                 $upper = $i;        # Still too high.
3265
3266             }
3267             else {
3268
3269                 # Here, $r[$i]->end < $code_point, so look higher up.
3270                 $lower = $i;
3271             }
3272
3273             # Split search domain in half to try again.
3274             my $temp = ($upper + $lower) / 2;
3275
3276             # No point in continuing unless $i changes for next time
3277             # in the loop.
3278             if ($temp == $i) {
3279
3280                 # We can't reach the highest element because of the averaging.
3281                 # So if one below the upper edge, force it there and try one
3282                 # more time.
3283                 if ($i == $range_list_size - 2) {
3284
3285                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3286                     $i = $range_list_size - 1;
3287
3288                     # Change $lower as well so if fails next time through,
3289                     # taking the average will yield the same $i, and we will
3290                     # quit with the error message just below.
3291                     $lower = $i;
3292                     next;
3293                 }
3294                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
3295                 return;
3296             }
3297             $i = $temp;
3298         } # End of while loop
3299
3300         if (main::DEBUG && $to_trace) {
3301             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3302             trace "i=  [ $i ]", $r->[$i];
3303             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3304         }
3305
3306         # Here we have found the offset.  Cache it as a starting point for the
3307         # next call.
3308         $_search_ranges_cache{$addr} = $i;
3309         return $i;
3310     }
3311
3312     sub _add_delete {
3313         # Add, replace or delete ranges to or from a list.  The $type
3314         # parameter gives which:
3315         #   '+' => insert or replace a range, returning a list of any changed
3316         #          ranges.
3317         #   '-' => delete a range, returning a list of any deleted ranges.
3318         #
3319         # The next three parameters give respectively the start, end, and
3320         # value associated with the range.  'value' should be null unless the
3321         # operation is '+';
3322         #
3323         # The range list is kept sorted so that the range with the lowest
3324         # starting position is first in the list, and generally, adjacent
3325         # ranges with the same values are merged into a single larger one (see
3326         # exceptions below).
3327         #
3328         # There are more parameters; all are key => value pairs:
3329         #   Type    gives the type of the value.  It is only valid for '+'.
3330         #           All ranges have types; if this parameter is omitted, 0 is
3331         #           assumed.  Ranges with type 0 are assumed to obey the
3332         #           Unicode rules for casing, etc; ranges with other types are
3333         #           not.  Otherwise, the type is arbitrary, for the caller's
3334         #           convenience, and looked at only by this routine to keep
3335         #           adjacent ranges of different types from being merged into
3336         #           a single larger range, and when Replace =>
3337         #           $IF_NOT_EQUIVALENT is specified (see just below).
3338         #   Replace  determines what to do if the range list already contains
3339         #            ranges which coincide with all or portions of the input
3340         #            range.  It is only valid for '+':
3341         #       => $NO            means that the new value is not to replace
3342         #                         any existing ones, but any empty gaps of the
3343         #                         range list coinciding with the input range
3344         #                         will be filled in with the new value.
3345         #       => $UNCONDITIONALLY  means to replace the existing values with
3346         #                         this one unconditionally.  However, if the
3347         #                         new and old values are identical, the
3348         #                         replacement is skipped to save cycles
3349         #       => $IF_NOT_EQUIVALENT means to replace the existing values
3350         #                         with this one if they are not equivalent.
3351         #                         Ranges are equivalent if their types are the
3352         #                         same, and they are the same string; or if
3353         #                         both are type 0 ranges, if their Unicode
3354         #                         standard forms are identical.  In this last
3355         #                         case, the routine chooses the more "modern"
3356         #                         one to use.  This is because some of the
3357         #                         older files are formatted with values that
3358         #                         are, for example, ALL CAPs, whereas the
3359         #                         derived files have a more modern style,
3360         #                         which looks better.  By looking for this
3361         #                         style when the pre-existing and replacement
3362         #                         standard forms are the same, we can move to
3363         #                         the modern style
3364         #       => $MULTIPLE_BEFORE means that if this range duplicates an
3365         #                         existing one, but has a different value,
3366         #                         don't replace the existing one, but insert
3367         #                         this, one so that the same range can occur
3368         #                         multiple times.  They are stored LIFO, so
3369         #                         that the final one inserted is the first one
3370         #                         returned in an ordered search of the table.
3371         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3372         #                         FIFO, so that this one is inserted after all
3373         #                         others that currently exist.
3374         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
3375         #
3376         # "same value" means identical for non-type-0 ranges, and it means
3377         # having the same standard forms for type-0 ranges.
3378
3379         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3380
3381         my $self = shift;
3382         my $operation = shift;   # '+' for add/replace; '-' for delete;
3383         my $start = shift;
3384         my $end   = shift;
3385         my $value = shift;
3386
3387         my %args = @_;
3388
3389         $value = "" if not defined $value;        # warning: $value can be "0"
3390
3391         my $replace = delete $args{'Replace'};
3392         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3393
3394         my $type = delete $args{'Type'};
3395         $type = 0 unless defined $type;
3396
3397         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3398
3399         my $addr = do { no overloading; pack 'J', $self; };
3400
3401         if ($operation ne '+' && $operation ne '-') {
3402             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
3403             return;
3404         }
3405         unless (defined $start && defined $end) {
3406             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
3407             return;
3408         }
3409         unless ($end >= $start) {
3410             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
3411             return;
3412         }
3413         #local $to_trace = 1 if main::DEBUG;
3414
3415         if ($operation eq '-') {
3416             if ($replace != $IF_NOT_EQUIVALENT) {
3417                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
3418                 $replace = $IF_NOT_EQUIVALENT;
3419             }
3420             if ($type) {
3421                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
3422                 $type = 0;
3423             }
3424             if ($value ne "") {
3425                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
3426                 $value = "";
3427             }
3428         }
3429
3430         my $r = $ranges{$addr};               # The current list of ranges
3431         my $range_list_size = scalar @$r;     # And its size
3432         my $max = $max{$addr};                # The current high code point in
3433                                               # the list of ranges
3434
3435         # Do a special case requiring fewer machine cycles when the new range
3436         # starts after the current highest point.  The Unicode input data is
3437         # structured so this is common.
3438         if ($start > $max) {
3439
3440             trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3441             return if $operation eq '-'; # Deleting a non-existing range is a
3442                                          # no-op
3443
3444             # If the new range doesn't logically extend the current final one
3445             # in the range list, create a new range at the end of the range
3446             # list.  (max cleverly is initialized to a negative number not
3447             # adjacent to 0 if the range list is empty, so even adding a range
3448             # to an empty range list starting at 0 will have this 'if'
3449             # succeed.)
3450             if ($start > $max + 1        # non-adjacent means can't extend.
3451                 || @{$r}[-1]->value ne $value # values differ, can't extend.
3452                 || @{$r}[-1]->type != $type # types differ, can't extend.
3453             ) {
3454                 push @$r, Range->new($start, $end,
3455                                      Value => $value,
3456                                      Type => $type);
3457             }
3458             else {
3459
3460                 # Here, the new range starts just after the current highest in
3461                 # the range list, and they have the same type and value.
3462                 # Extend the current range to incorporate the new one.
3463                 @{$r}[-1]->set_end($end);
3464             }
3465
3466             # This becomes the new maximum.
3467             $max{$addr} = $end;
3468
3469             return;
3470         }
3471         #local $to_trace = 0 if main::DEBUG;
3472
3473         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3474
3475         # Here, the input range isn't after the whole rest of the range list.
3476         # Most likely 'splice' will be needed.  The rest of the routine finds
3477         # the needed splice parameters, and if necessary, does the splice.
3478         # First, find the offset parameter needed by the splice function for
3479         # the input range.  Note that the input range may span multiple
3480         # existing ones, but we'll worry about that later.  For now, just find
3481         # the beginning.  If the input range is to be inserted starting in a
3482         # position not currently in the range list, it must (obviously) come
3483         # just after the range below it, and just before the range above it.
3484         # Slightly less obviously, it will occupy the position currently
3485         # occupied by the range that is to come after it.  More formally, we
3486         # are looking for the position, $i, in the array of ranges, such that:
3487         #
3488         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3489         #
3490         # (The ordered relationships within existing ranges are also shown in
3491         # the equation above).  However, if the start of the input range is
3492         # within an existing range, the splice offset should point to that
3493         # existing range's position in the list; that is $i satisfies a
3494         # somewhat different equation, namely:
3495         #
3496         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3497         #
3498         # More briefly, $start can come before or after r[$i]->start, and at
3499         # this point, we don't know which it will be.  However, these
3500         # two equations share these constraints:
3501         #
3502         #   r[$i-1]->end < $start <= r[$i]->end
3503         #
3504         # And that is good enough to find $i.
3505
3506         my $i = $self->_search_ranges($start);
3507         if (! defined $i) {
3508             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
3509             return;
3510         }
3511
3512         # The search function returns $i such that:
3513         #
3514         # r[$i-1]->end < $start <= r[$i]->end
3515         #
3516         # That means that $i points to the first range in the range list
3517         # that could possibly be affected by this operation.  We still don't
3518         # know if the start of the input range is within r[$i], or if it
3519         # points to empty space between r[$i-1] and r[$i].
3520         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3521
3522         # Special case the insertion of data that is not to replace any
3523         # existing data.
3524         if ($replace == $NO) {  # If $NO, has to be operation '+'
3525             #local $to_trace = 1 if main::DEBUG;
3526             trace "Doesn't replace" if main::DEBUG && $to_trace;
3527
3528             # Here, the new range is to take effect only on those code points
3529             # that aren't already in an existing range.  This can be done by
3530             # looking through the existing range list and finding the gaps in
3531             # the ranges that this new range affects, and then calling this
3532             # function recursively on each of those gaps, leaving untouched
3533             # anything already in the list.  Gather up a list of the changed
3534             # gaps first so that changes to the internal state as new ranges
3535             # are added won't be a problem.
3536             my @gap_list;
3537
3538             # First, if the starting point of the input range is outside an
3539             # existing one, there is a gap from there to the beginning of the
3540             # existing range -- add a span to fill the part that this new
3541             # range occupies
3542             if ($start < $r->[$i]->start) {
3543                 push @gap_list, Range->new($start,
3544                                            main::min($end,
3545                                                      $r->[$i]->start - 1),
3546                                            Type => $type);
3547                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3548             }
3549
3550             # Then look through the range list for other gaps until we reach
3551             # the highest range affected by the input one.
3552             my $j;
3553             for ($j = $i+1; $j < $range_list_size; $j++) {
3554                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3555                 last if $end < $r->[$j]->start;
3556
3557                 # If there is a gap between when this range starts and the
3558                 # previous one ends, add a span to fill it.  Note that just
3559                 # because there are two ranges doesn't mean there is a
3560                 # non-zero gap between them.  It could be that they have
3561                 # different values or types
3562                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3563                     push @gap_list,
3564                         Range->new($r->[$j-1]->end + 1,
3565                                    $r->[$j]->start - 1,
3566                                    Type => $type);
3567                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3568                 }
3569             }
3570
3571             # Here, we have either found an existing range in the range list,
3572             # beyond the area affected by the input one, or we fell off the
3573             # end of the loop because the input range affects the whole rest
3574             # of the range list.  In either case, $j is 1 higher than the
3575             # highest affected range.  If $j == $i, it means that there are no
3576             # affected ranges, that the entire insertion is in the gap between
3577             # r[$i-1], and r[$i], which we already have taken care of before
3578             # the loop.
3579             # On the other hand, if there are affected ranges, it might be
3580             # that there is a gap that needs filling after the final such
3581             # range to the end of the input range
3582             if ($r->[$j-1]->end < $end) {
3583                     push @gap_list, Range->new(main::max($start,
3584                                                          $r->[$j-1]->end + 1),
3585                                                $end,
3586                                                Type => $type);
3587                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3588             }
3589
3590             # Call recursively to fill in all the gaps.
3591             foreach my $gap (@gap_list) {
3592                 $self->_add_delete($operation,
3593                                    $gap->start,
3594                                    $gap->end,
3595                                    $value,
3596                                    Type => $type);
3597             }
3598
3599             return;
3600         }
3601
3602         # Here, we have taken care of the case where $replace is $NO.
3603         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3604         # If inserting a multiple record, this is where it goes, before the
3605         # first (if any) existing one if inserting LIFO.  (If this is to go
3606         # afterwards, FIFO, we below move the pointer to there.)  These imply
3607         # an insertion, and no change to any existing ranges.  Note that $i
3608         # can be -1 if this new range doesn't actually duplicate any existing,
3609         # and comes at the beginning of the list.
3610         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3611
3612             if ($start != $end) {
3613                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
3614                 return;
3615             }
3616
3617             # If the new code point is within a current range ...
3618             if ($end >= $r->[$i]->start) {
3619
3620                 # Don't add an exact duplicate, as it isn't really a multiple
3621                 my $existing_value = $r->[$i]->value;
3622                 my $existing_type = $r->[$i]->type;
3623                 return if $value eq $existing_value && $type eq $existing_type;
3624
3625                 # If the multiple value is part of an existing range, we want
3626                 # to split up that range, so that only the single code point
3627                 # is affected.  To do this, we first call ourselves
3628                 # recursively to delete that code point from the table, having
3629                 # preserved its current data above.  Then we call ourselves
3630                 # recursively again to add the new multiple, which we know by
3631                 # the test just above is different than the current code
3632                 # point's value, so it will become a range containing a single
3633                 # code point: just itself.  Finally, we add back in the
3634                 # pre-existing code point, which will again be a single code
3635                 # point range.  Because 'i' likely will have changed as a
3636                 # result of these operations, we can't just continue on, but
3637                 # do this operation recursively as well.  If we are inserting
3638                 # LIFO, the pre-existing code point needs to go after the new
3639                 # one, so use MULTIPLE_AFTER; and vice versa.
3640                 if ($r->[$i]->start != $r->[$i]->end) {
3641                     $self->_add_delete('-', $start, $end, "");
3642                     $self->_add_delete('+', $start, $end, $value, Type => $type);
3643                     return $self->_add_delete('+',
3644                             $start, $end,
3645                             $existing_value,
3646                             Type => $existing_type,
3647                             Replace => ($replace == $MULTIPLE_BEFORE)
3648                                        ? $MULTIPLE_AFTER
3649                                        : $MULTIPLE_BEFORE);
3650                 }
3651             }
3652
3653             # If to place this new record after, move to beyond all existing
3654             # ones.
3655             if ($replace == $MULTIPLE_AFTER) {
3656                 while ($i < @$r && $r->[$i]->start == $start) {
3657                     $i++;
3658                 }
3659             }
3660
3661             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3662             my @return = splice @$r,
3663                                 $i,
3664                                 0,
3665                                 Range->new($start,
3666                                            $end,
3667                                            Value => $value,
3668                                            Type => $type);
3669             if (main::DEBUG && $to_trace) {
3670                 trace "After splice:";
3671                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3672                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3673                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
3674                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3675                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3676                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3677             }
3678             return @return;
3679         }
3680
3681         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
3682         # leaves delete, insert, and replace either unconditionally or if not
3683         # equivalent.  $i still points to the first potential affected range.
3684         # Now find the highest range affected, which will determine the length
3685         # parameter to splice.  (The input range can span multiple existing
3686         # ones.)  If this isn't a deletion, while we are looking through the
3687         # range list, see also if this is a replacement rather than a clean
3688         # insertion; that is if it will change the values of at least one
3689         # existing range.  Start off assuming it is an insert, until find it
3690         # isn't.
3691         my $clean_insert = $operation eq '+';
3692         my $j;        # This will point to the highest affected range
3693
3694         # For non-zero types, the standard form is the value itself;
3695         my $standard_form = ($type) ? $value : main::standardize($value);
3696
3697         for ($j = $i; $j < $range_list_size; $j++) {
3698             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3699
3700             # If find a range that it doesn't overlap into, we can stop
3701             # searching
3702             last if $end < $r->[$j]->start;
3703
3704             # Here, overlaps the range at $j.  If the values don't match,
3705             # and so far we think this is a clean insertion, it becomes a
3706             # non-clean insertion, i.e., a 'change' or 'replace' instead.
3707             if ($clean_insert) {
3708                 if ($r->[$j]->standard_form ne $standard_form) {
3709                     $clean_insert = 0;
3710                     if ($replace == $CROAK) {
3711                         main::croak("The range to add "
3712                         . sprintf("%04X", $start)
3713                         . '-'
3714                         . sprintf("%04X", $end)
3715                         . " with value '$value' overlaps an existing range $r->[$j]");
3716                     }
3717                 }
3718                 else {
3719
3720                     # Here, the two values are essentially the same.  If the
3721                     # two are actually identical, replacing wouldn't change
3722                     # anything so skip it.
3723                     my $pre_existing = $r->[$j]->value;
3724                     if ($pre_existing ne $value) {
3725
3726                         # Here the new and old standardized values are the
3727                         # same, but the non-standardized values aren't.  If
3728                         # replacing unconditionally, then replace
3729                         if( $replace == $UNCONDITIONALLY) {
3730                             $clean_insert = 0;
3731                         }
3732                         else {
3733
3734                             # Here, are replacing conditionally.  Decide to
3735                             # replace or not based on which appears to look
3736                             # the "nicest".  If one is mixed case and the
3737                             # other isn't, choose the mixed case one.
3738                             my $new_mixed = $value =~ /[A-Z]/
3739                                             && $value =~ /[a-z]/;
3740                             my $old_mixed = $pre_existing =~ /[A-Z]/
3741                                             && $pre_existing =~ /[a-z]/;
3742
3743                             if ($old_mixed != $new_mixed) {
3744                                 $clean_insert = 0 if $new_mixed;
3745                                 if (main::DEBUG && $to_trace) {
3746                                     if ($clean_insert) {
3747                                         trace "Retaining $pre_existing over $value";
3748                                     }
3749                                     else {
3750                                         trace "Replacing $pre_existing with $value";
3751                                     }
3752                                 }
3753                             }
3754                             else {
3755
3756                                 # Here casing wasn't different between the two.
3757                                 # If one has hyphens or underscores and the
3758                                 # other doesn't, choose the one with the
3759                                 # punctuation.
3760                                 my $new_punct = $value =~ /[-_]/;
3761                                 my $old_punct = $pre_existing =~ /[-_]/;
3762
3763                                 if ($old_punct != $new_punct) {
3764                                     $clean_insert = 0 if $new_punct;
3765                                     if (main::DEBUG && $to_trace) {
3766                                         if ($clean_insert) {
3767                                             trace "Retaining $pre_existing over $value";
3768                                         }
3769                                         else {
3770                                             trace "Replacing $pre_existing with $value";
3771                                         }
3772                                     }
3773                                 }   # else existing one is just as "good";
3774                                     # retain it to save cycles.
3775                             }
3776                         }
3777                     }
3778                 }
3779             }
3780         } # End of loop looking for highest affected range.
3781
3782         # Here, $j points to one beyond the highest range that this insertion
3783         # affects (hence to beyond the range list if that range is the final
3784         # one in the range list).
3785
3786         # The splice length is all the affected ranges.  Get it before
3787         # subtracting, for efficiency, so we don't have to later add 1.
3788         my $length = $j - $i;
3789
3790         $j--;        # $j now points to the highest affected range.
3791         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3792
3793         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3794         # $j points to the highest affected range.  But it can be < $i or even
3795         # -1.  These happen only if the insertion is entirely in the gap
3796         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
3797         # above exited first time through with $end < $r->[$i]->start.  (And
3798         # then we subtracted one from j)  This implies also that $start <
3799         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3800         # $start, so the entire input range is in the gap.
3801         if ($j < $i) {
3802
3803             # Here the entire input range is in the gap before $i.
3804
3805             if (main::DEBUG && $to_trace) {
3806                 if ($i) {
3807                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
3808                 }
3809                 else {
3810                     trace "Entire range is before $r->[$i]";
3811                 }
3812             }
3813             return if $operation ne '+'; # Deletion of a non-existent range is
3814                                          # a no-op
3815         }
3816         else {
3817
3818             # Here part of the input range is not in the gap before $i.  Thus,
3819             # there is at least one affected one, and $j points to the highest
3820             # such one.
3821
3822             # At this point, here is the situation:
3823             # This is not an insertion of a multiple, nor of tentative ($NO)
3824             # data.
3825             #   $i  points to the first element in the current range list that
3826             #            may be affected by this operation.  In fact, we know
3827             #            that the range at $i is affected because we are in
3828             #            the else branch of this 'if'
3829             #   $j  points to the highest affected range.
3830             # In other words,
3831             #   r[$i-1]->end < $start <= r[$i]->end
3832             # And:
3833             #   r[$i-1]->end < $start <= $end <= r[$j]->end
3834             #
3835             # Also:
3836             #   $clean_insert is a boolean which is set true if and only if
3837             #        this is a "clean insertion", i.e., not a change nor a
3838             #        deletion (multiple was handled above).
3839
3840             # We now have enough information to decide if this call is a no-op
3841             # or not.  It is a no-op if this is an insertion of already
3842             # existing data.
3843
3844             if (main::DEBUG && $to_trace && $clean_insert
3845                                          && $i == $j
3846                                          && $start >= $r->[$i]->start)
3847             {
3848                     trace "no-op";
3849             }
3850             return if $clean_insert
3851                       && $i == $j # more than one affected range => not no-op
3852
3853                       # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3854                       # Further, $start and/or $end is >= r[$i]->start
3855                       # The test below hence guarantees that
3856                       #     r[$i]->start < $start <= $end <= r[$i]->end
3857                       # This means the input range is contained entirely in
3858                       # the one at $i, so is a no-op
3859                       && $start >= $r->[$i]->start;
3860         }
3861
3862         # Here, we know that some action will have to be taken.  We have
3863         # calculated the offset and length (though adjustments may be needed)
3864         # for the splice.  Now start constructing the replacement list.
3865         my @replacement;
3866         my $splice_start = $i;
3867
3868         my $extends_below;
3869         my $extends_above;
3870
3871         # See if should extend any adjacent ranges.
3872         if ($operation eq '-') { # Don't extend deletions
3873             $extends_below = $extends_above = 0;
3874         }
3875         else {  # Here, should extend any adjacent ranges.  See if there are
3876                 # any.
3877             $extends_below = ($i > 0
3878                             # can't extend unless adjacent
3879                             && $r->[$i-1]->end == $start -1
3880                             # can't extend unless are same standard value
3881                             && $r->[$i-1]->standard_form eq $standard_form
3882                             # can't extend unless share type
3883                             && $r->[$i-1]->type == $type);
3884             $extends_above = ($j+1 < $range_list_size
3885                             && $r->[$j+1]->start == $end +1
3886                             && $r->[$j+1]->standard_form eq $standard_form
3887                             && $r->[$j+1]->type == $type);
3888         }
3889         if ($extends_below && $extends_above) { # Adds to both
3890             $splice_start--;     # start replace at element below
3891             $length += 2;        # will replace on both sides
3892             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3893
3894             # The result will fill in any gap, replacing both sides, and
3895             # create one large range.
3896             @replacement = Range->new($r->[$i-1]->start,
3897                                       $r->[$j+1]->end,
3898                                       Value => $value,
3899                                       Type => $type);
3900         }
3901         else {
3902
3903             # Here we know that the result won't just be the conglomeration of
3904             # a new range with both its adjacent neighbors.  But it could
3905             # extend one of them.
3906
3907             if ($extends_below) {
3908
3909                 # Here the new element adds to the one below, but not to the
3910                 # one above.  If inserting, and only to that one range,  can
3911                 # just change its ending to include the new one.
3912                 if ($length == 0 && $clean_insert) {
3913                     $r->[$i-1]->set_end($end);
3914                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3915                     return;
3916                 }
3917                 else {
3918                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3919                     $splice_start--;        # start replace at element below
3920                     $length++;              # will replace the element below
3921                     $start = $r->[$i-1]->start;
3922                 }
3923             }
3924             elsif ($extends_above) {
3925
3926                 # Here the new element adds to the one above, but not below.
3927                 # Mirror the code above
3928                 if ($length == 0 && $clean_insert) {
3929                     $r->[$j+1]->set_start($start);
3930                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3931                     return;
3932                 }
3933                 else {
3934                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3935                     $length++;        # will replace the element above
3936                     $end = $r->[$j+1]->end;
3937                 }
3938             }
3939
3940             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3941
3942             # Finally, here we know there will have to be a splice.
3943             # If the change or delete affects only the highest portion of the
3944             # first affected range, the range will have to be split.  The
3945             # splice will remove the whole range, but will replace it by a new
3946             # range containing just the unaffected part.  So, in this case,
3947             # add to the replacement list just this unaffected portion.
3948             if (! $extends_below
3949                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3950             {
3951                 push @replacement,
3952                     Range->new($r->[$i]->start,
3953                                $start - 1,
3954                                Value => $r->[$i]->value,
3955                                Type => $r->[$i]->type);
3956             }
3957
3958             # In the case of an insert or change, but not a delete, we have to
3959             # put in the new stuff;  this comes next.
3960             if ($operation eq '+') {
3961                 push @replacement, Range->new($start,
3962                                               $end,
3963                                               Value => $value,
3964                                               Type => $type);
3965             }
3966
3967             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3968             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3969
3970             # And finally, if we're changing or deleting only a portion of the
3971             # highest affected range, it must be split, as the lowest one was.
3972             if (! $extends_above
3973                 && $j >= 0  # Remember that j can be -1 if before first
3974                             # current element
3975                 && $end >= $r->[$j]->start
3976                 && $end < $r->[$j]->end)
3977             {
3978                 push @replacement,
3979                     Range->new($end + 1,
3980                                $r->[$j]->end,
3981                                Value => $r->[$j]->value,
3982                                Type => $r->[$j]->type);
3983             }
3984         }
3985
3986         # And do the splice, as calculated above
3987         if (main::DEBUG && $to_trace) {
3988             trace "replacing $length element(s) at $i with ";
3989             foreach my $replacement (@replacement) {
3990                 trace "    $replacement";
3991             }
3992             trace "Before splice:";
3993             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3994             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3995             trace "i  =[", $i, "]", $r->[$i];
3996             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3997             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3998         }
3999
4000         my @return = splice @$r, $splice_start, $length, @replacement;
4001
4002         if (main::DEBUG && $to_trace) {
4003             trace "After splice:";
4004             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4005             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4006             trace "i  =[", $i, "]", $r->[$i];
4007             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4008             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4009             trace "removed ", @return if @return;
4010         }
4011
4012         # An actual deletion could have changed the maximum in the list.
4013         # There was no deletion if the splice didn't return something, but
4014         # otherwise recalculate it.  This is done too rarely to worry about
4015         # performance.
4016         if ($operation eq '-' && @return) {
4017             $max{$addr} = $r->[-1]->end;
4018         }
4019         return @return;
4020     }
4021
4022     sub reset_each_range {  # reset the iterator for each_range();
4023         my $self = shift;
4024         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4025
4026         no overloading;
4027         undef $each_range_iterator{pack 'J', $self};
4028         return;
4029     }
4030
4031     sub each_range {
4032         # Iterate over each range in a range list.  Results are undefined if
4033         # the range list is changed during the iteration.
4034
4035         my $self = shift;
4036         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4037
4038         my $addr = do { no overloading; pack 'J', $self; };
4039
4040         return if $self->is_empty;
4041
4042         $each_range_iterator{$addr} = -1
4043                                 if ! defined $each_range_iterator{$addr};
4044         $each_range_iterator{$addr}++;
4045         return $ranges{$addr}->[$each_range_iterator{$addr}]
4046                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4047         undef $each_range_iterator{$addr};
4048         return;
4049     }
4050
4051     sub count {        # Returns count of code points in range list
4052         my $self = shift;
4053         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4054
4055         my $addr = do { no overloading; pack 'J', $self; };
4056
4057         my $count = 0;
4058         foreach my $range (@{$ranges{$addr}}) {
4059             $count += $range->end - $range->start + 1;
4060         }
4061         return $count;
4062     }
4063
4064     sub delete_range {    # Delete a range
4065         my $self = shift;
4066         my $start = shift;
4067         my $end = shift;
4068
4069         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4070
4071         return $self->_add_delete('-', $start, $end, "");
4072     }
4073
4074     sub is_empty { # Returns boolean as to if a range list is empty
4075         my $self = shift;
4076         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4077
4078         no overloading;
4079         return scalar @{$ranges{pack 'J', $self}} == 0;
4080     }
4081
4082     sub hash {
4083         # Quickly returns a scalar suitable for separating tables into
4084         # buckets, i.e. it is a hash function of the contents of a table, so
4085         # there are relatively few conflicts.
4086
4087         my $self = shift;
4088         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4089
4090         my $addr = do { no overloading; pack 'J', $self; };
4091
4092         # These are quickly computable.  Return looks like 'min..max;count'
4093         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4094     }
4095 } # End closure for _Range_List_Base
4096
4097 package Range_List;
4098 use base '_Range_List_Base';
4099
4100 # A Range_List is a range list for match tables; i.e. the range values are
4101 # not significant.  Thus a number of operations can be safely added to it,
4102 # such as inversion, intersection.  Note that union is also an unsafe
4103 # operation when range values are cared about, and that method is in the base
4104 # class, not here.  But things are set up so that that method is callable only
4105 # during initialization.  Only in this derived class, is there an operation
4106 # that combines two tables.  A Range_Map can thus be used to initialize a
4107 # Range_List, and its mappings will be in the list, but are not significant to
4108 # this class.
4109
4110 sub trace { return main::trace(@_); }
4111
4112 { # Closure
4113
4114     use overload
4115         fallback => 0,
4116         '+' => sub { my $self = shift;
4117                     my $other = shift;
4118
4119                     return $self->_union($other)
4120                 },
4121         '&' => sub { my $self = shift;
4122                     my $other = shift;
4123
4124                     return $self->_intersect($other, 0);
4125                 },
4126         '~' => "_invert",
4127         '-' => "_subtract",
4128     ;
4129
4130     sub _invert {
4131         # Returns a new Range_List that gives all code points not in $self.
4132
4133         my $self = shift;
4134
4135         my $new = Range_List->new;
4136
4137         # Go through each range in the table, finding the gaps between them
4138         my $max = -1;   # Set so no gap before range beginning at 0
4139         for my $range ($self->ranges) {
4140             my $start = $range->start;
4141             my $end   = $range->end;
4142
4143             # If there is a gap before this range, the inverse will contain
4144             # that gap.
4145             if ($start > $max + 1) {
4146                 $new->add_range($max + 1, $start - 1);
4147             }
4148             $max = $end;
4149         }
4150
4151         # And finally, add the gap from the end of the table to the max
4152         # possible code point
4153         if ($max < $MAX_UNICODE_CODEPOINT) {
4154             $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4155         }
4156         return $new;
4157     }
4158
4159     sub _subtract {
4160         # Returns a new Range_List with the argument deleted from it.  The
4161         # argument can be a single code point, a range, or something that has
4162         # a range, with the _range_list() method on it returning them
4163
4164         my $self = shift;
4165         my $other = shift;
4166         my $reversed = shift;
4167         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4168
4169         if ($reversed) {
4170             Carp::my_carp_bug("Can't cope with a "
4171              .  __PACKAGE__
4172              . " being the second parameter in a '-'.  Subtraction ignored.");
4173             return $self;
4174         }
4175
4176         my $new = Range_List->new(Initialize => $self);
4177
4178         if (! ref $other) { # Single code point
4179             $new->delete_range($other, $other);
4180         }
4181         elsif ($other->isa('Range')) {
4182             $new->delete_range($other->start, $other->end);
4183         }
4184         elsif ($other->can('_range_list')) {
4185             foreach my $range ($other->_range_list->ranges) {
4186                 $new->delete_range($range->start, $range->end);
4187             }
4188         }
4189         else {
4190             Carp::my_carp_bug("Can't cope with a "
4191                         . ref($other)
4192                         . " argument to '-'.  Subtraction ignored."
4193                         );
4194             return $self;
4195         }
4196
4197         return $new;
4198     }
4199
4200     sub _intersect {
4201         # Returns either a boolean giving whether the two inputs' range lists
4202         # intersect (overlap), or a new Range_List containing the intersection
4203         # of the two lists.  The optional final parameter being true indicates
4204         # to do the check instead of the intersection.
4205
4206         my $a_object = shift;
4207         my $b_object = shift;
4208         my $check_if_overlapping = shift;
4209         $check_if_overlapping = 0 unless defined $check_if_overlapping;
4210         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4211
4212         if (! defined $b_object) {
4213             my $message = "";
4214             $message .= $a_object->_owner_name_of if defined $a_object;
4215             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
4216             return;
4217         }
4218
4219         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4220         # Thus the intersection could be much more simply be written:
4221         #   return ~(~$a_object + ~$b_object);
4222         # But, this is slower, and when taking the inverse of a large
4223         # range_size_1 table, back when such tables were always stored that
4224         # way, it became prohibitively slow, hence the code was changed to the
4225         # below
4226
4227         if ($b_object->isa('Range')) {
4228             $b_object = Range_List->new(Initialize => $b_object,
4229                                         Owner => $a_object->_owner_name_of);
4230         }
4231         $b_object = $b_object->_range_list if $b_object->can('_range_list');
4232
4233         my @a_ranges = $a_object->ranges;
4234         my @b_ranges = $b_object->ranges;
4235
4236         #local $to_trace = 1 if main::DEBUG;
4237         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4238
4239         # Start with the first range in each list
4240         my $a_i = 0;
4241         my $range_a = $a_ranges[$a_i];
4242         my $b_i = 0;
4243         my $range_b = $b_ranges[$b_i];
4244
4245         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4246                                                 if ! $check_if_overlapping;
4247
4248         # If either list is empty, there is no intersection and no overlap
4249         if (! defined $range_a || ! defined $range_b) {
4250             return $check_if_overlapping ? 0 : $new;
4251         }
4252         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4253
4254         # Otherwise, must calculate the intersection/overlap.  Start with the
4255         # very first code point in each list
4256         my $a = $range_a->start;
4257         my $b = $range_b->start;
4258
4259         # Loop through all the ranges of each list; in each iteration, $a and
4260         # $b are the current code points in their respective lists
4261         while (1) {
4262
4263             # If $a and $b are the same code point, ...
4264             if ($a == $b) {
4265
4266                 # it means the lists overlap.  If just checking for overlap
4267                 # know the answer now,
4268                 return 1 if $check_if_overlapping;
4269
4270                 # The intersection includes this code point plus anything else
4271                 # common to both current ranges.
4272                 my $start = $a;
4273                 my $end = main::min($range_a->end, $range_b->end);
4274                 if (! $check_if_overlapping) {
4275                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4276                     $new->add_range($start, $end);
4277                 }
4278
4279                 # Skip ahead to the end of the current intersect
4280                 $a = $b = $end;
4281
4282                 # If the current intersect ends at the end of either range (as
4283                 # it must for at least one of them), the next possible one
4284                 # will be the beginning code point in it's list's next range.
4285                 if ($a == $range_a->end) {
4286                     $range_a = $a_ranges[++$a_i];
4287                     last unless defined $range_a;
4288                     $a = $range_a->start;
4289                 }
4290                 if ($b == $range_b->end) {
4291                     $range_b = $b_ranges[++$b_i];
4292                     last unless defined $range_b;
4293                     $b = $range_b->start;
4294                 }
4295
4296                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4297             }
4298             elsif ($a < $b) {
4299
4300                 # Not equal, but if the range containing $a encompasses $b,
4301                 # change $a to be the middle of the range where it does equal
4302                 # $b, so the next iteration will get the intersection
4303                 if ($range_a->end >= $b) {
4304                     $a = $b;
4305                 }
4306                 else {
4307
4308                     # Here, the current range containing $a is entirely below
4309                     # $b.  Go try to find a range that could contain $b.
4310                     $a_i = $a_object->_search_ranges($b);
4311
4312                     # If no range found, quit.
4313                     last unless defined $a_i;
4314
4315                     # The search returns $a_i, such that
4316                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4317                     # Set $a to the beginning of this new range, and repeat.
4318                     $range_a = $a_ranges[$a_i];
4319                     $a = $range_a->start;
4320                 }
4321             }
4322             else { # Here, $b < $a.
4323
4324                 # Mirror image code to the leg just above
4325                 if ($range_b->end >= $a) {
4326                     $b = $a;
4327                 }
4328                 else {
4329                     $b_i = $b_object->_search_ranges($a);
4330                     last unless defined $b_i;
4331                     $range_b = $b_ranges[$b_i];
4332                     $b = $range_b->start;
4333                 }
4334             }
4335         } # End of looping through ranges.
4336
4337         # Intersection fully computed, or now know that there is no overlap
4338         return $check_if_overlapping ? 0 : $new;
4339     }
4340
4341     sub overlaps {
4342         # Returns boolean giving whether the two arguments overlap somewhere
4343
4344         my $self = shift;
4345         my $other = shift;
4346         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4347
4348         return $self->_intersect($other, 1);
4349     }
4350
4351     sub add_range {
4352         # Add a range to the list.
4353
4354         my $self = shift;
4355         my $start = shift;
4356         my $end = shift;
4357         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4358
4359         return $self->_add_delete('+', $start, $end, "");
4360     }
4361
4362     sub matches_identically_to {
4363         # Return a boolean as to whether or not two Range_Lists match identical
4364         # sets of code points.
4365
4366         my $self = shift;
4367         my $other = shift;
4368         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4369
4370         # These are ordered in increasing real time to figure out (at least
4371         # until a patch changes that and doesn't change this)
4372         return 0 if $self->max != $other->max;
4373         return 0 if $self->min != $other->min;
4374         return 0 if $self->range_count != $other->range_count;
4375         return 0 if $self->count != $other->count;
4376
4377         # Here they could be identical because all the tests above passed.
4378         # The loop below is somewhat simpler since we know they have the same
4379         # number of elements.  Compare range by range, until reach the end or
4380         # find something that differs.
4381         my @a_ranges = $self->ranges;
4382         my @b_ranges = $other->ranges;
4383         for my $i (0 .. @a_ranges - 1) {
4384             my $a = $a_ranges[$i];
4385             my $b = $b_ranges[$i];
4386             trace "self $a; other $b" if main::DEBUG && $to_trace;
4387             return 0 if ! defined $b
4388                         || $a->start != $b->start
4389                         || $a->end != $b->end;
4390         }
4391         return 1;
4392     }
4393
4394     sub is_code_point_usable {
4395         # This used only for making the test script.  See if the input
4396         # proposed trial code point is one that Perl will handle.  If second
4397         # parameter is 0, it won't select some code points for various
4398         # reasons, noted below.
4399
4400         my $code = shift;
4401         my $try_hard = shift;
4402         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4403
4404         return 0 if $code < 0;                # Never use a negative
4405
4406         # shun null.  I'm (khw) not sure why this was done, but NULL would be
4407         # the character very frequently used.
4408         return $try_hard if $code == 0x0000;
4409
4410         # shun non-character code points.
4411         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4412         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4413
4414         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
4415         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4416
4417         return 1;
4418     }
4419
4420     sub get_valid_code_point {
4421         # Return a code point that's part of the range list.  Returns nothing
4422         # if the table is empty or we can't find a suitable code point.  This
4423         # used only for making the test script.
4424
4425         my $self = shift;
4426         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4427
4428         my $addr = do { no overloading; pack 'J', $self; };
4429
4430         # On first pass, don't choose less desirable code points; if no good
4431         # one is found, repeat, allowing a less desirable one to be selected.
4432         for my $try_hard (0, 1) {
4433
4434             # Look through all the ranges for a usable code point.
4435             for my $set ($self->ranges) {
4436
4437                 # Try the edge cases first, starting with the end point of the
4438                 # range.
4439                 my $end = $set->end;
4440                 return $end if is_code_point_usable($end, $try_hard);
4441
4442                 # End point didn't, work.  Start at the beginning and try
4443                 # every one until find one that does work.
4444                 for my $trial ($set->start .. $end - 1) {
4445                     return $trial if is_code_point_usable($trial, $try_hard);
4446                 }
4447             }
4448         }
4449         return ();  # If none found, give up.
4450     }
4451
4452     sub get_invalid_code_point {
4453         # Return a code point that's not part of the table.  Returns nothing
4454         # if the table covers all code points or a suitable code point can't
4455         # be found.  This used only for making the test script.
4456
4457         my $self = shift;
4458         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4459
4460         # Just find a valid code point of the inverse, if any.
4461         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4462     }
4463 } # end closure for Range_List
4464
4465 package Range_Map;
4466 use base '_Range_List_Base';
4467
4468 # A Range_Map is a range list in which the range values (called maps) are
4469 # significant, and hence shouldn't be manipulated by our other code, which
4470 # could be ambiguous or lose things.  For example, in taking the union of two
4471 # lists, which share code points, but which have differing values, which one
4472 # has precedence in the union?
4473 # It turns out that these operations aren't really necessary for map tables,
4474 # and so this class was created to make sure they aren't accidentally
4475 # applied to them.
4476
4477 { # Closure
4478
4479     sub add_map {
4480         # Add a range containing a mapping value to the list
4481
4482         my $self = shift;
4483         # Rest of parameters passed on
4484
4485         return $self->_add_delete('+', @_);
4486     }
4487
4488     sub add_duplicate {
4489         # Adds entry to a range list which can duplicate an existing entry
4490
4491         my $self = shift;
4492         my $code_point = shift;
4493         my $value = shift;
4494         my %args = @_;
4495         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4496         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4497
4498         return $self->add_map($code_point, $code_point,
4499                                 $value, Replace => $replace);
4500     }
4501 } # End of closure for package Range_Map
4502
4503 package _Base_Table;
4504
4505 # A table is the basic data structure that gets written out into a file for
4506 # use by the Perl core.  This is the abstract base class implementing the
4507 # common elements from the derived ones.  A list of the methods to be
4508 # furnished by an implementing class is just after the constructor.
4509
4510 sub standardize { return main::standardize($_[0]); }
4511 sub trace { return main::trace(@_); }
4512
4513 { # Closure
4514
4515     main::setup_package();
4516
4517     my %range_list;
4518     # Object containing the ranges of the table.
4519     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4520
4521     my %full_name;
4522     # The full table name.
4523     main::set_access('full_name', \%full_name, 'r');
4524
4525     my %name;
4526     # The table name, almost always shorter
4527     main::set_access('name', \%name, 'r');
4528
4529     my %short_name;
4530     # The shortest of all the aliases for this table, with underscores removed
4531     main::set_access('short_name', \%short_name);
4532
4533     my %nominal_short_name_length;
4534     # The length of short_name before removing underscores
4535     main::set_access('nominal_short_name_length',
4536                     \%nominal_short_name_length);
4537
4538     my %complete_name;
4539     # The complete name, including property.
4540     main::set_access('complete_name', \%complete_name, 'r');
4541
4542     my %property;
4543     # Parent property this table is attached to.
4544     main::set_access('property', \%property, 'r');
4545
4546     my %aliases;
4547     # Ordered list of alias objects of the table's name.  The first ones in
4548     # the list are output first in comments
4549     main::set_access('aliases', \%aliases, 'readable_array');
4550
4551     my %comment;
4552     # A comment associated with the table for human readers of the files
4553     main::set_access('comment', \%comment, 's');
4554
4555     my %description;
4556     # A comment giving a short description of the table's meaning for human
4557     # readers of the files.
4558     main::set_access('description', \%description, 'readable_array');
4559
4560     my %note;
4561     # A comment giving a short note about the table for human readers of the
4562     # files.
4563     main::set_access('note', \%note, 'readable_array');
4564
4565     my %fate;
4566     # Enum; there are a number of possibilities for what happens to this
4567     # table: it could be normal, or suppressed, or not for external use.  See
4568     # values at definition for $SUPPRESSED.
4569     main::set_access('fate', \%fate, 'r');
4570
4571     my %find_table_from_alias;
4572     # The parent property passes this pointer to a hash which this class adds
4573     # all its aliases to, so that the parent can quickly take an alias and
4574     # find this table.
4575     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4576
4577     my %locked;
4578     # After this table is made equivalent to another one; we shouldn't go
4579     # changing the contents because that could mean it's no longer equivalent
4580     main::set_access('locked', \%locked, 'r');
4581
4582     my %file_path;
4583     # This gives the final path to the file containing the table.  Each
4584     # directory in the path is an element in the array
4585     main::set_access('file_path', \%file_path, 'readable_array');
4586
4587     my %status;
4588     # What is the table's status, normal, $OBSOLETE, etc.  Enum
4589     main::set_access('status', \%status, 'r');
4590
4591     my %status_info;
4592     # A comment about its being obsolete, or whatever non normal status it has
4593     main::set_access('status_info', \%status_info, 'r');
4594
4595     my %caseless_equivalent;
4596     # The table this is equivalent to under /i matching, if any.
4597     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4598
4599     my %range_size_1;
4600     # Is the table to be output with each range only a single code point?
4601     # This is done to avoid breaking existing code that may have come to rely
4602     # on this behavior in previous versions of this program.)
4603     main::set_access('range_size_1', \%range_size_1, 'r', 's');
4604
4605     my %perl_extension;
4606     # A boolean set iff this table is a Perl extension to the Unicode
4607     # standard.
4608     main::set_access('perl_extension', \%perl_extension, 'r');
4609
4610     my %output_range_counts;
4611     # A boolean set iff this table is to have comments written in the
4612     # output file that contain the number of code points in the range.
4613     # The constructor can override the global flag of the same name.
4614     main::set_access('output_range_counts', \%output_range_counts, 'r');
4615
4616     my %format;
4617     # The format of the entries of the table.  This is calculated from the
4618     # data in the table (or passed in the constructor).  This is an enum e.g.,
4619     # $STRING_FORMAT.  It is marked protected as it should not be generally
4620     # used to override calculations.
4621     main::set_access('format', \%format, 'r', 'p_s');
4622
4623     sub new {
4624         # All arguments are key => value pairs, which you can see below, most
4625         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
4626         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4627         # documented in the Alias package
4628
4629         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4630
4631         my $class = shift;
4632
4633         my $self = bless \do { my $anonymous_scalar }, $class;
4634         my $addr = do { no overloading; pack 'J', $self; };
4635
4636         my %args = @_;
4637
4638         $name{$addr} = delete $args{'Name'};
4639         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4640         $full_name{$addr} = delete $args{'Full_Name'};
4641         my $complete_name = $complete_name{$addr}
4642                           = delete $args{'Complete_Name'};
4643         $format{$addr} = delete $args{'Format'};
4644         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4645         $property{$addr} = delete $args{'_Property'};
4646         $range_list{$addr} = delete $args{'_Range_List'};
4647         $status{$addr} = delete $args{'Status'} || $NORMAL;
4648         $status_info{$addr} = delete $args{'_Status_Info'} || "";
4649         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4650         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4651         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4652         my $ucd = delete $args{'UCD'};
4653
4654         my $description = delete $args{'Description'};
4655         my $ok_as_filename = delete $args{'OK_as_Filename'};
4656         my $loose_match = delete $args{'Fuzzy'};
4657         my $note = delete $args{'Note'};
4658         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4659         my $perl_extension = delete $args{'Perl_Extension'};
4660
4661         # Shouldn't have any left over
4662         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4663
4664         # Can't use || above because conceivably the name could be 0, and
4665         # can't use // operator in case this program gets used in Perl 5.8
4666         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4667         $output_range_counts{$addr} = $output_range_counts if
4668                                         ! defined $output_range_counts{$addr};
4669
4670         $aliases{$addr} = [ ];
4671         $comment{$addr} = [ ];
4672         $description{$addr} = [ ];
4673         $note{$addr} = [ ];
4674         $file_path{$addr} = [ ];
4675         $locked{$addr} = "";
4676
4677         push @{$description{$addr}}, $description if $description;
4678         push @{$note{$addr}}, $note if $note;
4679
4680         if ($fate{$addr} == $PLACEHOLDER) {
4681
4682             # A placeholder table doesn't get documented, is a perl extension,
4683             # and quite likely will be empty
4684             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4685             $perl_extension = 1 if ! defined $perl_extension;
4686             $ucd = 0 if ! defined $ucd;
4687             push @tables_that_may_be_empty, $complete_name{$addr};
4688             $self->add_comment(<<END);
4689 This is a placeholder because it is not in Version $string_version of Unicode,
4690 but is needed by the Perl core to work gracefully.  Because it is not in this
4691 version of Unicode, it will not be listed in $pod_file.pod
4692 END
4693         }
4694         elsif (exists $why_suppressed{$complete_name}
4695                 # Don't suppress if overridden
4696                 && ! grep { $_ eq $complete_name{$addr} }
4697                                                     @output_mapped_properties)
4698         {
4699             $fate{$addr} = $SUPPRESSED;
4700         }
4701         elsif ($fate{$addr} == $SUPPRESSED
4702                && ! exists $why_suppressed{$property{$addr}->complete_name})
4703         {
4704             Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4705             # perhaps Fate => [ $SUPPRESSED, "reason" ]
4706         }
4707
4708         # If hasn't set its status already, see if it is on one of the
4709         # lists of properties or tables that have particular statuses; if
4710         # not, is normal.  The lists are prioritized so the most serious
4711         # ones are checked first
4712         if (! $status{$addr}) {
4713             if (exists $why_deprecated{$complete_name}) {
4714                 $status{$addr} = $DEPRECATED;
4715             }
4716             elsif (exists $why_stabilized{$complete_name}) {
4717                 $status{$addr} = $STABILIZED;
4718             }
4719             elsif (exists $why_obsolete{$complete_name}) {
4720                 $status{$addr} = $OBSOLETE;
4721             }
4722
4723             # Existence above doesn't necessarily mean there is a message
4724             # associated with it.  Use the most serious message.
4725             if ($status{$addr}) {
4726                 if ($why_deprecated{$complete_name}) {
4727                     $status_info{$addr}
4728                                 = $why_deprecated{$complete_name};
4729                 }
4730                 elsif ($why_stabilized{$complete_name}) {
4731                     $status_info{$addr}
4732                                 = $why_stabilized{$complete_name};
4733                 }
4734                 elsif ($why_obsolete{$complete_name}) {
4735                     $status_info{$addr}
4736                                 = $why_obsolete{$complete_name};
4737                 }
4738             }
4739         }
4740
4741         $perl_extension{$addr} = $perl_extension || 0;
4742
4743         # Don't list a property by default that is internal only
4744         if ($fate{$addr} > $MAP_PROXIED) {
4745             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4746             $ucd = 0 if ! defined $ucd;
4747         }
4748         else {
4749             $ucd = 1 if ! defined $ucd;
4750         }
4751
4752         # By convention what typically gets printed only or first is what's
4753         # first in the list, so put the full name there for good output
4754         # clarity.  Other routines rely on the full name being first on the
4755         # list
4756         $self->add_alias($full_name{$addr},
4757                             OK_as_Filename => $ok_as_filename,
4758                             Fuzzy => $loose_match,
4759                             Re_Pod_Entry => $make_re_pod_entry,
4760                             Status => $status{$addr},
4761                             UCD => $ucd,
4762                             );
4763
4764         # Then comes the other name, if meaningfully different.
4765         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4766             $self->add_alias($name{$addr},
4767                             OK_as_Filename => $ok_as_filename,
4768                             Fuzzy => $loose_match,
4769                             Re_Pod_Entry => $make_re_pod_entry,
4770                             Status => $status{$addr},
4771                             UCD => $ucd,
4772                             );
4773         }
4774
4775         return $self;
4776     }
4777
4778     # Here are the methods that are required to be defined by any derived
4779     # class
4780     for my $sub (qw(
4781                     handle_special_range
4782                     append_to_body
4783                     pre_body
4784                 ))
4785                 # write() knows how to write out normal ranges, but it calls
4786                 # handle_special_range() when it encounters a non-normal one.
4787                 # append_to_body() is called by it after it has handled all
4788                 # ranges to add anything after the main portion of the table.
4789                 # And finally, pre_body() is called after all this to build up
4790                 # anything that should appear before the main portion of the
4791                 # table.  Doing it this way allows things in the middle to
4792                 # affect what should appear before the main portion of the
4793                 # table.
4794     {
4795         no strict "refs";
4796         *$sub = sub {
4797             Carp::my_carp_bug( __LINE__
4798                               . ": Must create method '$sub()' for "
4799                               . ref shift);
4800             return;
4801         }
4802     }
4803
4804     use overload
4805         fallback => 0,
4806         "." => \&main::_operator_dot,
4807         '!=' => \&main::_operator_not_equal,
4808         '==' => \&main::_operator_equal,
4809     ;
4810
4811     sub ranges {
4812         # Returns the array of ranges associated with this table.
4813
4814         no overloading;
4815         return $range_list{pack 'J', shift}->ranges;
4816     }
4817
4818     sub add_alias {
4819         # Add a synonym for this table.
4820
4821         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4822
4823         my $self = shift;
4824         my $name = shift;       # The name to add.
4825         my $pointer = shift;    # What the alias hash should point to.  For
4826                                 # map tables, this is the parent property;
4827                                 # for match tables, it is the table itself.
4828
4829         my %args = @_;
4830         my $loose_match = delete $args{'Fuzzy'};
4831
4832         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4833         $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
4834
4835         my $ok_as_filename = delete $args{'OK_as_Filename'};
4836         $ok_as_filename = 1 unless defined $ok_as_filename;
4837
4838         my $status = delete $args{'Status'};
4839         $status = $NORMAL unless defined $status;
4840
4841         my $ucd = delete $args{'UCD'} // 1;
4842
4843         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4844
4845         # Capitalize the first letter of the alias unless it is one of the CJK
4846         # ones which specifically begins with a lower 'k'.  Do this because
4847         # Unicode has varied whether they capitalize first letters or not, and
4848         # have later changed their minds and capitalized them, but not the
4849         # other way around.  So do it always and avoid changes from release to
4850         # release
4851         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4852
4853         my $addr = do { no overloading; pack 'J', $self; };
4854
4855         # Figure out if should be loosely matched if not already specified.
4856         if (! defined $loose_match) {
4857
4858             # Is a loose_match if isn't null, and doesn't begin with an
4859             # underscore and isn't just a number
4860             if ($name ne ""
4861                 && substr($name, 0, 1) ne '_'
4862                 && $name !~ qr{^[0-9_.+-/]+$})
4863             {
4864                 $loose_match = 1;
4865             }
4866             else {
4867                 $loose_match = 0;
4868             }
4869         }
4870
4871         # If this alias has already been defined, do nothing.
4872         return if defined $find_table_from_alias{$addr}->{$name};
4873
4874         # That includes if it is standardly equivalent to an existing alias,
4875         # in which case, add this name to the list, so won't have to search
4876         # for it again.
4877         my $standard_name = main::standardize($name);
4878         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4879             $find_table_from_alias{$addr}->{$name}
4880                         = $find_table_from_alias{$addr}->{$standard_name};
4881             return;
4882         }
4883
4884         # Set the index hash for this alias for future quick reference.
4885         $find_table_from_alias{$addr}->{$name} = $pointer;
4886         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4887         local $to_trace = 0 if main::DEBUG;
4888         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4889         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4890
4891
4892         # Put the new alias at the end of the list of aliases unless the final
4893         # element begins with an underscore (meaning it is for internal perl
4894         # use) or is all numeric, in which case, put the new one before that
4895         # one.  This floats any all-numeric or underscore-beginning aliases to
4896         # the end.  This is done so that they are listed last in output lists,
4897         # to encourage the user to use a better name (either more descriptive
4898         # or not an internal-only one) instead.  This ordering is relied on
4899         # implicitly elsewhere in this program, like in short_name()
4900         my $list = $aliases{$addr};
4901         my $insert_position = (@$list == 0
4902                                 || (substr($list->[-1]->name, 0, 1) ne '_'
4903                                     && $list->[-1]->name =~ /\D/))
4904                             ? @$list
4905                             : @$list - 1;
4906         splice @$list,
4907                 $insert_position,
4908                 0,
4909                 Alias->new($name, $loose_match, $make_re_pod_entry,
4910                                                 $ok_as_filename, $status, $ucd);
4911
4912         # This name may be shorter than any existing ones, so clear the cache
4913         # of the shortest, so will have to be recalculated.
4914         no overloading;
4915         undef $short_name{pack 'J', $self};
4916         return;
4917     }
4918
4919     sub short_name {
4920         # Returns a name suitable for use as the base part of a file name.
4921         # That is, shorter wins.  It can return undef if there is no suitable
4922         # name.  The name has all non-essential underscores removed.
4923
4924         # The optional second parameter is a reference to a scalar in which
4925         # this routine will store the length the returned name had before the
4926         # underscores were removed, or undef if the return is undef.
4927
4928         # The shortest name can change if new aliases are added.  So using
4929         # this should be deferred until after all these are added.  The code
4930         # that does that should clear this one's cache.
4931         # Any name with alphabetics is preferred over an all numeric one, even
4932         # if longer.
4933
4934         my $self = shift;
4935         my $nominal_length_ptr = shift;
4936         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4937
4938         my $addr = do { no overloading; pack 'J', $self; };
4939
4940         # For efficiency, don't recalculate, but this means that adding new
4941         # aliases could change what the shortest is, so the code that does
4942         # that needs to undef this.
4943         if (defined $short_name{$addr}) {
4944             if ($nominal_length_ptr) {
4945                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4946             }
4947             return $short_name{$addr};
4948         }
4949
4950         # Look at each alias
4951         foreach my $alias ($self->aliases()) {
4952
4953             # Don't use an alias that isn't ok to use for an external name.
4954             next if ! $alias->ok_as_filename;
4955
4956             my $name = main::Standardize($alias->name);
4957             trace $self, $name if main::DEBUG && $to_trace;
4958
4959             # Take the first one, or a shorter one that isn't numeric.  This
4960             # relies on numeric aliases always being last in the array
4961             # returned by aliases().  Any alpha one will have precedence.
4962             if (! defined $short_name{$addr}
4963                 || ($name =~ /\D/
4964                     && length($name) < length($short_name{$addr})))
4965             {
4966                 # Remove interior underscores.
4967                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4968
4969                 $nominal_short_name_length{$addr} = length $name;
4970             }
4971         }
4972
4973         # If the short name isn't a nice one, perhaps an equivalent table has
4974         # a better one.
4975         if (! defined $short_name{$addr}
4976             || $short_name{$addr} eq ""
4977             || $short_name{$addr} eq "_")
4978         {
4979             my $return;
4980             foreach my $follower ($self->children) {    # All equivalents
4981                 my $follower_name = $follower->short_name;
4982                 next unless defined $follower_name;
4983
4984                 # Anything (except undefined) is better than underscore or
4985                 # empty
4986                 if (! defined $return || $return eq "_") {
4987                     $return = $follower_name;
4988                     next;
4989                 }
4990
4991                 # If the new follower name isn't "_" and is shorter than the
4992                 # current best one, prefer the new one.
4993                 next if $follower_name eq "_";
4994                 next if length $follower_name > length $return;
4995                 $return = $follower_name;
4996             }
4997             $short_name{$addr} = $return if defined $return;
4998         }
4999
5000         # If no suitable external name return undef
5001         if (! defined $short_name{$addr}) {
5002             $$nominal_length_ptr = undef if $nominal_length_ptr;
5003             return;
5004         }
5005
5006         # Don't allow a null short name.
5007         if ($short_name{$addr} eq "") {
5008             $short_name{$addr} = '_';
5009             $nominal_short_name_length{$addr} = 1;
5010         }
5011
5012         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5013
5014         if ($nominal_length_ptr) {
5015             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5016         }
5017         return $short_name{$addr};
5018     }
5019
5020     sub external_name {
5021         # Returns the external name that this table should be known by.  This
5022         # is usually the short_name, but not if the short_name is undefined,
5023         # in which case the external_name is arbitrarily set to the
5024         # underscore.
5025
5026         my $self = shift;
5027         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5028
5029         my $short = $self->short_name;
5030         return $short if defined $short;
5031
5032         return '_';
5033     }
5034
5035     sub add_description { # Adds the parameter as a short description.
5036
5037         my $self = shift;
5038         my $description = shift;
5039         chomp $description;
5040         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5041
5042         no overloading;
5043         push @{$description{pack 'J', $self}}, $description;
5044
5045         return;
5046     }
5047
5048     sub add_note { # Adds the parameter as a short note.
5049
5050         my $self = shift;
5051         my $note = shift;
5052         chomp $note;
5053         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5054
5055         no overloading;
5056         push @{$note{pack 'J', $self}}, $note;
5057
5058         return;
5059     }
5060
5061     sub add_comment { # Adds the parameter as a comment.
5062
5063         return unless $debugging_build;
5064
5065         my $self = shift;
5066         my $comment = shift;
5067         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5068
5069         chomp $comment;
5070
5071         no overloading;
5072         push @{$comment{pack 'J', $self}}, $comment;
5073
5074         return;
5075     }
5076
5077     sub comment {
5078         # Return the current comment for this table.  If called in list
5079         # context, returns the array of comments.  In scalar, returns a string
5080         # of each element joined together with a period ending each.
5081
5082         my $self = shift;
5083         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5084
5085         my $addr = do { no overloading; pack 'J', $self; };
5086         my @list = @{$comment{$addr}};
5087         return @list if wantarray;
5088         my $return = "";
5089         foreach my $sentence (@list) {
5090             $return .= '.  ' if $return;
5091             $return .= $sentence;
5092             $return =~ s/\.$//;
5093         }
5094         $return .= '.' if $return;
5095         return $return;
5096     }
5097
5098     sub initialize {
5099         # Initialize the table with the argument which is any valid
5100         # initialization for range lists.
5101
5102         my $self = shift;
5103         my $addr = do { no overloading; pack 'J', $self; };
5104         my $initialization = shift;
5105         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5106
5107         # Replace the current range list with a new one of the same exact
5108         # type.
5109         my $class = ref $range_list{$addr};
5110         $range_list{$addr} = $class->new(Owner => $self,
5111                                         Initialize => $initialization);
5112         return;
5113
5114     }
5115
5116     sub header {
5117         # The header that is output for the table in the file it is written
5118         # in.
5119
5120         my $self = shift;
5121         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5122
5123         my $return = "";
5124         $return .= $DEVELOPMENT_ONLY if $compare_versions;
5125         $return .= $HEADER;
5126         return $return;
5127     }
5128
5129     sub write {
5130         # Write a representation of the table to its file.  It calls several
5131         # functions furnished by sub-classes of this abstract base class to
5132         # handle non-normal ranges, to add stuff before the table, and at its
5133         # end.
5134
5135         my $self = shift;
5136         my $tab_stops = shift;       # The number of tab stops over to put any
5137                                      # comment.
5138         my $suppress_value = shift;  # Optional, if the value associated with
5139                                      # a range equals this one, don't write
5140                                      # the range
5141         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5142
5143         my $addr = do { no overloading; pack 'J', $self; };
5144
5145         # Start with the header
5146         my @HEADER = $self->header;
5147
5148         # Then the comments
5149         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5150                                                         if $comment{$addr};
5151
5152         # Things discovered processing the main body of the document may
5153         # affect what gets output before it, therefore pre_body() isn't called
5154         # until after all other processing of the table is done.
5155
5156         # The main body looks like a 'here' document.  If annotating, get rid
5157         # of the comments before passing to the caller, as some callers, such
5158         # as charnames.pm, can't cope with them.  (Outputting range counts
5159         # also introduces comments, but these don't show up in the tables that
5160         # can't cope with comments, and there aren't that many of them that
5161         # it's worth the extra real time to get rid of them).
5162         my @OUT;
5163         if ($annotate) {
5164             # Use the line below in Perls that don't have /r
5165             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5166             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5167         } else {
5168             push @OUT, "return <<'END';\n";
5169         }
5170
5171         if ($range_list{$addr}->is_empty) {
5172
5173             # This is a kludge for empty tables to silence a warning in
5174             # utf8.c, which can't really deal with empty tables, but it can
5175             # deal with a table that matches nothing, as the inverse of 'Any'
5176             # does.
5177             push @OUT, "!utf8::Any\n";
5178         }
5179         elsif ($self->name eq 'N'
5180
5181                # To save disk space and table cache space, avoid putting out
5182                # binary N tables, but instead create a file which just inverts
5183                # the Y table.  Since the file will still exist and occupy a
5184                # certain number of blocks, might as well output the whole
5185                # thing if it all will fit in one block.   The number of
5186                # ranges below is an approximate number for that.
5187                && ($self->property->type == $BINARY
5188                    || $self->property->type == $FORCED_BINARY)
5189                # && $self->property->tables == 2  Can't do this because the
5190                #        non-binary properties, like NFDQC aren't specifiable
5191                #        by the notation
5192                && $range_list{$addr}->ranges > 15
5193                && ! $annotate)  # Under --annotate, want to see everything
5194         {
5195             push @OUT, "!utf8::" . $self->property->name . "\n";
5196         }
5197         else {
5198             my $range_size_1 = $range_size_1{$addr};
5199             my $format;            # Used only in $annotate option
5200             my $include_name;      # Used only in $annotate option
5201
5202             if ($annotate) {
5203
5204                 # if annotating each code point, must print 1 per line.
5205                 # The variable could point to a subroutine, and we don't want
5206                 # to lose that fact, so only set if not set already
5207                 $range_size_1 = 1 if ! $range_size_1;
5208
5209                 $format = $self->format;
5210
5211                 # The name of the character is output only for tables that
5212                 # don't already include the name in the output.
5213                 my $property = $self->property;
5214                 $include_name =
5215                     !  ($property == $perl_charname
5216                         || $property == main::property_ref('Unicode_1_Name')
5217                         || $property == main::property_ref('Name')
5218                         || $property == main::property_ref('Name_Alias')
5219                        );
5220             }
5221
5222             # Output each range as part of the here document.
5223             RANGE:
5224             for my $set ($range_list{$addr}->ranges) {
5225                 if ($set->type != 0) {
5226                     $self->handle_special_range($set);
5227                     next RANGE;
5228                 }
5229                 my $start = $set->start;
5230                 my $end   = $set->end;
5231                 my $value  = $set->value;
5232
5233                 # Don't output ranges whose value is the one to suppress
5234                 next RANGE if defined $suppress_value
5235                               && $value eq $suppress_value;
5236
5237                 # If there is a range and doesn't need a single point range
5238                 # output
5239                 if ($start != $end && ! $range_size_1) {
5240                     push @OUT, sprintf "%04X\t%04X", $start, $end;
5241                     $OUT[-1] .= "\t$value" if $value ne "";
5242
5243                     # Add a comment with the size of the range, if requested.
5244                     # Expand Tabs to make sure they all start in the same
5245                     # column, and then unexpand to use mostly tabs.
5246                     if (! $output_range_counts{$addr}) {
5247                         $OUT[-1] .= "\n";
5248                     }
5249                     else {
5250                         $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5251                         my $count = main::clarify_number($end - $start + 1);
5252                         use integer;
5253
5254                         my $width = $tab_stops * 8 - 1;
5255                         $OUT[-1] = sprintf("%-*s # [%s]\n",
5256                                             $width,
5257                                             $OUT[-1],
5258                                             $count);
5259                         $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5260                     }
5261                     next RANGE;
5262                 }
5263
5264                 # Here to output a single code point per line
5265
5266                 # If not to annotate, use the simple formats
5267                 if (! $annotate) {
5268
5269                     # Use any passed in subroutine to output.
5270                     if (ref $range_size_1 eq 'CODE') {
5271                         for my $i ($start .. $end) {
5272                             push @OUT, &{$range_size_1}($i, $value);
5273                         }
5274                     }
5275                     else {
5276
5277                         # Here, caller is ok with default output.
5278                         for (my $i = $start; $i <= $end; $i++) {
5279                             push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5280                         }
5281                     }
5282                     next RANGE;
5283                 }
5284
5285                 # Here, wants annotation.
5286                 for (my $i = $start; $i <= $end; $i++) {
5287
5288                     # Get character information if don't have it already
5289                     main::populate_char_info($i)
5290                                         if ! defined $viacode[$i];
5291                     my $type = $annotate_char_type[$i];
5292
5293                     # Figure out if should output the next code points as part
5294                     # of a range or not.  If this is not in an annotation
5295                     # range, then won't output as a range, so returns $i.
5296                     # Otherwise use the end of the annotation range, but no
5297                     # further than the maximum possible end point of the loop.
5298                     my $range_end = main::min($annotate_ranges->value_of($i)
5299                                                                         || $i,
5300                                                $end);
5301
5302                     # Use a range if it is a range, and either is one of the
5303                     # special annotation ranges, or the range is at most 3
5304                     # long.  This last case causes the algorithmically named
5305                     # code points to be output individually in spans of at
5306                     # most 3, as they are the ones whose $type is > 0.
5307                     if ($range_end != $i
5308                         && ( $type < 0 || $range_end - $i > 2))
5309                     {
5310                         # Here is to output a range.  We don't allow a
5311                         # caller-specified output format--just use the
5312                         # standard one.
5313                         push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5314                                                                 $range_end,
5315                                                                 $value;
5316                         my $range_name = $viacode[$i];
5317
5318                         # For the code points which end in their hex value, we
5319                         # eliminate that from the output annotation, and
5320                         # capitalize only the first letter of each word.
5321                         if ($type == $CP_IN_NAME) {
5322                             my $hex = sprintf "%04X", $i;
5323                             $range_name =~ s/-$hex$//;
5324                             my @words = split " ", $range_name;
5325                             for my $word (@words) {
5326                                 $word = ucfirst(lc($word)) if $word ne 'CJK';
5327                             }
5328                             $range_name = join " ", @words;
5329                         }
5330                         elsif ($type == $HANGUL_SYLLABLE) {
5331                             $range_name = "Hangul Syllable";
5332                         }
5333
5334                         $OUT[-1] .= " $range_name" if $range_name;
5335
5336                         # Include the number of code points in the range
5337                         my $count = main::clarify_number($range_end - $i + 1);
5338                         $OUT[-1] .= " [$count]\n";
5339
5340                         # Skip to the end of the range
5341                         $i = $range_end;
5342                     }
5343                     else { # Not in a range.
5344                         my $comment = "";
5345
5346                         # When outputting the names of each character, use
5347                         # the character itself if printable
5348                         $comment .= "'" . chr($i) . "' " if $printable[$i];
5349
5350                         # To make it more readable, use a minimum indentation
5351                         my $comment_indent;
5352
5353                         # Determine the annotation
5354                         if ($format eq $DECOMP_STRING_FORMAT) {
5355
5356                             # This is very specialized, with the type of
5357                             # decomposition beginning the line enclosed in
5358                             # <...>, and the code points that the code point
5359                             # decomposes to separated by blanks.  Create two
5360                             # strings, one of the printable characters, and
5361                             # one of their official names.
5362                             (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5363                             my $tostr = "";
5364                             my $to_name = "";
5365                             my $to_chr = "";
5366                             foreach my $to (split " ", $map) {
5367                                 $to = CORE::hex $to;
5368                                 $to_name .= " + " if $to_name;
5369                                 $to_chr .= chr($to);
5370                                 main::populate_char_info($to)
5371                                                     if ! defined $viacode[$to];
5372                                 $to_name .=  $viacode[$to];
5373                             }
5374
5375                             $comment .=
5376                                     "=> '$to_chr'; $viacode[$i] => $to_name";
5377                             $comment_indent = 25;   # Determined by experiment
5378                         }
5379                         else {
5380
5381                             # Assume that any table that has hex format is a
5382                             # mapping of one code point to another.
5383                             if ($format eq $HEX_FORMAT) {
5384                                 my $decimal_value = CORE::hex $value;
5385                                 main::populate_char_info($decimal_value)
5386                                         if ! defined $viacode[$decimal_value];
5387                                 $comment .= "=> '"
5388                                          . chr($decimal_value)
5389                                          . "'; " if $printable[$decimal_value];
5390                             }
5391                             $comment .= $viacode[$i] if $include_name
5392                                                         && $viacode[$i];
5393                             if ($format eq $HEX_FORMAT) {
5394                                 my $decimal_value = CORE::hex $value;
5395                                 $comment .= " => $viacode[$decimal_value]"
5396                                                     if $viacode[$decimal_value];
5397                             }
5398
5399                             # If including the name, no need to indent, as the
5400                             # name will already be way across the line.
5401                             $comment_indent = ($include_name) ? 0 : 60;
5402                         }
5403
5404                         # Use any passed in routine to output the base part of
5405                         # the line.
5406                         if (ref $range_size_1 eq 'CODE') {
5407                             my $base_part = &{$range_size_1}($i, $value);
5408                             chomp $base_part;
5409                             push @OUT, $base_part;
5410                         }
5411                         else {
5412                             push @OUT, sprintf "%04X\t\t%s", $i, $value;
5413                         }
5414
5415                         # And add the annotation.
5416                         $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5417                                                          $OUT[-1],
5418                                                          $comment if $comment;
5419                         $OUT[-1] .= "\n";
5420                     }
5421                 }
5422             } # End of loop through all the table's ranges
5423         }
5424
5425         # Add anything that goes after the main body, but within the here
5426         # document,
5427         my $append_to_body = $self->append_to_body;
5428         push @OUT, $append_to_body if $append_to_body;
5429
5430         # And finish the here document.
5431         push @OUT, "END\n";
5432
5433         # Done with the main portion of the body.  Can now figure out what
5434         # should appear before it in the file.
5435         my $pre_body = $self->pre_body;
5436         push @HEADER, $pre_body, "\n" if $pre_body;
5437
5438         # All these files should have a .pl suffix added to them.
5439         my @file_with_pl = @{$file_path{$addr}};
5440         $file_with_pl[-1] .= '.pl';
5441
5442         main::write(\@file_with_pl,
5443                     $annotate,      # utf8 iff annotating
5444                     \@HEADER,
5445                     \@OUT);
5446         return;
5447     }
5448
5449     sub set_status {    # Set the table's status
5450         my $self = shift;
5451         my $status = shift; # The status enum value
5452         my $info = shift;   # Any message associated with it.
5453         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5454
5455         my $addr = do { no overloading; pack 'J', $self; };
5456
5457         $status{$addr} = $status;
5458         $status_info{$addr} = $info;
5459         return;
5460     }
5461
5462     sub set_fate {  # Set the fate of a table
5463         my $self = shift;
5464         my $fate = shift;
5465         my $reason = shift;
5466         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5467
5468         my $addr = do { no overloading; pack 'J', $self; };
5469
5470         return if $fate{$addr} == $fate;    # If no-op
5471
5472         # Can only change the ordinary fate, except if going to $MAP_PROXIED
5473         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5474
5475         $fate{$addr} = $fate;
5476
5477         # Don't document anything to do with a non-normal fated table
5478         if ($fate != $ORDINARY) {
5479             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5480             foreach my $alias ($self->aliases) {
5481                 $alias->set_ucd($put_in_pod);
5482
5483                 # MAP_PROXIED doesn't affect the match tables
5484                 next if $fate == $MAP_PROXIED;
5485                 $alias->set_make_re_pod_entry($put_in_pod);
5486             }
5487         }
5488
5489         # Save the reason for suppression for output
5490         if ($fate == $SUPPRESSED && defined $reason) {
5491             $why_suppressed{$complete_name{$addr}} = $reason;
5492         }
5493
5494         return;
5495     }
5496
5497     sub lock {
5498         # Don't allow changes to the table from now on.  This stores a stack
5499         # trace of where it was called, so that later attempts to modify it
5500         # can immediately show where it got locked.
5501
5502         my $self = shift;
5503         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5504
5505         my $addr = do { no overloading; pack 'J', $self; };
5506
5507         $locked{$addr} = "";
5508
5509         my $line = (caller(0))[2];
5510         my $i = 1;
5511
5512         # Accumulate the stack trace
5513         while (1) {
5514             my ($pkg, $file, $caller_line, $caller) = caller $i++;
5515
5516             last unless defined $caller;
5517
5518             $locked{$addr} .= "    called from $caller() at line $line\n";
5519             $line = $caller_line;
5520         }
5521         $locked{$addr} .= "    called from main at line $line\n";
5522
5523         return;
5524     }
5525
5526     sub carp_if_locked {
5527         # Return whether a table is locked or not, and, by the way, complain
5528         # if is locked
5529
5530         my $self = shift;
5531         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5532
5533         my $addr = do { no overloading; pack 'J', $self; };
5534
5535         return 0 if ! $locked{$addr};
5536         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5537         return 1;
5538     }
5539
5540     sub set_file_path { # Set the final directory path for this table
5541         my $self = shift;
5542         # Rest of parameters passed on
5543
5544         no overloading;
5545         @{$file_path{pack 'J', $self}} = @_;
5546         return
5547     }
5548
5549     # Accessors for the range list stored in this table.  First for
5550     # unconditional
5551     for my $sub (qw(
5552                     containing_range
5553                     contains
5554                     count
5555                     each_range
5556                     hash
5557                     is_empty
5558                     matches_identically_to
5559                     max
5560                     min
5561                     range_count
5562                     reset_each_range
5563                     type_of
5564                     value_of
5565                 ))
5566     {
5567         no strict "refs";
5568         *$sub = sub {
5569             use strict "refs";
5570             my $self = shift;
5571             return $self->_range_list->$sub(@_);
5572         }
5573     }
5574
5575     # Then for ones that should fail if locked
5576     for my $sub (qw(
5577                     delete_range
5578                 ))
5579     {
5580         no strict "refs";
5581         *$sub = sub {
5582             use strict "refs";
5583             my $self = shift;
5584
5585             return if $self->carp_if_locked;
5586             no overloading;
5587             return $self->_range_list->$sub(@_);
5588         }
5589     }
5590
5591 } # End closure
5592
5593 package Map_Table;
5594 use base '_Base_Table';
5595
5596 # A Map Table is a table that contains the mappings from code points to
5597 # values.  There are two weird cases:
5598 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5599 #    are written in the table's file at the end of the table nonetheless.  It
5600 #    requires specially constructed code to handle these; utf8.c can not read
5601 #    these in, so they should not go in $map_directory.  As of this writing,
5602 #    the only case that these happen is for named sequences used in
5603 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
5604 #    something else could come along that uses it.
5605 # 2) Specials are anything that doesn't fit syntactically into the body of the
5606 #    table.  The ranges for these have a map type of non-zero.  The code below
5607 #    knows about and handles each possible type.   In most cases, these are
5608 #    written as part of the header.
5609 #
5610 # A map table deliberately can't be manipulated at will unlike match tables.
5611 # This is because of the ambiguities having to do with what to do with
5612 # overlapping code points.  And there just isn't a need for those things;
5613 # what one wants to do is just query, add, replace, or delete mappings, plus
5614 # write the final result.
5615 # However, there is a method to get the list of possible ranges that aren't in
5616 # this table to use for defaulting missing code point mappings.  And,
5617 # map_add_or_replace_non_nulls() does allow one to add another table to this
5618 # one, but it is clearly very specialized, and defined that the other's
5619 # non-null values replace this one's if there is any overlap.
5620
5621 sub trace { return main::trace(@_); }
5622
5623 { # Closure
5624
5625     main::setup_package();
5626
5627     my %default_map;
5628     # Many input files omit some entries; this gives what the mapping for the
5629     # missing entries should be
5630     main::set_access('default_map', \%default_map, 'r');
5631
5632     my %anomalous_entries;
5633     # Things that go in the body of the table which don't fit the normal
5634     # scheme of things, like having a range.  Not much can be done with these
5635     # once there except to output them.  This was created to handle named
5636     # sequences.
5637     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5638     main::set_access('anomalous_entries',       # Append singular, read plural
5639                     \%anomalous_entries,
5640                     'readable_array');
5641
5642     my %to_output_map;
5643     # Enum as to whether or not to write out this map table:
5644     #   0               don't output
5645     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
5646     #                   it should not be removed nor its format changed.  This
5647     #                   is done for those files that have traditionally been
5648     #                   output.
5649     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
5650     #                   with this file
5651     main::set_access('to_output_map', \%to_output_map, 's');
5652
5653
5654     sub new {
5655         my $class = shift;
5656         my $name = shift;
5657
5658         my %args = @_;
5659
5660         # Optional initialization data for the table.
5661         my $initialize = delete $args{'Initialize'};
5662
5663         my $default_map = delete $args{'Default_Map'};
5664         my $property = delete $args{'_Property'};
5665         my $full_name = delete $args{'Full_Name'};
5666
5667         # Rest of parameters passed on
5668
5669         my $range_list = Range_Map->new(Owner => $property);
5670
5671         my $self = $class->SUPER::new(
5672                                     Name => $name,
5673                                     Complete_Name =>  $full_name,
5674                                     Full_Name => $full_name,
5675                                     _Property => $property,
5676                                     _Range_List => $range_list,
5677                                     %args);
5678
5679         my $addr = do { no overloading; pack 'J', $self; };
5680
5681         $anomalous_entries{$addr} = [];
5682         $default_map{$addr} = $default_map;
5683
5684         $self->initialize($initialize) if defined $initialize;
5685
5686         return $self;
5687     }
5688
5689     use overload
5690         fallback => 0,
5691         qw("") => "_operator_stringify",
5692     ;
5693
5694     sub _operator_stringify {
5695         my $self = shift;
5696
5697         my $name = $self->property->full_name;
5698         $name = '""' if $name eq "";
5699         return "Map table for Property '$name'";
5700     }
5701
5702     sub add_alias {
5703         # Add a synonym for this table (which means the property itself)
5704         my $self = shift;
5705         my $name = shift;
5706         # Rest of parameters passed on.
5707
5708         $self->SUPER::add_alias($name, $self->property, @_);
5709         return;
5710     }
5711
5712     sub add_map {
5713         # Add a range of code points to the list of specially-handled code
5714         # points.  $MULTI_CP is assumed if the type of special is not passed
5715         # in.
5716
5717         my $self = shift;
5718         my $lower = shift;
5719         my $upper = shift;
5720         my $string = shift;
5721         my %args = @_;
5722
5723         my $type = delete $args{'Type'} || 0;
5724         # Rest of parameters passed on
5725
5726         # Can't change the table if locked.
5727         return if $self->carp_if_locked;
5728
5729         my $addr = do { no overloading; pack 'J', $self; };
5730
5731         $self->_range_list->add_map($lower, $upper,
5732                                     $string,
5733                                     @_,
5734                                     Type => $type);
5735         return;
5736     }
5737
5738     sub append_to_body {
5739         # Adds to the written HERE document of the table's body any anomalous
5740         # entries in the table..
5741
5742         my $self = shift;
5743         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5744
5745         my $addr = do { no overloading; pack 'J', $self; };
5746
5747         return "" unless @{$anomalous_entries{$addr}};
5748         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5749     }
5750
5751     sub map_add_or_replace_non_nulls {
5752         # This adds the mappings in the table $other to $self.  Non-null
5753         # mappings from $other override those in $self.  It essentially merges
5754         # the two tables, with the second having priority except for null
5755         # mappings.
5756
5757         my $self = shift;
5758         my $other = shift;
5759         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5760
5761         return if $self->carp_if_locked;
5762
5763         if (! $other->isa(__PACKAGE__)) {
5764             Carp::my_carp_bug("$other should be a "
5765                         . __PACKAGE__
5766                         . ".  Not a '"
5767                         . ref($other)
5768                         . "'.  Not added;");
5769             return;
5770         }
5771
5772         my $addr = do { no overloading; pack 'J', $self; };
5773         my $other_addr = do { no overloading; pack 'J', $other; };
5774
5775         local $to_trace = 0 if main::DEBUG;
5776
5777         my $self_range_list = $self->_range_list;
5778         my $other_range_list = $other->_range_list;
5779         foreach my $range ($other_range_list->ranges) {
5780             my $value = $range->value;
5781             next if $value eq "";
5782             $self_range_list->_add_delete('+',
5783                                           $range->start,
5784                                           $range->end,
5785                                           $value,
5786                                           Type => $range->type,
5787                                           Replace => $UNCONDITIONALLY);
5788         }
5789
5790         return;
5791     }
5792
5793     sub set_default_map {
5794         # Define what code points that are missing from the input files should
5795         # map to
5796
5797         my $self = shift;
5798         my $map = shift;
5799         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5800
5801         my $addr = do { no overloading; pack 'J', $self; };
5802
5803         # Convert the input to the standard equivalent, if any (won't have any
5804         # for $STRING properties)
5805         my $standard = $self->_find_table_from_alias->{$map};
5806         $map = $standard->name if defined $standard;
5807
5808         # Warn if there already is a non-equivalent default map for this
5809         # property.  Note that a default map can be a ref, which means that
5810         # what it actually means is delayed until later in the program, and it
5811         # IS permissible to override it here without a message.
5812         my $default_map = $default_map{$addr};
5813         if (defined $default_map
5814             && ! ref($default_map)
5815             && $default_map ne $map
5816             && main::Standardize($map) ne $default_map)
5817         {
5818             my $property = $self->property;
5819             my $map_table = $property->table($map);
5820             my $default_table = $property->table($default_map);
5821             if (defined $map_table
5822                 && defined $default_table
5823                 && $map_table != $default_table)
5824             {
5825                 Carp::my_carp("Changing the default mapping for "
5826                             . $property
5827                             . " from $default_map to $map'");
5828             }
5829         }
5830
5831         $default_map{$addr} = $map;
5832
5833         # Don't also create any missing table for this map at this point,
5834         # because if we did, it could get done before the main table add is
5835         # done for PropValueAliases.txt; instead the caller will have to make
5836         # sure it exists, if desired.
5837         return;
5838     }
5839
5840     sub to_output_map {
5841         # Returns boolean: should we write this map table?
5842
5843         my $self = shift;
5844         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5845
5846         my $addr = do { no overloading; pack 'J', $self; };
5847
5848         # If overridden, use that
5849         return $to_output_map{$addr} if defined $to_output_map{$addr};
5850
5851         my $full_name = $self->full_name;
5852         return $global_to_output_map{$full_name}
5853                                 if defined $global_to_output_map{$full_name};
5854
5855         # If table says to output, do so; if says to suppress it, do so.
5856         my $fate = $self->fate;
5857         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
5858         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
5859         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
5860
5861         my $type = $self->property->type;
5862
5863         # Don't want to output binary map tables even for debugging.
5864         return 0 if $type == $BINARY;
5865
5866         # But do want to output string ones.
5867         return $EXTERNAL_MAP if $type == $STRING;
5868
5869         # Otherwise is an $ENUM, do output it, for Perl's purposes
5870         return $INTERNAL_MAP;
5871     }
5872
5873     sub inverse_list {
5874         # Returns a Range_List that is gaps of the current table.  That is,
5875         # the inversion
5876
5877         my $self = shift;
5878         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5879
5880         my $current = Range_List->new(Initialize => $self->_range_list,
5881                                 Owner => $self->property);
5882         return ~ $current;
5883     }
5884
5885     sub header {
5886         my $self = shift;
5887         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5888
5889         my $return = $self->SUPER::header();
5890
5891         if ($self->to_output_map == $INTERNAL_MAP) {
5892             $return .= $INTERNAL_ONLY_HEADER;
5893         }
5894         else {
5895             my $property_name = $self->property->full_name;
5896             $return .= <<END;
5897
5898 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
5899
5900 # This file is for internal use by core Perl only.  It is retained for
5901 # backwards compatibility with applications that may have come to rely on it,
5902 # but its format and even its name or existence are subject to change without
5903 # notice in a future Perl version.  Don't use it directly.  Instead, its
5904 # contents are now retrievable through a stable API in the Unicode::UCD
5905 # module: Unicode::UCD::prop_invmap('$property_name').
5906 END
5907         }
5908         return $return;
5909     }
5910
5911     sub set_final_comment {
5912         # Just before output, create the comment that heads the file
5913         # containing this table.
5914
5915         return unless $debugging_build;
5916
5917         my $self = shift;
5918         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5919
5920         # No sense generating a comment if aren't going to write it out.
5921         return if ! $self->to_output_map;
5922
5923         my $addr = do { no overloading; pack 'J', $self; };
5924
5925         my $property = $self->property;
5926
5927         # Get all the possible names for this property.  Don't use any that
5928         # aren't ok for use in a file name, etc.  This is perhaps causing that
5929         # flag to do double duty, and may have to be changed in the future to
5930         # have our own flag for just this purpose; but it works now to exclude
5931         # Perl generated synonyms from the lists for properties, where the
5932         # name is always the proper Unicode one.
5933         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
5934
5935         my $count = $self->count;
5936         my $default_map = $default_map{$addr};
5937
5938         # The ranges that map to the default aren't output, so subtract that
5939         # to get those actually output.  A property with matching tables
5940         # already has the information calculated.
5941         if ($property->type != $STRING) {
5942             $count -= $property->table($default_map)->count;
5943         }
5944         elsif (defined $default_map) {
5945
5946             # But for $STRING properties, must calculate now.  Subtract the
5947             # count from each range that maps to the default.
5948             foreach my $range ($self->_range_list->ranges) {
5949                 if ($range->value eq $default_map) {
5950                     $count -= $range->end +1 - $range->start;
5951                 }
5952             }
5953
5954         }
5955
5956         # Get a  string version of $count with underscores in large numbers,
5957         # for clarity.
5958         my $string_count = main::clarify_number($count);
5959
5960         my $code_points = ($count == 1)
5961                         ? 'single code point'
5962                         : "$string_count code points";
5963
5964         my $mapping;
5965         my $these_mappings;
5966         my $are;
5967         if (@property_aliases <= 1) {
5968             $mapping = 'mapping';
5969             $these_mappings = 'this mapping';
5970             $are = 'is'
5971         }
5972         else {
5973             $mapping = 'synonymous mappings';
5974             $these_mappings = 'these mappings';
5975             $are = 'are'
5976         }
5977         my $cp;
5978         if ($count >= $MAX_UNICODE_CODEPOINTS) {
5979             $cp = "any code point in Unicode Version $string_version";
5980         }
5981         else {
5982             my $map_to;
5983             if ($default_map eq "") {
5984                 $map_to = 'the null string';
5985             }
5986             elsif ($default_map eq $CODE_POINT) {
5987                 $map_to = "itself";
5988             }
5989             else {
5990                 $map_to = "'$default_map'";
5991             }
5992             if ($count == 1) {
5993                 $cp = "the single code point";
5994             }
5995             else {
5996                 $cp = "one of the $code_points";
5997             }
5998             $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5999         }
6000
6001         my $comment = "";
6002
6003         my $status = $self->status;
6004         if ($status) {
6005             my $warn = uc $status_past_participles{$status};
6006             $comment .= <<END;
6007
6008 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
6009  All property or property=value combinations contained in this file are $warn.
6010  See $unicode_reference_url for what this means.
6011
6012 END
6013         }
6014         $comment .= "This file returns the $mapping:\n";
6015
6016         for my $i (0 .. @property_aliases - 1) {
6017             $comment .= sprintf("%-8s%s\n",
6018                                 " ",
6019                                 $property_aliases[$i]->name . '(cp)'
6020                                 );
6021         }
6022         my $full_name = $self->property->full_name;
6023         $comment .= "\nwhere 'cp' is $cp.  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6024
6025         # And append any commentary already set from the actual property.
6026         $comment .= "\n\n" . $self->comment if $self->comment;
6027         if ($self->description) {
6028             $comment .= "\n\n" . join " ", $self->description;
6029         }
6030         if ($self->note) {
6031             $comment .= "\n\n" . join " ", $self->note;
6032         }
6033         $comment .= "\n";
6034
6035         if (! $self->perl_extension) {
6036             $comment .= <<END;
6037
6038 For information about what this property really means, see:
6039 $unicode_reference_url
6040 END
6041         }
6042
6043         if ($count) {        # Format differs for empty table
6044                 $comment.= "\nThe format of the ";
6045             if ($self->range_size_1) {
6046                 $comment.= <<END;
6047 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6048 is in hex; MAPPING is what CODE_POINT maps to.
6049 END
6050             }
6051             else {
6052
6053                 # There are tables which end up only having one element per
6054                 # range, but it is not worth keeping track of for making just
6055                 # this comment a little better.
6056                 $comment.= <<END;
6057 non-comment portions of the main body of lines of this file is:
6058 START\\tSTOP\\tMAPPING where START is the starting code point of the
6059 range, in hex; STOP is the ending point, or if omitted, the range has just one
6060 code point; MAPPING is what each code point between START and STOP maps to.
6061 END
6062                 if ($self->output_range_counts) {
6063                     $comment .= <<END;
6064 Numbers in comments in [brackets] indicate how many code points are in the
6065 range (omitted when the range is a single code point or if the mapping is to
6066 the null string).
6067 END
6068                 }
6069             }
6070         }
6071         $self->set_comment(main::join_lines($comment));
6072         return;
6073     }
6074
6075     my %swash_keys; # Makes sure don't duplicate swash names.
6076
6077     # The remaining variables are temporaries used while writing each table,
6078     # to output special ranges.
6079     my @multi_code_point_maps;  # Map is to more than one code point.
6080
6081     sub handle_special_range {
6082         # Called in the middle of write when it finds a range it doesn't know
6083         # how to handle.
6084
6085         my $self = shift;
6086         my $range = shift;
6087         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6088
6089         my $addr = do { no overloading; pack 'J', $self; };
6090
6091         my $type = $range->type;
6092
6093         my $low = $range->start;
6094         my $high = $range->end;
6095         my $map = $range->value;
6096
6097         # No need to output the range if it maps to the default.
6098         return if $map eq $default_map{$addr};
6099
6100         my $property = $self->property;
6101
6102         # Switch based on the map type...
6103         if ($type == $HANGUL_SYLLABLE) {
6104
6105             # These are entirely algorithmically determinable based on
6106             # some constants furnished by Unicode; for now, just set a
6107             # flag to indicate that have them.  After everything is figured
6108             # out, we will output the code that does the algorithm.  (Don't
6109             # output them if not needed because we are suppressing this
6110             # property.)
6111             $has_hangul_syllables = 1 if $property->to_output_map;
6112         }
6113         elsif ($type == $CP_IN_NAME) {
6114
6115             # Code points whose name ends in their code point are also
6116             # algorithmically determinable, but need information about the map
6117             # to do so.  Both the map and its inverse are stored in data
6118             # structures output in the file.  They are stored in the mean time
6119             # in global lists The lists will be written out later into Name.pm,
6120             # which is created only if needed.  In order to prevent duplicates
6121             # in the list, only add to them for one property, should multiple
6122             # ones need them.
6123             if ($needing_code_points_ending_in_code_point == 0) {
6124                 $needing_code_points_ending_in_code_point = $property;
6125             }
6126             if ($property == $needing_code_points_ending_in_code_point) {
6127                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6128                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6129
6130                 my $squeezed = $map =~ s/[-\s]+//gr;
6131                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6132                                                                           $low;
6133                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6134                                                                          $high;
6135
6136                 push @code_points_ending_in_code_point, { low => $low,
6137                                                         high => $high,
6138                                                         name => $map
6139                                                         };
6140             }
6141         }
6142         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6143
6144             # Multi-code point maps and null string maps have an entry
6145             # for each code point in the range.  They use the same
6146             # output format.
6147             for my $code_point ($low .. $high) {
6148
6149                 # The pack() below can't cope with surrogates.  XXX This may
6150                 # no longer be true
6151                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6152                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
6153                     next;
6154                 }
6155
6156                 # Generate the hash entries for these in the form that
6157                 # utf8.c understands.
6158                 my $tostr = "";
6159                 my $to_name = "";
6160                 my $to_chr = "";
6161                 foreach my $to (split " ", $map) {
6162                     if ($to !~ /^$code_point_re$/) {
6163                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
6164                         next;
6165                     }
6166                     $tostr .= sprintf "\\x{%s}", $to;
6167                     $to = CORE::hex $to;
6168                     if ($annotate) {
6169                         $to_name .= " + " if $to_name;
6170                         $to_chr .= chr($to);
6171                         main::populate_char_info($to)
6172                                             if ! defined $viacode[$to];
6173                         $to_name .=  $viacode[$to];
6174                     }
6175                 }
6176
6177                 # I (khw) have never waded through this line to
6178                 # understand it well enough to comment it.
6179                 my $utf8 = sprintf(qq["%s" => "$tostr",],
6180                         join("", map { sprintf "\\x%02X", $_ }
6181                             unpack("U0C*", pack("U", $code_point))));
6182
6183                 # Add a comment so that a human reader can more easily
6184                 # see what's going on.
6185                 push @multi_code_point_maps,
6186                         sprintf("%-45s # U+%04X", $utf8, $code_point);
6187                 if (! $annotate) {
6188                     $multi_code_point_maps[-1] .= " => $map";
6189                 }
6190                 else {
6191                     main::populate_char_info($code_point)
6192                                     if ! defined $viacode[$code_point];
6193                     $multi_code_point_maps[-1] .= " '"
6194                         . chr($code_point)
6195                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6196                 }
6197             }
6198         }
6199         else {
6200             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
6201         }
6202
6203         return;
6204     }
6205
6206     sub pre_body {
6207         # Returns the string that should be output in the file before the main
6208         # body of this table.  It isn't called until the main body is
6209         # calculated, saving a pass.  The string includes some hash entries
6210         # identifying the format of the body, and what the single value should
6211         # be for all ranges missing from it.  It also includes any code points
6212         # which have map_types that don't go in the main table.
6213
6214         my $self = shift;
6215         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6216
6217         my $addr = do { no overloading; pack 'J', $self; };
6218
6219         my $name = $self->property->swash_name;
6220
6221         # Currently there is nothing in the pre_body unless a swash is being
6222         # generated.
6223         return unless defined $name;
6224
6225         if (defined $swash_keys{$name}) {
6226             Carp::my_carp(join_lines(<<END
6227 Already created a swash name '$name' for $swash_keys{$name}.  This means that
6228 the same name desired for $self shouldn't be used.  Bad News.  This must be
6229 fixed before production use, but proceeding anyway
6230 END
6231             ));
6232         }
6233         $swash_keys{$name} = "$self";
6234
6235         my $pre_body = "";
6236
6237         # Here we assume we were called after have gone through the whole
6238         # file.  If we actually generated anything for each map type, add its
6239         # respective header and trailer
6240         my $specials_name = "";
6241         if (@multi_code_point_maps) {
6242             $specials_name = "utf8::ToSpec$name";
6243             $pre_body .= <<END;
6244
6245 # Some code points require special handling because their mappings are each to
6246 # multiple code points.  These do not appear in the main body, but are defined
6247 # in the hash below.
6248
6249 # Each key is the string of N bytes that together make up the UTF-8 encoding
6250 # for the code point.  (i.e. the same as looking at the code point's UTF-8
6251 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
6252 \%$specials_name = (
6253 END
6254             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6255         }
6256
6257         my $format = $self->format;
6258
6259         my $return = <<END;
6260 # The name this swash is to be known by, with the format of the mappings in
6261 # the main body of the table, and what all code points missing from this file
6262 # map to.
6263 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6264 END
6265         if ($specials_name) {
6266         $return .= <<END;
6267 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6268 END
6269         }
6270         my $default_map = $default_map{$addr};
6271         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6272
6273         if ($default_map eq $CODE_POINT) {
6274             $return .= ' # code point maps to itself';
6275         }
6276         elsif ($default_map eq "") {
6277             $return .= ' # code point maps to the null string';
6278         }
6279         $return .= "\n";
6280
6281         $return .= $pre_body;
6282
6283         return $return;
6284     }
6285
6286     sub write {
6287         # Write the table to the file.
6288
6289         my $self = shift;
6290         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6291
6292         my $addr = do { no overloading; pack 'J', $self; };
6293
6294         # Clear the temporaries
6295         undef @multi_code_point_maps;
6296
6297         # Calculate the format of the table if not already done.
6298         my $format = $self->format;
6299         my $type = $self->property->type;
6300         my $default_map = $self->default_map;
6301         if (! defined $format) {
6302             if ($type == $BINARY) {
6303
6304                 # Don't bother checking the values, because we elsewhere
6305                 # verify that a binary table has only 2 values.
6306                 $format = $BINARY_FORMAT;
6307             }
6308             else {
6309                 my @ranges = $self->_range_list->ranges;
6310
6311                 # default an empty table based on its type and default map
6312                 if (! @ranges) {
6313
6314                     # But it turns out that the only one we can say is a
6315                     # non-string (besides binary, handled above) is when the
6316                     # table is a string and the default map is to a code point
6317                     if ($type == $STRING && $default_map eq $CODE_POINT) {
6318                         $format = $HEX_FORMAT;
6319                     }
6320                     else {
6321                         $format = $STRING_FORMAT;
6322                     }
6323                 }
6324                 else {
6325
6326                     # Start with the most restrictive format, and as we find
6327                     # something that doesn't fit with that, change to the next
6328                     # most restrictive, and so on.
6329                     $format = $DECIMAL_FORMAT;
6330                     foreach my $range (@ranges) {
6331                         next if $range->type != 0;  # Non-normal ranges don't
6332                                                     # affect the main body
6333                         my $map = $range->value;
6334                         if ($map ne $default_map) {
6335                             last if $format eq $STRING_FORMAT;  # already at
6336                                                                 # least
6337                                                                 # restrictive
6338                             $format = $INTEGER_FORMAT
6339                                                 if $format eq $DECIMAL_FORMAT
6340                                                     && $map !~ / ^ [0-9] $ /x;
6341                             $format = $FLOAT_FORMAT
6342                                             if $format eq $INTEGER_FORMAT
6343                                                 && $map !~ / ^ -? [0-9]+ $ /x;
6344                             $format = $RATIONAL_FORMAT
6345                                 if $format eq $FLOAT_FORMAT
6346                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6347                             $format = $HEX_FORMAT
6348                             if $format eq $RATIONAL_FORMAT
6349                                 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6350                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6351                                                        && $map =~ /[^0-9A-F]/;
6352                         }
6353                     }
6354                 }
6355             }
6356         } # end of calculating format
6357
6358         if ($default_map eq $CODE_POINT
6359             && $format ne $HEX_FORMAT
6360             && ! defined $self->format)    # manual settings are always
6361                                            # considered ok
6362         {
6363             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6364         }
6365
6366         $self->_set_format($format);
6367
6368         # Core Perl has a different definition of mapping ranges than we do,
6369         # that is applicable mainly to mapping code points, so for tables
6370         # where it is possible that core Perl could be used to read it,
6371         # make it range size 1 to prevent possible confusion
6372         $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6373
6374         return $self->SUPER::write(
6375             ($self->property == $block)
6376                 ? 7     # block file needs more tab stops
6377                 : 3,
6378             $default_map);   # don't write defaulteds
6379     }
6380
6381     # Accessors for the underlying list that should fail if locked.
6382     for my $sub (qw(
6383                     add_duplicate
6384                 ))
6385     {
6386         no strict "refs";
6387         *$sub = sub {
6388             use strict "refs";
6389             my $self = shift;
6390
6391             return if $self->carp_if_locked;
6392             return $self->_range_list->$sub(@_);
6393         }
6394     }
6395 } # End closure for Map_Table
6396
6397 package Match_Table;
6398 use base '_Base_Table';
6399
6400 # A Match table is one which is a list of all the code points that have
6401 # the same property and property value, for use in \p{property=value}
6402 # constructs in regular expressions.  It adds very little data to the base
6403 # structure, but many methods, as these lists can be combined in many ways to
6404 # form new ones.
6405 # There are only a few concepts added:
6406 # 1) Equivalents and Relatedness.
6407 #    Two tables can match the identical code points, but have different names.
6408 #    This always happens when there is a perl single form extension
6409 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
6410 #    tables are set to be related, with the Perl extension being a child, and
6411 #    the Unicode property being the parent.
6412 #
6413 #    It may be that two tables match the identical code points and we don't
6414 #    know if they are related or not.  This happens most frequently when the
6415 #    Block and Script properties have the exact range.  But note that a
6416 #    revision to Unicode could add new code points to the script, which would
6417 #    now have to be in a different block (as the block was filled, or there
6418 #    would have been 'Unknown' script code points in it and they wouldn't have
6419 #    been identical).  So we can't rely on any two properties from Unicode
6420 #    always matching the same code points from release to release, and thus
6421 #    these tables are considered coincidentally equivalent--not related.  When
6422 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
6423 #    'leader', and the others are 'equivalents'.  This concept is useful
6424 #    to minimize the number of tables written out.  Only one file is used for
6425 #    any identical set of code points, with entries in Heavy.pl mapping all
6426 #    the involved tables to it.
6427 #
6428 #    Related tables will always be identical; we set them up to be so.  Thus
6429 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
6430 #    unrelated tables.  Relatedness makes generating the documentation easier.
6431 #
6432 # 2) Complement.
6433 #    Like equivalents, two tables may be the inverses of each other, the
6434 #    intersection between them is null, and the union is every Unicode code
6435 #    point.  The two tables that occupy a binary property are necessarily like
6436 #    this.  By specifying one table as the complement of another, we can avoid
6437 #    storing it on disk (using the other table and performing a fast
6438 #    transform), and some memory and calculations.
6439 #
6440 # 3) Conflicting.  It may be that there will eventually be name clashes, with
6441 #    the same name meaning different things.  For a while, there actually were
6442 #    conflicts, but they have so far been resolved by changing Perl's or
6443 #    Unicode's definitions to match the other, but when this code was written,
6444 #    it wasn't clear that that was what was going to happen.  (Unicode changed
6445 #    because of protests during their beta period.)  Name clashes are warned
6446 #    about during compilation, and the documentation.  The generated tables
6447 #    are sane, free of name clashes, because the code suppresses the Perl
6448 #    version.  But manual intervention to decide what the actual behavior
6449 #    should be may be required should this happen.  The introductory comments
6450 #    have more to say about this.
6451
6452 sub standardize { return main::standardize($_[0]); }
6453 sub trace { return main::trace(@_); }
6454
6455
6456 { # Closure
6457
6458     main::setup_package();
6459
6460     my %leader;
6461     # The leader table of this one; initially $self.
6462     main::set_access('leader', \%leader, 'r');
6463
6464     my %equivalents;
6465     # An array of any tables that have this one as their leader
6466     main::set_access('equivalents', \%equivalents, 'readable_array');
6467
6468     my %parent;
6469     # The parent table to this one, initially $self.  This allows us to
6470     # distinguish between equivalent tables that are related (for which this
6471     # is set to), and those which may not be, but share the same output file
6472     # because they match the exact same set of code points in the current
6473     # Unicode release.
6474     main::set_access('parent', \%parent, 'r');
6475
6476     my %children;
6477     # An array of any tables that have this one as their parent
6478     main::set_access('children', \%children, 'readable_array');
6479
6480     my %conflicting;
6481     # Array of any tables that would have the same name as this one with
6482     # a different meaning.  This is used for the generated documentation.
6483     main::set_access('conflicting', \%conflicting, 'readable_array');
6484
6485     my %matches_all;
6486     # Set in the constructor for tables that are expected to match all code
6487     # points.
6488     main::set_access('matches_all', \%matches_all, 'r');
6489
6490     my %complement;
6491     # Points to the complement that this table is expressed in terms of; 0 if
6492     # none.
6493     main::set_access('complement', \%complement, 'r');
6494
6495     sub new {
6496         my $class = shift;
6497
6498         my %args = @_;
6499
6500         # The property for which this table is a listing of property values.
6501         my $property = delete $args{'_Property'};
6502
6503         my $name = delete $args{'Name'};
6504         my $full_name = delete $args{'Full_Name'};
6505         $full_name = $name if ! defined $full_name;
6506
6507         # Optional
6508         my $initialize = delete $args{'Initialize'};
6509         my $matches_all = delete $args{'Matches_All'} || 0;
6510         my $format = delete $args{'Format'};
6511         # Rest of parameters passed on.
6512
6513         my $range_list = Range_List->new(Initialize => $initialize,
6514                                          Owner => $property);
6515
6516         my $complete = $full_name;
6517         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
6518                                               # but this helps debug if it
6519                                               # does
6520         # The complete name for a match table includes it's property in a
6521         # compound form 'property=table', except if the property is the
6522         # pseudo-property, perl, in which case it is just the single form,
6523         # 'table' (If you change the '=' must also change the ':' in lots of
6524         # places in this program that assume an equal sign)
6525         $complete = $property->full_name . "=$complete" if $property != $perl;
6526
6527         my $self = $class->SUPER::new(%args,
6528                                       Name => $name,
6529                                       Complete_Name => $complete,
6530                                       Full_Name => $full_name,
6531                                       _Property => $property,
6532                                       _Range_List => $range_list,
6533                                       Format => $EMPTY_FORMAT,
6534                                       );
6535         my $addr = do { no overloading; pack 'J', $self; };
6536
6537         $conflicting{$addr} = [ ];
6538         $equivalents{$addr} = [ ];
6539         $children{$addr} = [ ];
6540         $matches_all{$addr} = $matches_all;
6541         $leader{$addr} = $self;
6542         $parent{$addr} = $self;
6543         $complement{$addr} = 0;
6544
6545         if (defined $format && $format ne $EMPTY_FORMAT) {
6546             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
6547         }
6548
6549         return $self;
6550     }
6551
6552     # See this program's beginning comment block about overloading these.
6553     use overload
6554         fallback => 0,
6555         qw("") => "_operator_stringify",
6556         '=' => sub {
6557                     my $self = shift;
6558
6559                     return if $self->carp_if_locked;
6560                     return $self;
6561                 },
6562
6563         '+' => sub {
6564                         my $self = shift;
6565                         my $other = shift;
6566
6567                         return $self->_range_list + $other;
6568                     },
6569         '&' => sub {
6570                         my $self = shift;
6571                         my $other = shift;
6572
6573                         return $self->_range_list & $other;
6574                     },
6575         '+=' => sub {
6576                         my $self = shift;
6577                         my $other = shift;
6578
6579                         return if $self->carp_if_locked;
6580
6581                         my $addr = do { no overloading; pack 'J', $self; };
6582
6583                         if (ref $other) {
6584
6585                             # Change the range list of this table to be the
6586                             # union of the two.
6587                             $self->_set_range_list($self->_range_list
6588                                                     + $other);
6589                         }
6590                         else {    # $other is just a simple value
6591                             $self->add_range($other, $other);
6592                         }
6593                         return $self;
6594                     },
6595         '-' => sub { my $self = shift;
6596                     my $other = shift;
6597                     my $reversed = shift;
6598
6599                     if ($reversed) {
6600                         Carp::my_carp_bug("Can't cope with a "
6601                             .  __PACKAGE__
6602                             . " being the first parameter in a '-'.  Subtraction ignored.");
6603                         return;
6604                     }
6605
6606                     return $self->_range_list - $other;
6607                 },
6608         '~' => sub { my $self = shift;
6609                     return ~ $self->_range_list;
6610                 },
6611     ;
6612
6613     sub _operator_stringify {
6614         my $self = shift;
6615
6616         my $name = $self->complete_name;
6617         return "Table '$name'";
6618     }
6619
6620     sub _range_list {
6621         # Returns the range list associated with this table, which will be the
6622         # complement's if it has one.
6623
6624         my $self = shift;
6625         my $complement;
6626         if (($complement = $self->complement) != 0) {
6627             return ~ $complement->_range_list;
6628         }
6629         else {
6630             return $self->SUPER::_range_list;
6631         }
6632     }
6633
6634     sub add_alias {
6635         # Add a synonym for this table.  See the comments in the base class
6636
6637         my $self = shift;
6638         my $name = shift;
6639         # Rest of parameters passed on.
6640
6641         $self->SUPER::add_alias($name, $self, @_);
6642         return;
6643     }
6644
6645     sub add_conflicting {
6646         # Add the name of some other object to the list of ones that name
6647         # clash with this match table.
6648
6649         my $self = shift;
6650         my $conflicting_name = shift;   # The name of the conflicting object
6651         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
6652         my $conflicting_object = shift; # Optional, the conflicting object
6653                                         # itself.  This is used to
6654                                         # disambiguate the text if the input
6655                                         # name is identical to any of the
6656                                         # aliases $self is known by.
6657                                         # Sometimes the conflicting object is
6658                                         # merely hypothetical, so this has to
6659                                         # be an optional parameter.
6660         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6661
6662         my $addr = do { no overloading; pack 'J', $self; };
6663
6664         # Check if the conflicting name is exactly the same as any existing
6665         # alias in this table (as long as there is a real object there to
6666         # disambiguate with).
6667         if (defined $conflicting_object) {
6668             foreach my $alias ($self->aliases) {
6669                 if ($alias->name eq $conflicting_name) {
6670
6671                     # Here, there is an exact match.  This results in
6672                     # ambiguous comments, so disambiguate by changing the
6673                     # conflicting name to its object's complete equivalent.
6674                     $conflicting_name = $conflicting_object->complete_name;
6675                     last;
6676                 }
6677             }
6678         }
6679
6680         # Convert to the \p{...} final name
6681         $conflicting_name = "\\$p" . "{$conflicting_name}";
6682
6683         # Only add once
6684         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6685
6686         push @{$conflicting{$addr}}, $conflicting_name;
6687
6688         return;
6689     }
6690
6691     sub is_set_equivalent_to {
6692         # Return boolean of whether or not the other object is a table of this
6693         # type and has been marked equivalent to this one.
6694
6695         my $self = shift;
6696         my $other = shift;
6697         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6698
6699         return 0 if ! defined $other; # Can happen for incomplete early
6700                                       # releases
6701         unless ($other->isa(__PACKAGE__)) {
6702             my $ref_other = ref $other;
6703             my $ref_self = ref $self;
6704             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
6705             return 0;
6706         }
6707
6708         # Two tables are equivalent if they have the same leader.
6709         no overloading;
6710         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
6711         return;
6712     }
6713
6714     sub set_equivalent_to {
6715         # Set $self equivalent to the parameter table.
6716         # The required Related => 'x' parameter is a boolean indicating
6717         # whether these tables are related or not.  If related, $other becomes
6718         # the 'parent' of $self; if unrelated it becomes the 'leader'
6719         #
6720         # Related tables share all characteristics except names; equivalents
6721         # not quite so many.
6722         # If they are related, one must be a perl extension.  This is because
6723         # we can't guarantee that Unicode won't change one or the other in a
6724         # later release even if they are identical now.
6725
6726         my $self = shift;
6727         my $other = shift;
6728
6729         my %args = @_;
6730         my $related = delete $args{'Related'};
6731
6732         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6733
6734         return if ! defined $other;     # Keep on going; happens in some early
6735                                         # Unicode releases.
6736
6737         if (! defined $related) {
6738             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
6739             $related = 0;
6740         }
6741
6742         # If already are equivalent, no need to re-do it;  if subroutine
6743         # returns null, it found an error, also do nothing
6744         my $are_equivalent = $self->is_set_equivalent_to($other);
6745         return if ! defined $are_equivalent || $are_equivalent;
6746
6747         my $addr = do { no overloading; pack 'J', $self; };
6748         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
6749
6750         if ($related) {
6751             if ($current_leader->perl_extension) {
6752                 if ($other->perl_extension) {
6753                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6754                     return;
6755                 }
6756             } elsif ($self->property != $other->property    # Depending on
6757                                                             # situation, might
6758                                                             # be better to use
6759                                                             # add_alias()
6760                                                             # instead for same
6761                                                             # property
6762                      && ! $other->perl_extension)
6763             {
6764                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
6765                 $related = 0;
6766             }
6767         }
6768
6769         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6770             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
6771             return;
6772         }
6773
6774         my $leader = do { no overloading; pack 'J', $current_leader; };
6775         my $other_addr = do { no overloading; pack 'J', $other; };
6776
6777         # Any tables that are equivalent to or children of this table must now
6778         # instead be equivalent to or (children) to the new leader (parent),
6779         # still equivalent.  The equivalency includes their matches_all info,
6780         # and for related tables, their fate and status.
6781         # All related tables are of necessity equivalent, but the converse
6782         # isn't necessarily true
6783         my $status = $other->status;
6784         my $status_info = $other->status_info;
6785         my $fate = $other->fate;
6786         my $matches_all = $matches_all{other_addr};
6787         my $caseless_equivalent = $other->caseless_equivalent;
6788         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6789             next if $table == $other;
6790             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6791
6792             my $table_addr = do { no overloading; pack 'J', $table; };
6793             $leader{$table_addr} = $other;
6794             $matches_all{$table_addr} = $matches_all;
6795             $self->_set_range_list($other->_range_list);
6796             push @{$equivalents{$other_addr}}, $table;
6797             if ($related) {
6798                 $parent{$table_addr} = $other;
6799                 push @{$children{$other_addr}}, $table;
6800                 $table->set_status($status, $status_info);
6801
6802                 # This reason currently doesn't get exposed outside; otherwise
6803                 # would have to look up the parent's reason and use it instead.
6804                 $table->set_fate($fate, "Parent's fate");
6805
6806                 $self->set_caseless_equivalent($caseless_equivalent);
6807             }
6808         }
6809
6810         # Now that we've declared these to be equivalent, any changes to one
6811         # of the tables would invalidate that equivalency.
6812         $self->lock;
6813         $other->lock;
6814         return;
6815     }
6816
6817     sub set_complement {
6818         # Set $self to be the complement of the parameter table.  $self is
6819         # locked, as what it contains should all come from the other table.
6820
6821         my $self = shift;
6822         my $other = shift;
6823
6824         my %args = @_;
6825         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6826
6827         if ($other->complement != 0) {
6828             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
6829             return;
6830         }
6831         my $addr = do { no overloading; pack 'J', $self; };
6832         $complement{$addr} = $other;
6833         $self->lock;
6834         return;
6835     }
6836
6837     sub add_range { # Add a range to the list for this table.
6838         my $self = shift;
6839         # Rest of parameters passed on
6840
6841         return if $self->carp_if_locked;
6842         return $self->_range_list->add_range(@_);
6843     }
6844
6845     sub header {
6846         my $self = shift;
6847         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6848
6849         # All match tables are to be used only by the Perl core.
6850         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
6851     }
6852
6853     sub pre_body {  # Does nothing for match tables.
6854         return
6855     }
6856
6857     sub append_to_body {  # Does nothing for match tables.
6858         return
6859     }
6860
6861     sub set_fate {
6862         my $self = shift;
6863         my $fate = shift;
6864         my $reason = shift;
6865         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6866
6867         $self->SUPER::set_fate($fate, $reason);
6868
6869         # All children share this fate
6870         foreach my $child ($self->children) {
6871             $child->set_fate($fate, $reason);
6872         }
6873         return;
6874     }
6875
6876     sub write {
6877         my $self = shift;
6878         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6879
6880         return $self->SUPER::write(2); # 2 tab stops
6881     }
6882
6883     sub set_final_comment {
6884         # This creates a comment for the file that is to hold the match table
6885         # $self.  It is somewhat convoluted to make the English read nicely,
6886         # but, heh, it's just a comment.
6887         # This should be called only with the leader match table of all the
6888         # ones that share the same file.  It lists all such tables, ordered so
6889         # that related ones are together.
6890
6891         return unless $debugging_build;
6892
6893         my $leader = shift;   # Should only be called on the leader table of
6894                               # an equivalent group
6895         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6896
6897         my $addr = do { no overloading; pack 'J', $leader; };
6898
6899         if ($leader{$addr} != $leader) {
6900             Carp::my_carp_bug(<<END
6901 set_final_comment() must be called on a leader table, which $leader is not.
6902 It is equivalent to $leader{$addr}.  No comment created
6903 END
6904             );
6905             return;
6906         }
6907
6908         # Get the number of code points matched by each of the tables in this
6909         # file, and add underscores for clarity.
6910         my $count = $leader->count;
6911         my $string_count = main::clarify_number($count);
6912
6913         my $loose_count = 0;        # how many aliases loosely matched
6914         my $compound_name = "";     # ? Are any names compound?, and if so, an
6915                                     # example
6916         my $properties_with_compound_names = 0;    # count of these
6917
6918
6919         my %flags;              # The status flags used in the file
6920         my $total_entries = 0;  # number of entries written in the comment
6921         my $matches_comment = ""; # The portion of the comment about the
6922                                   # \p{}'s
6923         my @global_comments;    # List of all the tables' comments that are
6924                                 # there before this routine was called.
6925
6926         # Get list of all the parent tables that are equivalent to this one
6927         # (including itself).
6928         my @parents = grep { $parent{main::objaddr $_} == $_ }
6929                             main::uniques($leader, @{$equivalents{$addr}});
6930         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
6931                                               # tables
6932
6933         for my $parent (@parents) {
6934
6935             my $property = $parent->property;
6936
6937             # Special case 'N' tables in properties with two match tables when
6938             # the other is a 'Y' one.  These are likely to be binary tables,
6939             # but not necessarily.  In either case, \P{} will match the
6940             # complement of \p{}, and so if something is a synonym of \p, the
6941             # complement of that something will be the synonym of \P.  This
6942             # would be true of any property with just two match tables, not
6943             # just those whose values are Y and N; but that would require a
6944             # little extra work, and there are none such so far in Unicode.
6945             my $perl_p = 'p';        # which is it?  \p{} or \P{}
6946             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
6947
6948             if (scalar $property->tables == 2
6949                 && $parent == $property->table('N')
6950                 && defined (my $yes = $property->table('Y')))
6951             {
6952                 my $yes_addr = do { no overloading; pack 'J', $yes; };
6953                 @yes_perl_synonyms
6954                     = grep { $_->property == $perl }
6955                                     main::uniques($yes,
6956                                                 $parent{$yes_addr},
6957                                                 $parent{$yes_addr}->children);
6958
6959                 # But these synonyms are \P{} ,not \p{}
6960                 $perl_p = 'P';
6961             }
6962
6963             my @description;        # Will hold the table description
6964             my @note;               # Will hold the table notes.
6965             my @conflicting;        # Will hold the table conflicts.
6966
6967             # Look at the parent, any yes synonyms, and all the children
6968             my $parent_addr = do { no overloading; pack 'J', $parent; };
6969             for my $table ($parent,
6970                            @yes_perl_synonyms,
6971                            @{$children{$parent_addr}})
6972             {
6973                 my $table_addr = do { no overloading; pack 'J', $table; };
6974                 my $table_property = $table->property;
6975
6976                 # Tables are separated by a blank line to create a grouping.
6977                 $matches_comment .= "\n" if $matches_comment;
6978
6979                 # The table is named based on the property and value
6980                 # combination it is for, like script=greek.  But there may be
6981                 # a number of synonyms for each side, like 'sc' for 'script',
6982                 # and 'grek' for 'greek'.  Any combination of these is a valid
6983                 # name for this table.  In this case, there are three more,
6984                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
6985                 # listing all possible combinations in the comment, we make
6986                 # sure that each synonym occurs at least once, and add
6987                 # commentary that the other combinations are possible.
6988                 # Because regular expressions don't recognize things like
6989                 # \p{jsn=}, only look at non-null right-hand-sides
6990                 my @property_aliases = $table_property->aliases;
6991                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
6992
6993                 # The alias lists above are already ordered in the order we
6994                 # want to output them.  To ensure that each synonym is listed,
6995                 # we must use the max of the two numbers.  But if there are no
6996                 # legal synonyms (nothing in @table_aliases), then we don't
6997                 # list anything.
6998                 my $listed_combos = (@table_aliases)
6999                                     ?  main::max(scalar @table_aliases,
7000                                                  scalar @property_aliases)
7001                                     : 0;
7002                 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7003
7004
7005                 my $property_had_compound_name = 0;
7006
7007                 for my $i (0 .. $listed_combos - 1) {
7008                     $total_entries++;
7009
7010                     # The current alias for the property is the next one on
7011                     # the list, or if beyond the end, start over.  Similarly
7012                     # for the table (\p{prop=table})
7013                     my $property_alias = $property_aliases
7014                                             [$i % @property_aliases]->name;
7015                     my $table_alias_object = $table_aliases
7016                                                         [$i % @table_aliases];
7017                     my $table_alias = $table_alias_object->name;
7018                     my $loose_match = $table_alias_object->loose_match;
7019
7020                     if ($table_alias !~ /\D/) { # Clarify large numbers.
7021                         $table_alias = main::clarify_number($table_alias)
7022                     }
7023
7024                     # Add a comment for this alias combination
7025                     my $current_match_comment;
7026                     if ($table_property == $perl) {
7027                         $current_match_comment = "\\$perl_p"
7028                                                     . "{$table_alias}";
7029                     }
7030                     else {
7031                         $current_match_comment
7032                                         = "\\p{$property_alias=$table_alias}";
7033                         $property_had_compound_name = 1;
7034                     }
7035
7036                     # Flag any abnormal status for this table.
7037                     my $flag = $property->status
7038                                 || $table->status
7039                                 || $table_alias_object->status;
7040                     $flags{$flag} = $status_past_participles{$flag} if $flag;
7041
7042                     $loose_count++;
7043
7044                     # Pretty up the comment.  Note the \b; it says don't make
7045                     # this line a continuation.
7046                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
7047                                         $flag,
7048                                         " " x 7,
7049                                         $current_match_comment);
7050                 } # End of generating the entries for this table.
7051
7052                 # Save these for output after this group of related tables.
7053                 push @description, $table->description;
7054                 push @note, $table->note;
7055                 push @conflicting, $table->conflicting;
7056
7057                 # And this for output after all the tables.
7058                 push @global_comments, $table->comment;
7059
7060                 # Compute an alternate compound name using the final property
7061                 # synonym and the first table synonym with a colon instead of
7062                 # the equal sign used elsewhere.
7063                 if ($property_had_compound_name) {
7064                     $properties_with_compound_names ++;
7065                     if (! $compound_name || @property_aliases > 1) {
7066                         $compound_name = $property_aliases[-1]->name
7067                                         . ': '
7068                                         . $table_aliases[0]->name;
7069                     }
7070                 }
7071             } # End of looping through all children of this table
7072
7073             # Here have assembled in $matches_comment all the related tables
7074             # to the current parent (preceded by the same info for all the
7075             # previous parents).  Put out information that applies to all of
7076             # the current family.
7077             if (@conflicting) {
7078
7079                 # But output the conflicting information now, as it applies to
7080                 # just this table.
7081                 my $conflicting = join ", ", @conflicting;
7082                 if ($conflicting) {
7083                     $matches_comment .= <<END;
7084
7085     Note that contrary to what you might expect, the above is NOT the same as
7086 END
7087                     $matches_comment .= "any of: " if @conflicting > 1;
7088                     $matches_comment .= "$conflicting\n";
7089                 }
7090             }
7091             if (@description) {
7092                 $matches_comment .= "\n    Meaning: "
7093                                     . join('; ', @description)
7094                                     . "\n";
7095             }
7096             if (@note) {
7097                 $matches_comment .= "\n    Note: "
7098                                     . join("\n    ", @note)
7099                                     . "\n";
7100             }
7101         } # End of looping through all tables
7102
7103
7104         my $code_points;
7105         my $match;
7106         my $any_of_these;
7107         if ($count == 1) {
7108             $match = 'matches';
7109             $code_points = 'single code point';
7110         }
7111         else {
7112             $match = 'match';
7113             $code_points = "$string_count code points";
7114         }
7115
7116         my $synonyms;
7117         my $entries;
7118         if ($total_entries == 1) {
7119             $synonyms = "";
7120             $entries = 'entry';
7121             $any_of_these = 'this'
7122         }
7123         else {
7124             $synonyms = " any of the following regular expression constructs";
7125             $entries = 'entries';
7126             $any_of_these = 'any of these'
7127         }
7128
7129         my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7130         if ($has_unrelated) {
7131             $comment .= <<END;
7132 This file is for tables that are not necessarily related:  To conserve
7133 resources, every table that matches the identical set of code points in this
7134 version of Unicode uses this file.  Each one is listed in a separate group
7135 below.  It could be that the tables will match the same set of code points in
7136 other Unicode releases, or it could be purely coincidence that they happen to
7137 be the same in Unicode $string_version, and hence may not in other versions.
7138
7139 END
7140         }
7141
7142         if (%flags) {
7143             foreach my $flag (sort keys %flags) {
7144                 $comment .= <<END;
7145 '$flag' below means that this form is $flags{$flag}.
7146 Consult $pod_file.pod
7147 END
7148             }
7149             $comment .= "\n";
7150         }
7151
7152         if ($total_entries == 0) {
7153             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
7154             $comment .= <<END;
7155 This file returns the $code_points in Unicode Version $string_version for
7156 $leader, but it is inaccessible through Perl regular expressions, as
7157 "\\p{prop=}" is not recognized.
7158 END
7159
7160         } else {
7161             $comment .= <<END;
7162 This file returns the $code_points in Unicode Version $string_version that
7163 $match$synonyms:
7164
7165 $matches_comment
7166 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7167 including if adding or subtracting white space, underscore, and hyphen
7168 characters matters or doesn't matter, and other permissible syntactic
7169 variants.  Upper/lower case distinctions never matter.
7170 END
7171
7172         }
7173         if ($compound_name) {
7174             $comment .= <<END;
7175
7176 A colon can be substituted for the equals sign, and
7177 END
7178             if ($properties_with_compound_names > 1) {
7179                 $comment .= <<END;
7180 within each group above,
7181 END
7182             }
7183             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7184
7185             # Note the \b below, it says don't make that line a continuation.
7186             $comment .= <<END;
7187 anything to the left of the equals (or colon) can be combined with anything to
7188 the right.  Thus, for example,
7189 $compound_name
7190 \bis also valid.
7191 END
7192         }
7193
7194         # And append any comment(s) from the actual tables.  They are all
7195         # gathered here, so may not read all that well.
7196         if (@global_comments) {
7197             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7198         }
7199
7200         if ($count) {   # The format differs if no code points, and needs no
7201                         # explanation in that case
7202                 $comment.= <<END;
7203
7204 The format of the lines of this file is:
7205 END
7206             $comment.= <<END;
7207 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7208 STOP is the ending point, or if omitted, the range has just one code point.
7209 END
7210             if ($leader->output_range_counts) {
7211                 $comment .= <<END;
7212 Numbers in comments in [brackets] indicate how many code points are in the
7213 range.
7214 END
7215             }
7216         }
7217
7218         $leader->set_comment(main::join_lines($comment));
7219         return;
7220     }
7221
7222     # Accessors for the underlying list
7223     for my $sub (qw(
7224                     get_valid_code_point
7225                     get_invalid_code_point
7226                 ))
7227     {
7228         no strict "refs";
7229         *$sub = sub {
7230             use strict "refs";
7231             my $self = shift;
7232
7233             return $self->_range_list->$sub(@_);
7234         }
7235     }
7236 } # End closure for Match_Table
7237
7238 package Property;
7239
7240 # The Property class represents a Unicode property, or the $perl
7241 # pseudo-property.  It contains a map table initialized empty at construction
7242 # time, and for properties accessible through regular expressions, various
7243 # match tables, created through the add_match_table() method, and referenced
7244 # by the table('NAME') or tables() methods, the latter returning a list of all
7245 # of the match tables.  Otherwise table operations implicitly are for the map
7246 # table.
7247 #
7248 # Most of the data in the property is actually about its map table, so it
7249 # mostly just uses that table's accessors for most methods.  The two could
7250 # have been combined into one object, but for clarity because of their
7251 # differing semantics, they have been kept separate.  It could be argued that
7252 # the 'file' and 'directory' fields should be kept with the map table.
7253 #
7254 # Each property has a type.  This can be set in the constructor, or in the
7255 # set_type accessor, but mostly it is figured out by the data.  Every property
7256 # starts with unknown type, overridden by a parameter to the constructor, or
7257 # as match tables are added, or ranges added to the map table, the data is
7258 # inspected, and the type changed.  After the table is mostly or entirely
7259 # filled, compute_type() should be called to finalize they analysis.
7260 #
7261 # There are very few operations defined.  One can safely remove a range from
7262 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7263 # table to this one, replacing any in the intersection of the two.
7264
7265 sub standardize { return main::standardize($_[0]); }
7266 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7267
7268 {   # Closure
7269
7270     # This hash will contain as keys, all the aliases of all properties, and
7271     # as values, pointers to their respective property objects.  This allows
7272     # quick look-up of a property from any of its names.
7273     my %alias_to_property_of;
7274
7275     sub dump_alias_to_property_of {
7276         # For debugging
7277
7278         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7279         return;
7280     }
7281
7282     sub property_ref {
7283         # This is a package subroutine, not called as a method.
7284         # If the single parameter is a literal '*' it returns a list of all
7285         # defined properties.
7286         # Otherwise, the single parameter is a name, and it returns a pointer
7287         # to the corresponding property object, or undef if none.
7288         #
7289         # Properties can have several different names.  The 'standard' form of
7290         # each of them is stored in %alias_to_property_of as they are defined.
7291         # But it's possible that this subroutine will be called with some
7292         # variant, so if the initial lookup fails, it is repeated with the
7293         # standardized form of the input name.  If found, besides returning the
7294         # result, the input name is added to the list so future calls won't
7295         # have to do the conversion again.
7296
7297         my $name = shift;
7298
7299         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7300
7301         if (! defined $name) {
7302             Carp::my_carp_bug("Undefined input property.  No action taken.");
7303             return;
7304         }
7305
7306         return main::uniques(values %alias_to_property_of) if $name eq '*';
7307
7308         # Return cached result if have it.
7309         my $result = $alias_to_property_of{$name};
7310         return $result if defined $result;
7311
7312         # Convert the input to standard form.
7313         my $standard_name = standardize($name);
7314
7315         $result = $alias_to_property_of{$standard_name};
7316         return unless defined $result;        # Don't cache undefs
7317
7318         # Cache the result before returning it.
7319         $alias_to_property_of{$name} = $result;
7320         return $result;
7321     }
7322
7323
7324     main::setup_package();
7325
7326     my %map;
7327     # A pointer to the map table object for this property
7328     main::set_access('map', \%map);
7329
7330     my %full_name;
7331     # The property's full name.  This is a duplicate of the copy kept in the
7332     # map table, but is needed because stringify needs it during
7333     # construction of the map table, and then would have a chicken before egg
7334     # problem.
7335     main::set_access('full_name', \%full_name, 'r');
7336
7337     my %table_ref;
7338     # This hash will contain as keys, all the aliases of any match tables
7339     # attached to this property, and as values, the pointers to their
7340     # respective tables.  This allows quick look-up of a table from any of its
7341     # names.
7342     main::set_access('table_ref', \%table_ref);
7343
7344     my %type;
7345     # The type of the property, $ENUM, $BINARY, etc
7346     main::set_access('type', \%type, 'r');
7347
7348     my %file;
7349     # The filename where the map table will go (if actually written).
7350     # Normally defaulted, but can be overridden.
7351     main::set_access('file', \%file, 'r', 's');
7352
7353     my %directory;
7354     # The directory where the map table will go (if actually written).
7355     # Normally defaulted, but can be overridden.
7356     main::set_access('directory', \%directory, 's');
7357
7358     my %pseudo_map_type;
7359     # This is used to affect the calculation of the map types for all the
7360     # ranges in the table.  It should be set to one of the values that signify
7361     # to alter the calculation.
7362     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7363
7364     my %has_only_code_point_maps;
7365     # A boolean used to help in computing the type of data in the map table.
7366     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7367
7368     my %unique_maps;
7369     # A list of the first few distinct mappings this property has.  This is
7370     # used to disambiguate between binary and enum property types, so don't
7371     # have to keep more than three.
7372     main::set_access('unique_maps', \%unique_maps);
7373
7374     my %pre_declared_maps;
7375     # A boolean that gives whether the input data should declare all the
7376     # tables used, or not.  If the former, unknown ones raise a warning.
7377     main::set_access('pre_declared_maps',
7378                                     \%pre_declared_maps, 'r', 's');
7379
7380     sub new {
7381         # The only required parameter is the positionally first, name.  All
7382         # other parameters are key => value pairs.  See the documentation just
7383         # above for the meanings of the ones not passed directly on to the map
7384         # table constructor.
7385
7386         my $class = shift;
7387         my $name = shift || "";
7388
7389         my $self = property_ref($name);
7390         if (defined $self) {
7391             my $options_string = join ", ", @_;
7392             $options_string = ".  Ignoring options $options_string" if $options_string;
7393             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
7394             return $self;
7395         }
7396
7397         my %args = @_;
7398
7399         $self = bless \do { my $anonymous_scalar }, $class;
7400         my $addr = do { no overloading; pack 'J', $self; };
7401
7402         $directory{$addr} = delete $args{'Directory'};
7403         $file{$addr} = delete $args{'File'};
7404         $full_name{$addr} = delete $args{'Full_Name'} || $name;
7405         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7406         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7407         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7408                                     # Starting in this release, property
7409                                     # values should be defined for all
7410                                     # properties, except those overriding this
7411                                     // $v_version ge v5.1.0;
7412
7413         # Rest of parameters passed on.
7414
7415         $has_only_code_point_maps{$addr} = 1;
7416         $table_ref{$addr} = { };
7417         $unique_maps{$addr} = { };
7418
7419         $map{$addr} = Map_Table->new($name,
7420                                     Full_Name => $full_name{$addr},
7421                                     _Alias_Hash => \%alias_to_property_of,
7422                                     _Property => $self,
7423                                     %args);
7424         return $self;
7425     }
7426
7427     # See this program's beginning comment block about overloading the copy
7428     # constructor.  Few operations are defined on properties, but a couple are
7429     # useful.  It is safe to take the inverse of a property, and to remove a
7430     # single code point from it.
7431     use overload
7432         fallback => 0,
7433         qw("") => "_operator_stringify",
7434         "." => \&main::_operator_dot,
7435         '==' => \&main::_operator_equal,
7436         '!=' => \&main::_operator_not_equal,
7437         '=' => sub { return shift },
7438         '-=' => "_minus_and_equal",
7439     ;
7440
7441     sub _operator_stringify {
7442         return "Property '" .  shift->full_name . "'";
7443     }
7444
7445     sub _minus_and_equal {
7446         # Remove a single code point from the map table of a property.
7447
7448         my $self = shift;
7449         my $other = shift;
7450         my $reversed = shift;
7451         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7452
7453         if (ref $other) {
7454             Carp::my_carp_bug("Can't cope with a "
7455                         . ref($other)
7456                         . " argument to '-='.  Subtraction ignored.");
7457             return $self;
7458         }
7459         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
7460             Carp::my_carp_bug("Can't cope with a "
7461             .  __PACKAGE__
7462             . " being the first parameter in a '-='.  Subtraction ignored.");
7463             return $self;
7464         }
7465         else {
7466             no overloading;
7467             $map{pack 'J', $self}->delete_range($other, $other);
7468         }
7469         return $self;
7470     }
7471
7472     sub add_match_table {
7473         # Add a new match table for this property, with name given by the
7474         # parameter.  It returns a pointer to the table.
7475
7476         my $self = shift;
7477         my $name = shift;
7478         my %args = @_;
7479
7480         my $addr = do { no overloading; pack 'J', $self; };
7481
7482         my $table = $table_ref{$addr}{$name};
7483         my $standard_name = main::standardize($name);
7484         if (defined $table
7485             || (defined ($table = $table_ref{$addr}{$standard_name})))
7486         {
7487             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
7488             $table_ref{$addr}{$name} = $table;
7489             return $table;
7490         }
7491         else {
7492
7493             # See if this is a perl extension, if not passed in.
7494             my $perl_extension = delete $args{'Perl_Extension'};
7495             $perl_extension
7496                         = $self->perl_extension if ! defined $perl_extension;
7497
7498             $table = Match_Table->new(
7499                                 Name => $name,
7500                                 Perl_Extension => $perl_extension,
7501                                 _Alias_Hash => $table_ref{$addr},
7502                                 _Property => $self,
7503
7504                                 # gets property's fate and status by default
7505                                 Fate => $self->fate,
7506                                 Status => $self->status,
7507                                 _Status_Info => $self->status_info,
7508                                 %args);
7509             return unless defined $table;
7510         }
7511
7512         # Save the names for quick look up
7513         $table_ref{$addr}{$standard_name} = $table;
7514         $table_ref{$addr}{$name} = $table;
7515
7516         # Perhaps we can figure out the type of this property based on the
7517         # fact of adding this match table.  First, string properties don't
7518         # have match tables; second, a binary property can't have 3 match
7519         # tables
7520         if ($type{$addr} == $UNKNOWN) {
7521             $type{$addr} = $NON_STRING;
7522         }
7523         elsif ($type{$addr} == $STRING) {
7524             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
7525             $type{$addr} = $NON_STRING;
7526         }
7527         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7528             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7529                 && $type{$addr} == $BINARY)
7530             {
7531                 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
7532                 $type{$addr} = $ENUM;
7533             }
7534         }
7535
7536         return $table;
7537     }
7538
7539     sub delete_match_table {
7540         # Delete the table referred to by $2 from the property $1.
7541
7542         my $self = shift;
7543         my $table_to_remove = shift;
7544         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7545
7546         my $addr = do { no overloading; pack 'J', $self; };
7547
7548         # Remove all names that refer to it.
7549         foreach my $key (keys %{$table_ref{$addr}}) {
7550             delete $table_ref{$addr}{$key}
7551                                 if $table_ref{$addr}{$key} == $table_to_remove;
7552         }
7553
7554         $table_to_remove->DESTROY;
7555         return;
7556     }
7557
7558     sub table {
7559         # Return a pointer to the match table (with name given by the
7560         # parameter) associated with this property; undef if none.
7561
7562         my $self = shift;
7563         my $name = shift;
7564         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7565
7566         my $addr = do { no overloading; pack 'J', $self; };
7567
7568         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7569
7570         # If quick look-up failed, try again using the standard form of the
7571         # input name.  If that succeeds, cache the result before returning so
7572         # won't have to standardize this input name again.
7573         my $standard_name = main::standardize($name);
7574         return unless defined $table_ref{$addr}{$standard_name};
7575
7576         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7577         return $table_ref{$addr}{$name};
7578     }
7579
7580     sub tables {
7581         # Return a list of pointers to all the match tables attached to this
7582         # property
7583
7584         no overloading;
7585         return main::uniques(values %{$table_ref{pack 'J', shift}});
7586     }
7587
7588     sub directory {
7589         # Returns the directory the map table for this property should be
7590         # output in.  If a specific directory has been specified, that has
7591         # priority;  'undef' is returned if the type isn't defined;
7592         # or $map_directory for everything else.
7593
7594         my $addr = do { no overloading; pack 'J', shift; };
7595
7596         return $directory{$addr} if defined $directory{$addr};
7597         return undef if $type{$addr} == $UNKNOWN;
7598         return $map_directory;
7599     }
7600
7601     sub swash_name {
7602         # Return the name that is used to both:
7603         #   1)  Name the file that the map table is written to.
7604         #   2)  The name of swash related stuff inside that file.
7605         # The reason for this is that the Perl core historically has used
7606         # certain names that aren't the same as the Unicode property names.
7607         # To continue using these, $file is hard-coded in this file for those,
7608         # but otherwise the standard name is used.  This is different from the
7609         # external_name, so that the rest of the files, like in lib can use
7610         # the standard name always, without regard to historical precedent.
7611
7612         my $self = shift;
7613         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7614
7615         my $addr = do { no overloading; pack 'J', $self; };
7616
7617         # Swash names are used only on regular map tables; otherwise there
7618         # should be no access to the property map table from other parts of
7619         # Perl.
7620         return if $map{$addr}->fate != $ORDINARY;
7621
7622         return $file{$addr} if defined $file{$addr};
7623         return $map{$addr}->external_name;
7624     }
7625
7626     sub to_create_match_tables {
7627         # Returns a boolean as to whether or not match tables should be
7628         # created for this property.
7629
7630         my $self = shift;
7631         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7632
7633         # The whole point of this pseudo property is match tables.
7634         return 1 if $self == $perl;
7635
7636         my $addr = do { no overloading; pack 'J', $self; };
7637
7638         # Don't generate tables of code points that match the property values
7639         # of a string property.  Such a list would most likely have many
7640         # property values, each with just one or very few code points mapping
7641         # to it.
7642         return 0 if $type{$addr} == $STRING;
7643
7644         # Don't generate anything for unimplemented properties.
7645         return 0 if grep { $self->complete_name eq $_ }
7646                                                     @unimplemented_properties;
7647         # Otherwise, do.
7648         return 1;
7649     }
7650
7651     sub property_add_or_replace_non_nulls {
7652         # This adds the mappings in the property $other to $self.  Non-null
7653         # mappings from $other override those in $self.  It essentially merges
7654         # the two properties, with the second having priority except for null
7655         # mappings.
7656
7657         my $self = shift;
7658         my $other = shift;
7659         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7660
7661         if (! $other->isa(__PACKAGE__)) {
7662             Carp::my_carp_bug("$other should be a "
7663                             . __PACKAGE__
7664                             . ".  Not a '"
7665                             . ref($other)
7666                             . "'.  Not added;");
7667             return;
7668         }
7669
7670         no overloading;
7671         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
7672     }
7673
7674     sub set_proxy_for {
7675         # Certain tables are not generally written out to files, but
7676         # Unicode::UCD has the intelligence to know that the file for $self
7677         # can be used to reconstruct those tables.  This routine just changes
7678         # things so that UCD pod entries for those suppressed tables are
7679         # generated, so the fact that a proxy is used is invisible to the
7680         # user.
7681
7682         my $self = shift;
7683
7684         foreach my $property_name (@_) {
7685             my $ref = property_ref($property_name);
7686             next if $ref->to_output_map;
7687             $ref->set_fate($MAP_PROXIED);
7688         }
7689     }
7690
7691     sub set_type {
7692         # Set the type of the property.  Mostly this is figured out by the
7693         # data in the table.  But this is used to set it explicitly.  The
7694         # reason it is not a standard accessor is that when setting a binary
7695         # property, we need to make sure that all the true/false aliases are
7696         # present, as they were omitted in early Unicode releases.
7697
7698         my $self = shift;
7699         my $type = shift;
7700         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7701
7702         if ($type != $ENUM
7703             && $type != $BINARY
7704             && $type != $FORCED_BINARY
7705             && $type != $STRING)
7706         {
7707             Carp::my_carp("Unrecognized type '$type'.  Type not set");
7708             return;
7709         }
7710
7711         { no overloading; $type{pack 'J', $self} = $type; }
7712         return if $type != $BINARY && $type != $FORCED_BINARY;
7713
7714         my $yes = $self->table('Y');
7715         $yes = $self->table('Yes') if ! defined $yes;
7716         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7717                                                             if ! defined $yes;
7718
7719         # Add aliases in order wanted, duplicates will be ignored.  We use a
7720         # binary property present in all releases for its ordered lists of
7721         # true/false aliases.  Note, that could run into problems in
7722         # outputting things in that we don't distinguish between the name and
7723         # full name of these.  Hopefully, if the table was already created
7724         # before this code is executed, it was done with these set properly.
7725         my $bm = property_ref("Bidi_Mirrored");
7726         foreach my $alias ($bm->table("Y")->aliases) {
7727             $yes->add_alias($alias->name);
7728         }
7729         my $no = $self->table('N');
7730         $no = $self->table('No') if ! defined $no;
7731         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
7732         foreach my $alias ($bm->table("N")->aliases) {
7733             $no->add_alias($alias->name);
7734         }
7735
7736         return;
7737     }
7738
7739     sub add_map {
7740         # Add a map to the property's map table.  This also keeps
7741         # track of the maps so that the property type can be determined from
7742         # its data.
7743
7744         my $self = shift;
7745         my $start = shift;  # First code point in range
7746         my $end = shift;    # Final code point in range
7747         my $map = shift;    # What the range maps to.
7748         # Rest of parameters passed on.
7749
7750         my $addr = do { no overloading; pack 'J', $self; };
7751
7752         # If haven't the type of the property, gather information to figure it
7753         # out.
7754         if ($type{$addr} == $UNKNOWN) {
7755
7756             # If the map contains an interior blank or dash, or most other
7757             # nonword characters, it will be a string property.  This
7758             # heuristic may actually miss some string properties.  If so, they
7759             # may need to have explicit set_types called for them.  This
7760             # happens in the Unihan properties.
7761             if ($map =~ / (?<= . ) [ -] (?= . ) /x
7762                 || $map =~ / [^\w.\/\ -]  /x)
7763             {
7764                 $self->set_type($STRING);
7765
7766                 # $unique_maps is used for disambiguating between ENUM and
7767                 # BINARY later; since we know the property is not going to be
7768                 # one of those, no point in keeping the data around
7769                 undef $unique_maps{$addr};
7770             }
7771             else {
7772
7773                 # Not necessarily a string.  The final decision has to be
7774                 # deferred until all the data are in.  We keep track of if all
7775                 # the values are code points for that eventual decision.
7776                 $has_only_code_point_maps{$addr} &=
7777                                             $map =~ / ^ $code_point_re $/x;
7778
7779                 # For the purposes of disambiguating between binary and other
7780                 # enumerations at the end, we keep track of the first three
7781                 # distinct property values.  Once we get to three, we know
7782                 # it's not going to be binary, so no need to track more.
7783                 if (scalar keys %{$unique_maps{$addr}} < 3) {
7784                     $unique_maps{$addr}{main::standardize($map)} = 1;
7785                 }
7786             }
7787         }
7788
7789         # Add the mapping by calling our map table's method
7790         return $map{$addr}->add_map($start, $end, $map, @_);
7791     }
7792
7793     sub compute_type {
7794         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
7795         # should be called after the property is mostly filled with its maps.
7796         # We have been keeping track of what the property values have been,
7797         # and now have the necessary information to figure out the type.
7798
7799         my $self = shift;
7800         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7801
7802         my $addr = do { no overloading; pack 'J', $self; };
7803
7804         my $type = $type{$addr};
7805
7806         # If already have figured these out, no need to do so again, but we do
7807         # a double check on ENUMS to make sure that a string property hasn't
7808         # improperly been classified as an ENUM, so continue on with those.
7809         return if $type == $STRING
7810                   || $type == $BINARY
7811                   || $type == $FORCED_BINARY;
7812
7813         # If every map is to a code point, is a string property.
7814         if ($type == $UNKNOWN
7815             && ($has_only_code_point_maps{$addr}
7816                 || (defined $map{$addr}->default_map
7817                     && $map{$addr}->default_map eq "")))
7818         {
7819             $self->set_type($STRING);
7820         }
7821         else {
7822
7823             # Otherwise, it is to some sort of enumeration.  (The case where
7824             # it is a Unicode miscellaneous property, and treated like a
7825             # string in this program is handled in add_map()).  Distinguish
7826             # between binary and some other enumeration type.  Of course, if
7827             # there are more than two values, it's not binary.  But more
7828             # subtle is the test that the default mapping is defined means it
7829             # isn't binary.  This in fact may change in the future if Unicode
7830             # changes the way its data is structured.  But so far, no binary
7831             # properties ever have @missing lines for them, so the default map
7832             # isn't defined for them.  The few properties that are two-valued
7833             # and aren't considered binary have the default map defined
7834             # starting in Unicode 5.0, when the @missing lines appeared; and
7835             # this program has special code to put in a default map for them
7836             # for earlier than 5.0 releases.
7837             if ($type == $ENUM
7838                 || scalar keys %{$unique_maps{$addr}} > 2
7839                 || defined $self->default_map)
7840             {
7841                 my $tables = $self->tables;
7842                 my $count = $self->count;
7843                 if ($verbosity && $count > 500 && $tables/$count > .1) {
7844                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7845                 }
7846                 $self->set_type($ENUM);
7847             }
7848             else {
7849                 $self->set_type($BINARY);
7850             }
7851         }
7852         undef $unique_maps{$addr};  # Garbage collect
7853         return;
7854     }
7855
7856     sub set_fate {
7857         my $self = shift;
7858         my $fate = shift;
7859         my $reason = shift;  # Ignored unless suppressing
7860         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7861
7862         my $addr = do { no overloading; pack 'J', $self; };
7863         if ($fate == $SUPPRESSED) {
7864             $why_suppressed{$self->complete_name} = $reason;
7865         }
7866
7867         # Each table shares the property's fate, except that MAP_PROXIED
7868         # doesn't affect match tables
7869         $map{$addr}->set_fate($fate, $reason);
7870         if ($fate != $MAP_PROXIED) {
7871             foreach my $table ($map{$addr}, $self->tables) {
7872                 $table->set_fate($fate, $reason);
7873             }
7874         }
7875         return;
7876     }
7877
7878
7879     # Most of the accessors for a property actually apply to its map table.
7880     # Setup up accessor functions for those, referring to %map
7881     for my $sub (qw(
7882                     add_alias
7883                     add_anomalous_entry
7884                     add_comment
7885                     add_conflicting
7886                     add_description
7887                     add_duplicate
7888                     add_note
7889                     aliases
7890                     comment
7891                     complete_name
7892                     containing_range
7893                     count
7894                     default_map
7895                     delete_range
7896                     description
7897                     each_range
7898                     external_name
7899                     fate
7900                     file_path
7901                     format
7902                     initialize
7903                     inverse_list
7904                     is_empty
7905                     name
7906                     note
7907                     perl_extension
7908                     property
7909                     range_count
7910                     ranges
7911                     range_size_1
7912                     reset_each_range
7913                     set_comment
7914                     set_default_map
7915                     set_file_path
7916                     set_final_comment
7917                     _set_format
7918                     set_range_size_1
7919                     set_status
7920                     set_to_output_map
7921                     short_name
7922                     status
7923                     status_info
7924                     to_output_map
7925                     type_of
7926                     value_of
7927                     write
7928                 ))
7929                     # 'property' above is for symmetry, so that one can take
7930                     # the property of a property and get itself, and so don't
7931                     # have to distinguish between properties and tables in
7932                     # calling code
7933     {
7934         no strict "refs";
7935         *$sub = sub {
7936             use strict "refs";
7937             my $self = shift;
7938             no overloading;
7939             return $map{pack 'J', $self}->$sub(@_);
7940         }
7941     }
7942
7943
7944 } # End closure
7945
7946 package main;
7947
7948 sub join_lines($) {
7949     # Returns lines of the input joined together, so that they can be folded
7950     # properly.
7951     # This causes continuation lines to be joined together into one long line
7952     # for folding.  A continuation line is any line that doesn't begin with a
7953     # space or "\b" (the latter is stripped from the output).  This is so
7954     # lines can be be in a HERE document so as to fit nicely in the terminal
7955     # width, but be joined together in one long line, and then folded with
7956     # indents, '#' prefixes, etc, properly handled.
7957     # A blank separates the joined lines except if there is a break; an extra
7958     # blank is inserted after a period ending a line.
7959
7960     # Initialize the return with the first line.
7961     my ($return, @lines) = split "\n", shift;
7962
7963     # If the first line is null, it was an empty line, add the \n back in
7964     $return = "\n" if $return eq "";
7965
7966     # Now join the remainder of the physical lines.
7967     for my $line (@lines) {
7968
7969         # An empty line means wanted a blank line, so add two \n's to get that
7970         # effect, and go to the next line.
7971         if (length $line == 0) {
7972             $return .= "\n\n";
7973             next;
7974         }
7975
7976         # Look at the last character of what we have so far.
7977         my $previous_char = substr($return, -1, 1);
7978
7979         # And at the next char to be output.
7980         my $next_char = substr($line, 0, 1);
7981
7982         if ($previous_char ne "\n") {
7983
7984             # Here didn't end wth a nl.  If the next char a blank or \b, it
7985             # means that here there is a break anyway.  So add a nl to the
7986             # output.
7987             if ($next_char eq " " || $next_char eq "\b") {
7988                 $previous_char = "\n";
7989                 $return .= $previous_char;
7990             }
7991
7992             # Add an extra space after periods.
7993             $return .= " " if $previous_char eq '.';
7994         }
7995
7996         # Here $previous_char is still the latest character to be output.  If
7997         # it isn't a nl, it means that the next line is to be a continuation
7998         # line, with a blank inserted between them.
7999         $return .= " " if $previous_char ne "\n";
8000
8001         # Get rid of any \b
8002         substr($line, 0, 1) = "" if $next_char eq "\b";
8003
8004         # And append this next line.
8005         $return .= $line;
8006     }
8007
8008     return $return;
8009 }
8010
8011 sub simple_fold($;$$$) {
8012     # Returns a string of the input (string or an array of strings) folded
8013     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8014     # a \n
8015     # This is tailored for the kind of text written by this program,
8016     # especially the pod file, which can have very long names with
8017     # underscores in the middle, or words like AbcDefgHij....  We allow
8018     # breaking in the middle of such constructs if the line won't fit
8019     # otherwise.  The break in such cases will come either just after an
8020     # underscore, or just before one of the Capital letters.
8021
8022     local $to_trace = 0 if main::DEBUG;
8023
8024     my $line = shift;
8025     my $prefix = shift;     # Optional string to prepend to each output
8026                             # line
8027     $prefix = "" unless defined $prefix;
8028
8029     my $hanging_indent = shift; # Optional number of spaces to indent
8030                                 # continuation lines
8031     $hanging_indent = 0 unless $hanging_indent;
8032
8033     my $right_margin = shift;   # Optional number of spaces to narrow the
8034                                 # total width by.
8035     $right_margin = 0 unless defined $right_margin;
8036
8037     # Call carp with the 'nofold' option to avoid it from trying to call us
8038     # recursively
8039     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8040
8041     # The space available doesn't include what's automatically prepended
8042     # to each line, or what's reserved on the right.
8043     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8044     # XXX Instead of using the 'nofold' perhaps better to look up the stack
8045
8046     if (DEBUG && $hanging_indent >= $max) {
8047         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
8048         $hanging_indent = 0;
8049     }
8050
8051     # First, split into the current physical lines.
8052     my @line;
8053     if (ref $line) {        # Better be an array, because not bothering to
8054                             # test
8055         foreach my $line (@{$line}) {
8056             push @line, split /\n/, $line;
8057         }
8058     }
8059     else {
8060         @line = split /\n/, $line;
8061     }
8062
8063     #local $to_trace = 1 if main::DEBUG;
8064     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8065
8066     # Look at each current physical line.
8067     for (my $i = 0; $i < @line; $i++) {
8068         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8069         #local $to_trace = 1 if main::DEBUG;
8070         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8071
8072         # Remove prefix, because will be added back anyway, don't want
8073         # doubled prefix
8074         $line[$i] =~ s/^$prefix//;
8075
8076         # Remove trailing space
8077         $line[$i] =~ s/\s+\Z//;
8078
8079         # If the line is too long, fold it.
8080         if (length $line[$i] > $max) {
8081             my $remainder;
8082
8083             # Here needs to fold.  Save the leading space in the line for
8084             # later.
8085             $line[$i] =~ /^ ( \s* )/x;
8086             my $leading_space = $1;
8087             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8088
8089             # If character at final permissible position is white space,
8090             # fold there, which will delete that white space
8091             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8092                 $remainder = substr($line[$i], $max);
8093                 $line[$i] = substr($line[$i], 0, $max - 1);
8094             }
8095             else {
8096
8097                 # Otherwise fold at an acceptable break char closest to
8098                 # the max length.  Look at just the maximal initial
8099                 # segment of the line
8100                 my $segment = substr($line[$i], 0, $max - 1);
8101                 if ($segment =~
8102                     /^ ( .{$hanging_indent}   # Don't look before the
8103                                               #  indent.
8104                         \ *                   # Don't look in leading
8105                                               #  blanks past the indent
8106                             [^ ] .*           # Find the right-most
8107                         (?:                   #  acceptable break:
8108                             [ \s = ]          # space or equal
8109                             | - (?! [.0-9] )  # or non-unary minus.
8110                         )                     # $1 includes the character
8111                     )/x)
8112                 {
8113                     # Split into the initial part that fits, and remaining
8114                     # part of the input
8115                     $remainder = substr($line[$i], length $1);
8116                     $line[$i] = $1;
8117                     trace $line[$i] if DEBUG && $to_trace;
8118                     trace $remainder if DEBUG && $to_trace;
8119                 }
8120
8121                 # If didn't find a good breaking spot, see if there is a
8122                 # not-so-good breaking spot.  These are just after
8123                 # underscores or where the case changes from lower to
8124                 # upper.  Use \a as a soft hyphen, but give up
8125                 # and don't break the line if there is actually a \a
8126                 # already in the input.  We use an ascii character for the
8127                 # soft-hyphen to avoid any attempt by miniperl to try to
8128                 # access the files that this program is creating.
8129                 elsif ($segment !~ /\a/
8130                        && ($segment =~ s/_/_\a/g
8131                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8132                 {
8133                     # Here were able to find at least one place to insert
8134                     # our substitute soft hyphen.  Find the right-most one
8135                     # and replace it by a real hyphen.
8136                     trace $segment if DEBUG && $to_trace;
8137                     substr($segment,
8138                             rindex($segment, "\a"),
8139                             1) = '-';
8140
8141                     # Then remove the soft hyphen substitutes.
8142                     $segment =~ s/\a//g;
8143                     trace $segment if DEBUG && $to_trace;
8144
8145                     # And split into the initial part that fits, and
8146                     # remainder of the line
8147                     my $pos = rindex($segment, '-');
8148                     $remainder = substr($line[$i], $pos);
8149                     trace $remainder if DEBUG && $to_trace;
8150                     $line[$i] = substr($segment, 0, $pos + 1);
8151                 }
8152             }
8153
8154             # Here we know if we can fold or not.  If we can, $remainder
8155             # is what remains to be processed in the next iteration.
8156             if (defined $remainder) {
8157                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8158
8159                 # Insert the folded remainder of the line as a new element
8160                 # of the array.  (It may still be too long, but we will
8161                 # deal with that next time through the loop.)  Omit any
8162                 # leading space in the remainder.
8163                 $remainder =~ s/^\s+//;
8164                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8165
8166                 # But then indent by whichever is larger of:
8167                 # 1) the leading space on the input line;
8168                 # 2) the hanging indent.
8169                 # This preserves indentation in the original line.
8170                 my $lead = ($leading_space)
8171                             ? length $leading_space
8172                             : $hanging_indent;
8173                 $lead = max($lead, $hanging_indent);
8174                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8175             }
8176         }
8177
8178         # Ready to output the line. Get rid of any trailing space
8179         # And prefix by the required $prefix passed in.
8180         $line[$i] =~ s/\s+$//;
8181         $line[$i] = "$prefix$line[$i]\n";
8182     } # End of looping through all the lines.
8183
8184     return join "", @line;
8185 }
8186
8187 sub property_ref {  # Returns a reference to a property object.
8188     return Property::property_ref(@_);
8189 }
8190
8191 sub force_unlink ($) {
8192     my $filename = shift;
8193     return unless file_exists($filename);
8194     return if CORE::unlink($filename);
8195
8196     # We might need write permission
8197     chmod 0777, $filename;
8198     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
8199     return;
8200 }
8201
8202 sub write ($$@) {
8203     # Given a filename and references to arrays of lines, write the lines of
8204     # each array to the file
8205     # Filename can be given as an arrayref of directory names
8206
8207     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8208
8209     my $file  = shift;
8210     my $use_utf8 = shift;
8211
8212     # Get into a single string if an array, and get rid of, in Unix terms, any
8213     # leading '.'
8214     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8215     $file = File::Spec->canonpath($file);
8216
8217     # If has directories, make sure that they all exist
8218     (undef, my $directories, undef) = File::Spec->splitpath($file);
8219     File::Path::mkpath($directories) if $directories && ! -d $directories;
8220
8221     push @files_actually_output, $file;
8222
8223     force_unlink ($file);
8224
8225     my $OUT;
8226     if (not open $OUT, ">", $file) {
8227         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
8228         return;
8229     }
8230
8231     binmode $OUT, ":utf8" if $use_utf8;
8232
8233     while (defined (my $lines_ref = shift)) {
8234         unless (@$lines_ref) {
8235             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8236         }
8237
8238         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8239     }
8240     close $OUT or die Carp::my_carp("close '$file' failed: $!");
8241
8242     print "$file written.\n" if $verbosity >= $VERBOSE;
8243
8244     return;
8245 }
8246
8247
8248 sub Standardize($) {
8249     # This converts the input name string into a standardized equivalent to
8250     # use internally.
8251
8252     my $name = shift;
8253     unless (defined $name) {
8254       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
8255       return;
8256     }
8257
8258     # Remove any leading or trailing white space
8259     $name =~ s/^\s+//g;
8260     $name =~ s/\s+$//g;
8261
8262     # Convert interior white space and hyphens into underscores.
8263     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8264
8265     # Capitalize the letter following an underscore, and convert a sequence of
8266     # multiple underscores to a single one
8267     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8268
8269     # And capitalize the first letter, but not for the special cjk ones.
8270     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8271     return $name;
8272 }
8273
8274 sub standardize ($) {
8275     # Returns a lower-cased standardized name, without underscores.  This form
8276     # is chosen so that it can distinguish between any real versus superficial
8277     # Unicode name differences.  It relies on the fact that Unicode doesn't
8278     # have interior underscores, white space, nor dashes in any
8279     # stricter-matched name.  It should not be used on Unicode code point
8280     # names (the Name property), as they mostly, but not always follow these
8281     # rules.
8282
8283     my $name = Standardize(shift);
8284     return if !defined $name;
8285
8286     $name =~ s/ (?<= .) _ (?= . ) //xg;
8287     return lc $name;
8288 }
8289
8290 sub utf8_heavy_name ($$) {
8291     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
8292     # perhaps this function should be placed somewhere, like Heavy.pl so that
8293     # utf8_heavy can use it directly without duplicating code that can get
8294     # out-of sync.
8295
8296     my $table = shift;
8297     my $alias = shift;
8298     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8299
8300     my $property = $table->property;
8301     $property = ($property == $perl)
8302                 ? ""                # 'perl' is never explicitly stated
8303                 : standardize($property->name) . '=';
8304     if ($alias->loose_match) {
8305         return $property . standardize($alias->name);
8306     }
8307     else {
8308         return lc ($property . $alias->name);
8309     }
8310
8311     return;
8312 }
8313
8314 {   # Closure
8315
8316     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8317     my %already_output;
8318
8319     $main::simple_dumper_nesting = 0;
8320
8321     sub simple_dumper {
8322         # Like Simple Data::Dumper. Good enough for our needs. We can't use
8323         # the real thing as we have to run under miniperl.
8324
8325         # It is designed so that on input it is at the beginning of a line,
8326         # and the final thing output in any call is a trailing ",\n".
8327
8328         my $item = shift;
8329         my $indent = shift;
8330         $indent = "" if ! $debugging_build || ! defined $indent;
8331
8332         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8333
8334         # nesting level is localized, so that as the call stack pops, it goes
8335         # back to the prior value.
8336         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8337         undef %already_output if $main::simple_dumper_nesting == 0;
8338         $main::simple_dumper_nesting++;
8339         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8340
8341         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8342
8343         # Determine the indent for recursive calls.
8344         my $next_indent = $indent . $indent_increment;
8345
8346         my $output;
8347         if (! ref $item) {
8348
8349             # Dump of scalar: just output it in quotes if not a number.  To do
8350             # so we must escape certain characters, and therefore need to
8351             # operate on a copy to avoid changing the original
8352             my $copy = $item;
8353             $copy = $UNDEF unless defined $copy;
8354
8355             # Quote non-integers (integers also have optional leading '-')
8356             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8357
8358                 # Escape apostrophe and backslash
8359                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8360                 $copy = "'$copy'";
8361             }
8362             $output = "$indent$copy,\n";
8363         }
8364         else {
8365
8366             # Keep track of cycles in the input, and refuse to infinitely loop
8367             my $addr = do { no overloading; pack 'J', $item; };
8368             if (defined $already_output{$addr}) {
8369                 return "${indent}ALREADY OUTPUT: $item\n";
8370             }
8371             $already_output{$addr} = $item;
8372
8373             if (ref $item eq 'ARRAY') {
8374                 my $using_brackets;
8375                 $output = $indent;
8376                 if ($main::simple_dumper_nesting > 1) {
8377                     $output .= '[';
8378                     $using_brackets = 1;
8379                 }
8380                 else {
8381                     $using_brackets = 0;
8382                 }
8383
8384                 # If the array is empty, put the closing bracket on the same
8385                 # line.  Otherwise, recursively add each array element
8386                 if (@$item == 0) {
8387                     $output .= " ";
8388                 }
8389                 else {
8390                     $output .= "\n";
8391                     for (my $i = 0; $i < @$item; $i++) {
8392
8393                         # Indent array elements one level
8394                         $output .= &simple_dumper($item->[$i], $next_indent);
8395                         next if ! $debugging_build;
8396                         $output =~ s/\n$//;      # Remove any trailing nl so
8397                         $output .= " # [$i]\n";  # as to add a comment giving
8398                                                  # the array index
8399                     }
8400                     $output .= $indent;     # Indent closing ']' to orig level
8401                 }
8402                 $output .= ']' if $using_brackets;
8403                 $output .= ",\n";
8404             }
8405             elsif (ref $item eq 'HASH') {
8406                 my $is_first_line;
8407                 my $using_braces;
8408                 my $body_indent;
8409
8410                 # No surrounding braces at top level
8411                 $output .= $indent;
8412                 if ($main::simple_dumper_nesting > 1) {
8413                     $output .= "{\n";
8414                     $is_first_line = 0;
8415                     $body_indent = $next_indent;
8416                     $next_indent .= $indent_increment;
8417                     $using_braces = 1;
8418                 }
8419                 else {
8420                     $is_first_line = 1;
8421                     $body_indent = $indent;
8422                     $using_braces = 0;
8423                 }
8424
8425                 # Output hashes sorted alphabetically instead of apparently
8426                 # random.  Use caseless alphabetic sort
8427                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8428                 {
8429                     if ($is_first_line) {
8430                         $is_first_line = 0;
8431                     }
8432                     else {
8433                         $output .= "$body_indent";
8434                     }
8435
8436                     # The key must be a scalar, but this recursive call quotes
8437                     # it
8438                     $output .= &simple_dumper($key);
8439
8440                     # And change the trailing comma and nl to the hash fat
8441                     # comma for clarity, and so the value can be on the same
8442                     # line
8443                     $output =~ s/,\n$/ => /;
8444
8445                     # Recursively call to get the value's dump.
8446                     my $next = &simple_dumper($item->{$key}, $next_indent);
8447
8448                     # If the value is all on one line, remove its indent, so
8449                     # will follow the => immediately.  If it takes more than
8450                     # one line, start it on a new line.
8451                     if ($next !~ /\n.*\n/) {
8452                         $next =~ s/^ *//;
8453                     }
8454                     else {
8455                         $output .= "\n";
8456                     }
8457                     $output .= $next;
8458                 }
8459
8460                 $output .= "$indent},\n" if $using_braces;
8461             }
8462             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8463                 $output = $indent . ref($item) . "\n";
8464                 # XXX see if blessed
8465             }
8466             elsif ($item->can('dump')) {
8467
8468                 # By convention in this program, objects furnish a 'dump'
8469                 # method.  Since not doing any output at this level, just pass
8470                 # on the input indent
8471                 $output = $item->dump($indent);
8472             }
8473             else {
8474                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
8475             }
8476         }
8477         return $output;
8478     }
8479 }
8480
8481 sub dump_inside_out {
8482     # Dump inside-out hashes in an object's state by converting them to a
8483     # regular hash and then calling simple_dumper on that.
8484
8485     my $object = shift;
8486     my $fields_ref = shift;
8487     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8488
8489     my $addr = do { no overloading; pack 'J', $object; };
8490
8491     my %hash;
8492     foreach my $key (keys %$fields_ref) {
8493         $hash{$key} = $fields_ref->{$key}{$addr};
8494     }
8495
8496     return simple_dumper(\%hash, @_);
8497 }
8498
8499 sub _operator_dot {
8500     # Overloaded '.' method that is common to all packages.  It uses the
8501     # package's stringify method.
8502
8503     my $self = shift;
8504     my $other = shift;
8505     my $reversed = shift;
8506     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8507
8508     $other = "" unless defined $other;
8509
8510     foreach my $which (\$self, \$other) {
8511         next unless ref $$which;
8512         if ($$which->can('_operator_stringify')) {
8513             $$which = $$which->_operator_stringify;
8514         }
8515         else {
8516             my $ref = ref $$which;
8517             my $addr = do { no overloading; pack 'J', $$which; };
8518             $$which = "$ref ($addr)";
8519         }
8520     }
8521     return ($reversed)
8522             ? "$other$self"
8523             : "$self$other";
8524 }
8525
8526 sub _operator_equal {
8527     # Generic overloaded '==' routine.  To be equal, they must be the exact
8528     # same object
8529
8530     my $self = shift;
8531     my $other = shift;
8532
8533     return 0 unless defined $other;
8534     return 0 unless ref $other;
8535     no overloading;
8536     return $self == $other;
8537 }
8538
8539 sub _operator_not_equal {
8540     my $self = shift;
8541     my $other = shift;
8542
8543     return ! _operator_equal($self, $other);
8544 }
8545
8546 sub process_PropertyAliases($) {
8547     # This reads in the PropertyAliases.txt file, which contains almost all
8548     # the character properties in Unicode and their equivalent aliases:
8549     # scf       ; Simple_Case_Folding         ; sfc
8550     #
8551     # Field 0 is the preferred short name for the property.
8552     # Field 1 is the full name.
8553     # Any succeeding ones are other accepted names.
8554
8555     my $file= shift;
8556     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8557
8558     # This whole file was non-existent in early releases, so use our own
8559     # internal one.
8560     $file->insert_lines(get_old_property_aliases())
8561                                                 if ! -e 'PropertyAliases.txt';
8562
8563     # Add any cjk properties that may have been defined.
8564     $file->insert_lines(@cjk_properties);
8565
8566     while ($file->next_line) {
8567
8568         my @data = split /\s*;\s*/;
8569
8570         my $full = $data[1];
8571
8572         my $this = Property->new($data[0], Full_Name => $full);
8573
8574         # Start looking for more aliases after these two.
8575         for my $i (2 .. @data - 1) {
8576             $this->add_alias($data[$i]);
8577         }
8578
8579     }
8580     return;
8581 }
8582
8583 sub finish_property_setup {
8584     # Finishes setting up after PropertyAliases.
8585
8586     my $file = shift;
8587     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8588
8589     # This entry was missing from this file in earlier Unicode versions
8590     if (-e 'Jamo.txt') {
8591         my $jsn = property_ref('JSN');
8592         if (! defined $jsn) {
8593             $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8594         }
8595     }
8596
8597     # This entry is still missing as of 6.0, perhaps because no short name for
8598     # it.
8599     if (-e 'NameAliases.txt') {
8600         my $aliases = property_ref('Name_Alias');
8601         if (! defined $aliases) {
8602             $aliases = Property->new('Name_Alias');
8603         }
8604     }
8605
8606     # These are used so much, that we set globals for them.
8607     $gc = property_ref('General_Category');
8608     $block = property_ref('Block');
8609     $script = property_ref('Script');
8610
8611     # Perl adds this alias.
8612     $gc->add_alias('Category');
8613
8614     # For backwards compatibility, these property files have particular names.
8615     property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
8616                                                           # utf8.c calls it
8617     property_ref('Lowercase_Mapping')->set_file('Lower');
8618     property_ref('Titlecase_Mapping')->set_file('Title');
8619
8620     my $fold = property_ref('Case_Folding');
8621     $fold->set_file('Fold') if defined $fold;
8622
8623     # Unicode::Normalize expects this file with this name and directory.
8624     my $ccc = property_ref('Canonical_Combining_Class');
8625     if (defined $ccc) {
8626         $ccc->set_file('CombiningClass');
8627         $ccc->set_directory(File::Spec->curdir());
8628     }
8629
8630     # utf8.c has a different meaning for non range-size-1 for map properties
8631     # that this program doesn't currently handle; and even if it were changed
8632     # to do so, some other code may be using them expecting range size 1.
8633     foreach my $property (qw {
8634                                 Case_Folding
8635                                 Lowercase_Mapping
8636                                 Titlecase_Mapping
8637                                 Uppercase_Mapping
8638                             })
8639     {
8640         property_ref($property)->set_range_size_1(1);
8641     }
8642
8643     # These two properties aren't actually used in the core, but unfortunately
8644     # the names just above that are in the core interfere with these, so
8645     # choose different names.  These aren't a problem unless the map tables
8646     # for these files get written out.
8647     my $lowercase = property_ref('Lowercase');
8648     $lowercase->set_file('IsLower') if defined $lowercase;
8649     my $uppercase = property_ref('Uppercase');
8650     $uppercase->set_file('IsUpper') if defined $uppercase;
8651
8652     # Set up the hard-coded default mappings, but only on properties defined
8653     # for this release
8654     foreach my $property (keys %default_mapping) {
8655         my $property_object = property_ref($property);
8656         next if ! defined $property_object;
8657         my $default_map = $default_mapping{$property};
8658         $property_object->set_default_map($default_map);
8659
8660         # A map of <code point> implies the property is string.
8661         if ($property_object->type == $UNKNOWN
8662             && $default_map eq $CODE_POINT)
8663         {
8664             $property_object->set_type($STRING);
8665         }
8666     }
8667
8668     # The following use the Multi_Default class to create objects for
8669     # defaults.
8670
8671     # Bidi class has a complicated default, but the derived file takes care of
8672     # the complications, leaving just 'L'.
8673     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8674         property_ref('Bidi_Class')->set_default_map('L');
8675     }
8676     else {
8677         my $default;
8678
8679         # The derived file was introduced in 3.1.1.  The values below are
8680         # taken from table 3-8, TUS 3.0
8681         my $default_R =
8682             'my $default = Range_List->new;
8683              $default->add_range(0x0590, 0x05FF);
8684              $default->add_range(0xFB1D, 0xFB4F);'
8685         ;
8686
8687         # The defaults apply only to unassigned characters
8688         $default_R .= '$gc->table("Unassigned") & $default;';
8689
8690         if ($v_version lt v3.0.0) {
8691             $default = Multi_Default->new(R => $default_R, 'L');
8692         }
8693         else {
8694
8695             # AL apparently not introduced until 3.0:  TUS 2.x references are
8696             # not on-line to check it out
8697             my $default_AL =
8698                 'my $default = Range_List->new;
8699                  $default->add_range(0x0600, 0x07BF);
8700                  $default->add_range(0xFB50, 0xFDFF);
8701                  $default->add_range(0xFE70, 0xFEFF);'
8702             ;
8703
8704             # Non-character code points introduced in this release; aren't AL
8705             if ($v_version ge 3.1.0) {
8706                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8707             }
8708             $default_AL .= '$gc->table("Unassigned") & $default';
8709             $default = Multi_Default->new(AL => $default_AL,
8710                                           R => $default_R,
8711                                           'L');
8712         }
8713         property_ref('Bidi_Class')->set_default_map($default);
8714     }
8715
8716     # Joining type has a complicated default, but the derived file takes care
8717     # of the complications, leaving just 'U' (or Non_Joining), except the file
8718     # is bad in 3.1.0
8719     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8720         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8721             property_ref('Joining_Type')->set_default_map('Non_Joining');
8722         }
8723         else {
8724
8725             # Otherwise, there are not one, but two possibilities for the
8726             # missing defaults: T and U.
8727             # The missing defaults that evaluate to T are given by:
8728             # T = Mn + Cf - ZWNJ - ZWJ
8729             # where Mn and Cf are the general category values. In other words,
8730             # any non-spacing mark or any format control character, except
8731             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8732             # WIDTH JOINER (joining type C).
8733             my $default = Multi_Default->new(
8734                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8735                'Non_Joining');
8736             property_ref('Joining_Type')->set_default_map($default);
8737         }
8738     }
8739
8740     # Line break has a complicated default in early releases. It is 'Unknown'
8741     # for non-assigned code points; 'AL' for assigned.
8742     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8743         my $lb = property_ref('Line_Break');
8744         if ($v_version gt 3.2.0) {
8745             $lb->set_default_map('Unknown');
8746         }
8747         else {
8748             my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8749                                               'AL');
8750             $lb->set_default_map($default);
8751         }
8752
8753         # If has the URS property, make sure that the standard aliases are in
8754         # it, since not in the input tables in some versions.
8755         my $urs = property_ref('Unicode_Radical_Stroke');
8756         if (defined $urs) {
8757             $urs->add_alias('cjkRSUnicode');
8758             $urs->add_alias('kRSUnicode');
8759         }
8760     }
8761     return;
8762 }
8763
8764 sub get_old_property_aliases() {
8765     # Returns what would be in PropertyAliases.txt if it existed in very old
8766     # versions of Unicode.  It was derived from the one in 3.2, and pared
8767     # down based on the data that was actually in the older releases.
8768     # An attempt was made to use the existence of files to mean inclusion or
8769     # not of various aliases, but if this was not sufficient, using version
8770     # numbers was resorted to.
8771
8772     my @return;
8773
8774     # These are to be used in all versions (though some are constructed by
8775     # this program if missing)
8776     push @return, split /\n/, <<'END';
8777 bc        ; Bidi_Class
8778 Bidi_M    ; Bidi_Mirrored
8779 cf        ; Case_Folding
8780 ccc       ; Canonical_Combining_Class
8781 dm        ; Decomposition_Mapping
8782 dt        ; Decomposition_Type
8783 gc        ; General_Category
8784 isc       ; ISO_Comment
8785 lc        ; Lowercase_Mapping
8786 na        ; Name
8787 na1       ; Unicode_1_Name
8788 nt        ; Numeric_Type
8789 nv        ; Numeric_Value
8790 sfc       ; Simple_Case_Folding
8791 slc       ; Simple_Lowercase_Mapping
8792 stc       ; Simple_Titlecase_Mapping
8793 suc       ; Simple_Uppercase_Mapping
8794 tc        ; Titlecase_Mapping
8795 uc        ; Uppercase_Mapping
8796 END
8797
8798     if (-e 'Blocks.txt') {
8799         push @return, "blk       ; Block\n";
8800     }
8801     if (-e 'ArabicShaping.txt') {
8802         push @return, split /\n/, <<'END';
8803 jg        ; Joining_Group
8804 jt        ; Joining_Type
8805 END
8806     }
8807     if (-e 'PropList.txt') {
8808
8809         # This first set is in the original old-style proplist.
8810         push @return, split /\n/, <<'END';
8811 Alpha     ; Alphabetic
8812 Bidi_C    ; Bidi_Control
8813 Dash      ; Dash
8814 Dia       ; Diacritic
8815 Ext       ; Extender
8816 Hex       ; Hex_Digit
8817 Hyphen    ; Hyphen
8818 IDC       ; ID_Continue
8819 Ideo      ; Ideographic
8820 Join_C    ; Join_Control
8821 Math      ; Math
8822 QMark     ; Quotation_Mark
8823 Term      ; Terminal_Punctuation
8824 WSpace    ; White_Space
8825 END
8826         # The next sets were added later
8827         if ($v_version ge v3.0.0) {
8828             push @return, split /\n/, <<'END';
8829 Upper     ; Uppercase
8830 Lower     ; Lowercase
8831 END
8832         }
8833         if ($v_version ge v3.0.1) {
8834             push @return, split /\n/, <<'END';
8835 NChar     ; Noncharacter_Code_Point
8836 END
8837         }
8838         # The next sets were added in the new-style
8839         if ($v_version ge v3.1.0) {
8840             push @return, split /\n/, <<'END';
8841 OAlpha    ; Other_Alphabetic
8842 OLower    ; Other_Lowercase
8843 OMath     ; Other_Math
8844 OUpper    ; Other_Uppercase
8845 END
8846         }
8847         if ($v_version ge v3.1.1) {
8848             push @return, "AHex      ; ASCII_Hex_Digit\n";
8849         }
8850     }
8851     if (-e 'EastAsianWidth.txt') {
8852         push @return, "ea        ; East_Asian_Width\n";
8853     }
8854     if (-e 'CompositionExclusions.txt') {
8855         push @return, "CE        ; Composition_Exclusion\n";
8856     }
8857     if (-e 'LineBreak.txt') {
8858         push @return, "lb        ; Line_Break\n";
8859     }
8860     if (-e 'BidiMirroring.txt') {
8861         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
8862     }
8863     if (-e 'Scripts.txt') {
8864         push @return, "sc        ; Script\n";
8865     }
8866     if (-e 'DNormalizationProps.txt') {
8867         push @return, split /\n/, <<'END';
8868 Comp_Ex   ; Full_Composition_Exclusion
8869 FC_NFKC   ; FC_NFKC_Closure
8870 NFC_QC    ; NFC_Quick_Check
8871 NFD_QC    ; NFD_Quick_Check
8872 NFKC_QC   ; NFKC_Quick_Check
8873 NFKD_QC   ; NFKD_Quick_Check
8874 XO_NFC    ; Expands_On_NFC
8875 XO_NFD    ; Expands_On_NFD
8876 XO_NFKC   ; Expands_On_NFKC
8877 XO_NFKD   ; Expands_On_NFKD
8878 END
8879     }
8880     if (-e 'DCoreProperties.txt') {
8881         push @return, split /\n/, <<'END';
8882 IDS       ; ID_Start
8883 XIDC      ; XID_Continue
8884 XIDS      ; XID_Start
8885 END
8886         # These can also appear in some versions of PropList.txt
8887         push @return, "Lower     ; Lowercase\n"
8888                                     unless grep { $_ =~ /^Lower\b/} @return;
8889         push @return, "Upper     ; Uppercase\n"
8890                                     unless grep { $_ =~ /^Upper\b/} @return;
8891     }
8892
8893     # This flag requires the DAge.txt file to be copied into the directory.
8894     if (DEBUG && $compare_versions) {
8895         push @return, 'age       ; Age';
8896     }
8897
8898     return @return;
8899 }
8900
8901 sub process_PropValueAliases {
8902     # This file contains values that properties look like:
8903     # bc ; AL        ; Arabic_Letter
8904     # blk; n/a       ; Greek_And_Coptic                 ; Greek
8905     #
8906     # Field 0 is the property.
8907     # Field 1 is the short name of a property value or 'n/a' if no
8908     #                short name exists;
8909     # Field 2 is the full property value name;
8910     # Any other fields are more synonyms for the property value.
8911     # Purely numeric property values are omitted from the file; as are some
8912     # others, fewer and fewer in later releases
8913
8914     # Entries for the ccc property have an extra field before the
8915     # abbreviation:
8916     # ccc;   0; NR   ; Not_Reordered
8917     # It is the numeric value that the names are synonyms for.
8918
8919     # There are comment entries for values missing from this file:
8920     # # @missing: 0000..10FFFF; ISO_Comment; <none>
8921     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8922
8923     my $file= shift;
8924     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8925
8926     # This whole file was non-existent in early releases, so use our own
8927     # internal one if necessary.
8928     if (! -e 'PropValueAliases.txt') {
8929         $file->insert_lines(get_old_property_value_aliases());
8930     }
8931
8932     # Add any explicit cjk values
8933     $file->insert_lines(@cjk_property_values);
8934
8935     # This line is used only for testing the code that checks for name
8936     # conflicts.  There is a script Inherited, and when this line is executed
8937     # it causes there to be a name conflict with the 'Inherited' that this
8938     # program generates for this block property value
8939     #$file->insert_lines('blk; n/a; Herited');
8940
8941
8942     # Process each line of the file ...
8943     while ($file->next_line) {
8944
8945         my ($property, @data) = split /\s*;\s*/;
8946
8947         # The ccc property has an extra field at the beginning, which is the
8948         # numeric value.  Move it to be after the other two, mnemonic, fields,
8949         # so that those will be used as the property value's names, and the
8950         # number will be an extra alias.  (Rightmost splice removes field 1-2,
8951         # returning them in a slice; left splice inserts that before anything,
8952         # thus shifting the former field 0 to after them.)
8953         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8954
8955         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
8956         # there is no short name, use the full one in element 1
8957         if ($data[0] eq "n/a") {
8958             $data[0] = $data[1];
8959         }
8960         elsif ($data[0] ne $data[1]
8961                && standardize($data[0]) eq standardize($data[1])
8962                && $data[1] !~ /[[:upper:]]/)
8963         {
8964             # Also, there is a bug in the file in which "n/a" is omitted, and
8965             # the two fields are identical except for case, and the full name
8966             # is all lower case.  Copy the "short" name unto the full one to
8967             # give it some upper case.
8968
8969             $data[1] = $data[0];
8970         }
8971
8972         # Earlier releases had the pseudo property 'qc' that should expand to
8973         # the ones that replace it below.
8974         if ($property eq 'qc') {
8975             if (lc $data[0] eq 'y') {
8976                 $file->insert_lines('NFC_QC; Y      ; Yes',
8977                                     'NFD_QC; Y      ; Yes',
8978                                     'NFKC_QC; Y     ; Yes',
8979                                     'NFKD_QC; Y     ; Yes',
8980                                     );
8981             }
8982             elsif (lc $data[0] eq 'n') {
8983                 $file->insert_lines('NFC_QC; N      ; No',
8984                                     'NFD_QC; N      ; No',
8985                                     'NFKC_QC; N     ; No',
8986                                     'NFKD_QC; N     ; No',
8987                                     );
8988             }
8989             elsif (lc $data[0] eq 'm') {
8990                 $file->insert_lines('NFC_QC; M      ; Maybe',
8991                                     'NFKC_QC; M     ; Maybe',
8992                                     );
8993             }
8994             else {
8995                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8996             }
8997             next;
8998         }
8999
9000         # The first field is the short name, 2nd is the full one.
9001         my $property_object = property_ref($property);
9002         my $table = $property_object->add_match_table($data[0],
9003                                                 Full_Name => $data[1]);
9004
9005         # Start looking for more aliases after these two.
9006         for my $i (2 .. @data - 1) {
9007             $table->add_alias($data[$i]);
9008         }
9009     } # End of looping through the file
9010
9011     # As noted in the comments early in the program, it generates tables for
9012     # the default values for all releases, even those for which the concept
9013     # didn't exist at the time.  Here we add those if missing.
9014     my $age = property_ref('age');
9015     if (defined $age && ! defined $age->table('Unassigned')) {
9016         $age->add_match_table('Unassigned');
9017     }
9018     $block->add_match_table('No_Block') if -e 'Blocks.txt'
9019                                     && ! defined $block->table('No_Block');
9020
9021
9022     # Now set the default mappings of the properties from the file.  This is
9023     # done after the loop because a number of properties have only @missings
9024     # entries in the file, and may not show up until the end.
9025     my @defaults = $file->get_missings;
9026     foreach my $default_ref (@defaults) {
9027         my $default = $default_ref->[0];
9028         my $property = property_ref($default_ref->[1]);
9029         $property->set_default_map($default);
9030     }
9031     return;
9032 }
9033
9034 sub get_old_property_value_aliases () {
9035     # Returns what would be in PropValueAliases.txt if it existed in very old
9036     # versions of Unicode.  It was derived from the one in 3.2, and pared
9037     # down.  An attempt was made to use the existence of files to mean
9038     # inclusion or not of various aliases, but if this was not sufficient,
9039     # using version numbers was resorted to.
9040
9041     my @return = split /\n/, <<'END';
9042 bc ; AN        ; Arabic_Number
9043 bc ; B         ; Paragraph_Separator
9044 bc ; CS        ; Common_Separator
9045 bc ; EN        ; European_Number
9046 bc ; ES        ; European_Separator
9047 bc ; ET        ; European_Terminator
9048 bc ; L         ; Left_To_Right
9049 bc ; ON        ; Other_Neutral
9050 bc ; R         ; Right_To_Left
9051 bc ; WS        ; White_Space
9052
9053 # The standard combining classes are very much different in v1, so only use
9054 # ones that look right (not checked thoroughly)
9055 ccc;   0; NR   ; Not_Reordered
9056 ccc;   1; OV   ; Overlay
9057 ccc;   7; NK   ; Nukta
9058 ccc;   8; KV   ; Kana_Voicing
9059 ccc;   9; VR   ; Virama
9060 ccc; 202; ATBL ; Attached_Below_Left
9061 ccc; 216; ATAR ; Attached_Above_Right
9062 ccc; 218; BL   ; Below_Left
9063 ccc; 220; B    ; Below
9064 ccc; 222; BR   ; Below_Right
9065 ccc; 224; L    ; Left
9066 ccc; 228; AL   ; Above_Left
9067 ccc; 230; A    ; Above
9068 ccc; 232; AR   ; Above_Right
9069 ccc; 234; DA   ; Double_Above
9070
9071 dt ; can       ; canonical
9072 dt ; enc       ; circle
9073 dt ; fin       ; final
9074 dt ; font      ; font
9075 dt ; fra       ; fraction
9076 dt ; init      ; initial
9077 dt ; iso       ; isolated
9078 dt ; med       ; medial
9079 dt ; n/a       ; none
9080 dt ; nb        ; noBreak
9081 dt ; sqr       ; square
9082 dt ; sub       ; sub
9083 dt ; sup       ; super
9084
9085 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
9086 gc ; Cc        ; Control
9087 gc ; Cn        ; Unassigned
9088 gc ; Co        ; Private_Use
9089 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
9090 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
9091 gc ; Ll        ; Lowercase_Letter
9092 gc ; Lm        ; Modifier_Letter
9093 gc ; Lo        ; Other_Letter
9094 gc ; Lu        ; Uppercase_Letter
9095 gc ; M         ; Mark                             # Mc | Me | Mn
9096 gc ; Mc        ; Spacing_Mark
9097 gc ; Mn        ; Nonspacing_Mark
9098 gc ; N         ; Number                           # Nd | Nl | No
9099 gc ; Nd        ; Decimal_Number
9100 gc ; No        ; Other_Number
9101 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
9102 gc ; Pd        ; Dash_Punctuation
9103 gc ; Pe        ; Close_Punctuation
9104 gc ; Po        ; Other_Punctuation
9105 gc ; Ps        ; Open_Punctuation
9106 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
9107 gc ; Sc        ; Currency_Symbol
9108 gc ; Sm        ; Math_Symbol
9109 gc ; So        ; Other_Symbol
9110 gc ; Z         ; Separator                        # Zl | Zp | Zs
9111 gc ; Zl        ; Line_Separator
9112 gc ; Zp        ; Paragraph_Separator
9113 gc ; Zs        ; Space_Separator
9114
9115 nt ; de        ; Decimal
9116 nt ; di        ; Digit
9117 nt ; n/a       ; None
9118 nt ; nu        ; Numeric
9119 END
9120
9121     if (-e 'ArabicShaping.txt') {
9122         push @return, split /\n/, <<'END';
9123 jg ; n/a       ; AIN
9124 jg ; n/a       ; ALEF
9125 jg ; n/a       ; DAL
9126 jg ; n/a       ; GAF
9127 jg ; n/a       ; LAM
9128 jg ; n/a       ; MEEM
9129 jg ; n/a       ; NO_JOINING_GROUP
9130 jg ; n/a       ; NOON
9131 jg ; n/a       ; QAF
9132 jg ; n/a       ; SAD
9133 jg ; n/a       ; SEEN
9134 jg ; n/a       ; TAH
9135 jg ; n/a       ; WAW
9136
9137 jt ; C         ; Join_Causing
9138 jt ; D         ; Dual_Joining
9139 jt ; L         ; Left_Joining
9140 jt ; R         ; Right_Joining
9141 jt ; U         ; Non_Joining
9142 jt ; T         ; Transparent
9143 END
9144         if ($v_version ge v3.0.0) {
9145             push @return, split /\n/, <<'END';
9146 jg ; n/a       ; ALAPH
9147 jg ; n/a       ; BEH
9148 jg ; n/a       ; BETH
9149 jg ; n/a       ; DALATH_RISH
9150 jg ; n/a       ; E
9151 jg ; n/a       ; FEH
9152 jg ; n/a       ; FINAL_SEMKATH
9153 jg ; n/a       ; GAMAL
9154 jg ; n/a       ; HAH
9155 jg ; n/a       ; HAMZA_ON_HEH_GOAL
9156 jg ; n/a       ; HE
9157 jg ; n/a       ; HEH
9158 jg ; n/a       ; HEH_GOAL
9159 jg ; n/a       ; HETH
9160 jg ; n/a       ; KAF
9161 jg ; n/a       ; KAPH
9162 jg ; n/a       ; KNOTTED_HEH
9163 jg ; n/a       ; LAMADH
9164 jg ; n/a       ; MIM
9165 jg ; n/a       ; NUN
9166 jg ; n/a       ; PE
9167 jg ; n/a       ; QAPH
9168 jg ; n/a       ; REH
9169 jg ; n/a       ; REVERSED_PE
9170 jg ; n/a       ; SADHE
9171 jg ; n/a       ; SEMKATH
9172 jg ; n/a       ; SHIN
9173 jg ; n/a       ; SWASH_KAF
9174 jg ; n/a       ; TAW
9175 jg ; n/a       ; TEH_MARBUTA
9176 jg ; n/a       ; TETH
9177 jg ; n/a       ; YEH
9178 jg ; n/a       ; YEH_BARREE
9179 jg ; n/a       ; YEH_WITH_TAIL
9180 jg ; n/a       ; YUDH
9181 jg ; n/a       ; YUDH_HE
9182 jg ; n/a       ; ZAIN
9183 END
9184         }
9185     }
9186
9187
9188     if (-e 'EastAsianWidth.txt') {
9189         push @return, split /\n/, <<'END';
9190 ea ; A         ; Ambiguous
9191 ea ; F         ; Fullwidth
9192 ea ; H         ; Halfwidth
9193 ea ; N         ; Neutral
9194 ea ; Na        ; Narrow
9195 ea ; W         ; Wide
9196 END
9197     }
9198
9199     if (-e 'LineBreak.txt') {
9200         push @return, split /\n/, <<'END';
9201 lb ; AI        ; Ambiguous
9202 lb ; AL        ; Alphabetic
9203 lb ; B2        ; Break_Both
9204 lb ; BA        ; Break_After
9205 lb ; BB        ; Break_Before
9206 lb ; BK        ; Mandatory_Break
9207 lb ; CB        ; Contingent_Break
9208 lb ; CL        ; Close_Punctuation
9209 lb ; CM        ; Combining_Mark
9210 lb ; CR        ; Carriage_Return
9211 lb ; EX        ; Exclamation
9212 lb ; GL        ; Glue
9213 lb ; HY        ; Hyphen
9214 lb ; ID        ; Ideographic
9215 lb ; IN        ; Inseperable
9216 lb ; IS        ; Infix_Numeric
9217 lb ; LF        ; Line_Feed
9218 lb ; NS        ; Nonstarter
9219 lb ; NU        ; Numeric
9220 lb ; OP        ; Open_Punctuation
9221 lb ; PO        ; Postfix_Numeric
9222 lb ; PR        ; Prefix_Numeric
9223 lb ; QU        ; Quotation
9224 lb ; SA        ; Complex_Context
9225 lb ; SG        ; Surrogate
9226 lb ; SP        ; Space
9227 lb ; SY        ; Break_Symbols
9228 lb ; XX        ; Unknown
9229 lb ; ZW        ; ZWSpace
9230 END
9231     }
9232
9233     if (-e 'DNormalizationProps.txt') {
9234         push @return, split /\n/, <<'END';
9235 qc ; M         ; Maybe
9236 qc ; N         ; No
9237 qc ; Y         ; Yes
9238 END
9239     }
9240
9241     if (-e 'Scripts.txt') {
9242         push @return, split /\n/, <<'END';
9243 sc ; Arab      ; Arabic
9244 sc ; Armn      ; Armenian
9245 sc ; Beng      ; Bengali
9246 sc ; Bopo      ; Bopomofo
9247 sc ; Cans      ; Canadian_Aboriginal
9248 sc ; Cher      ; Cherokee
9249 sc ; Cyrl      ; Cyrillic
9250 sc ; Deva      ; Devanagari
9251 sc ; Dsrt      ; Deseret
9252 sc ; Ethi      ; Ethiopic
9253 sc ; Geor      ; Georgian
9254 sc ; Goth      ; Gothic
9255 sc ; Grek      ; Greek
9256 sc ; Gujr      ; Gujarati
9257 sc ; Guru      ; Gurmukhi
9258 sc ; Hang      ; Hangul
9259 sc ; Hani      ; Han
9260 sc ; Hebr      ; Hebrew
9261 sc ; Hira      ; Hiragana
9262 sc ; Ital      ; Old_Italic
9263 sc ; Kana      ; Katakana
9264 sc ; Khmr      ; Khmer
9265 sc ; Knda      ; Kannada
9266 sc ; Laoo      ; Lao
9267 sc ; Latn      ; Latin
9268 sc ; Mlym      ; Malayalam
9269 sc ; Mong      ; Mongolian
9270 sc ; Mymr      ; Myanmar
9271 sc ; Ogam      ; Ogham
9272 sc ; Orya      ; Oriya
9273 sc ; Qaai      ; Inherited
9274 sc ; Runr      ; Runic
9275 sc ; Sinh      ; Sinhala
9276 sc ; Syrc      ; Syriac
9277 sc ; Taml      ; Tamil
9278 sc ; Telu      ; Telugu
9279 sc ; Thaa      ; Thaana
9280 sc ; Thai      ; Thai
9281 sc ; Tibt      ; Tibetan
9282 sc ; Yiii      ; Yi
9283 sc ; Zyyy      ; Common
9284 END
9285     }
9286
9287     if ($v_version ge v2.0.0) {
9288         push @return, split /\n/, <<'END';
9289 dt ; com       ; compat
9290 dt ; nar       ; narrow
9291 dt ; sml       ; small
9292 dt ; vert      ; vertical
9293 dt ; wide      ; wide
9294
9295 gc ; Cf        ; Format
9296 gc ; Cs        ; Surrogate
9297 gc ; Lt        ; Titlecase_Letter
9298 gc ; Me        ; Enclosing_Mark
9299 gc ; Nl        ; Letter_Number
9300 gc ; Pc        ; Connector_Punctuation
9301 gc ; Sk        ; Modifier_Symbol
9302 END
9303     }
9304     if ($v_version ge v2.1.2) {
9305         push @return, "bc ; S         ; Segment_Separator\n";
9306     }
9307     if ($v_version ge v2.1.5) {
9308         push @return, split /\n/, <<'END';
9309 gc ; Pf        ; Final_Punctuation
9310 gc ; Pi        ; Initial_Punctuation
9311 END
9312     }
9313     if ($v_version ge v2.1.8) {
9314         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
9315     }
9316
9317     if ($v_version ge v3.0.0) {
9318         push @return, split /\n/, <<'END';
9319 bc ; AL        ; Arabic_Letter
9320 bc ; BN        ; Boundary_Neutral
9321 bc ; LRE       ; Left_To_Right_Embedding
9322 bc ; LRO       ; Left_To_Right_Override
9323 bc ; NSM       ; Nonspacing_Mark
9324 bc ; PDF       ; Pop_Directional_Format
9325 bc ; RLE       ; Right_To_Left_Embedding
9326 bc ; RLO       ; Right_To_Left_Override
9327
9328 ccc; 233; DB   ; Double_Below
9329 END
9330     }
9331
9332     if ($v_version ge v3.1.0) {
9333         push @return, "ccc; 226; R    ; Right\n";
9334     }
9335
9336     return @return;
9337 }
9338
9339 sub output_perl_charnames_line ($$) {
9340
9341     # Output the entries in Perl_charnames specially, using 5 digits instead
9342     # of four.  This makes the entries a constant length, and simplifies
9343     # charnames.pm which this table is for.  Unicode can have 6 digit
9344     # ordinals, but they are all private use or noncharacters which do not
9345     # have names, so won't be in this table.
9346
9347     return sprintf "%05X\t%s\n", $_[0], $_[1];
9348 }
9349
9350 { # Closure
9351     # This is used to store the range list of all the code points usable when
9352     # the little used $compare_versions feature is enabled.
9353     my $compare_versions_range_list;
9354
9355     # These are constants to the $property_info hash in this subroutine, to
9356     # avoid using a quoted-string which might have a typo.
9357     my $TYPE  = 'type';
9358     my $DEFAULT_MAP = 'default_map';
9359     my $DEFAULT_TABLE = 'default_table';
9360     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9361     my $MISSINGS = 'missings';
9362
9363     sub process_generic_property_file {
9364         # This processes a file containing property mappings and puts them
9365         # into internal map tables.  It should be used to handle any property
9366         # files that have mappings from a code point or range thereof to
9367         # something else.  This means almost all the UCD .txt files.
9368         # each_line_handlers() should be set to adjust the lines of these
9369         # files, if necessary, to what this routine understands:
9370         #
9371         # 0374          ; NFD_QC; N
9372         # 003C..003E    ; Math
9373         #
9374         # the fields are: "codepoint-range ; property; map"
9375         #
9376         # meaning the codepoints in the range all have the value 'map' under
9377         # 'property'.
9378         # Beginning and trailing white space in each field are not significant.
9379         # Note there is not a trailing semi-colon in the above.  A trailing
9380         # semi-colon means the map is a null-string.  An omitted map, as
9381         # opposed to a null-string, is assumed to be 'Y', based on Unicode
9382         # table syntax.  (This could have been hidden from this routine by
9383         # doing it in the $file object, but that would require parsing of the
9384         # line there, so would have to parse it twice, or change the interface
9385         # to pass this an array.  So not done.)
9386         #
9387         # The map field may begin with a sequence of commands that apply to
9388         # this range.  Each such command begins and ends with $CMD_DELIM.
9389         # These are used to indicate, for example, that the mapping for a
9390         # range has a non-default type.
9391         #
9392         # This loops through the file, calling it's next_line() method, and
9393         # then taking the map and adding it to the property's table.
9394         # Complications arise because any number of properties can be in the
9395         # file, in any order, interspersed in any way.  The first time a
9396         # property is seen, it gets information about that property and
9397         # caches it for quick retrieval later.  It also normalizes the maps
9398         # so that only one of many synonyms is stored.  The Unicode input
9399         # files do use some multiple synonyms.
9400
9401         my $file = shift;
9402         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9403
9404         my %property_info;               # To keep track of what properties
9405                                          # have already had entries in the
9406                                          # current file, and info about each,
9407                                          # so don't have to recompute.
9408         my $property_name;               # property currently being worked on
9409         my $property_type;               # and its type
9410         my $previous_property_name = ""; # name from last time through loop
9411         my $property_object;             # pointer to the current property's
9412                                          # object
9413         my $property_addr;               # the address of that object
9414         my $default_map;                 # the string that code points missing
9415                                          # from the file map to
9416         my $default_table;               # For non-string properties, a
9417                                          # reference to the match table that
9418                                          # will contain the list of code
9419                                          # points that map to $default_map.
9420
9421         # Get the next real non-comment line
9422         LINE:
9423         while ($file->next_line) {
9424
9425             # Default replacement type; means that if parts of the range have
9426             # already been stored in our tables, the new map overrides them if
9427             # they differ more than cosmetically
9428             my $replace = $IF_NOT_EQUIVALENT;
9429             my $map_type;            # Default type for the map of this range
9430
9431             #local $to_trace = 1 if main::DEBUG;
9432             trace $_ if main::DEBUG && $to_trace;
9433
9434             # Split the line into components
9435             my ($range, $property_name, $map, @remainder)
9436                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9437
9438             # If more or less on the line than we are expecting, warn and skip
9439             # the line
9440             if (@remainder) {
9441                 $file->carp_bad_line('Extra fields');
9442                 next LINE;
9443             }
9444             elsif ( ! defined $property_name) {
9445                 $file->carp_bad_line('Missing property');
9446                 next LINE;
9447             }
9448
9449             # Examine the range.
9450             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9451             {
9452                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9453                 next LINE;
9454             }
9455             my $low = hex $1;
9456             my $high = (defined $2) ? hex $2 : $low;
9457
9458             # For the very specialized case of comparing two Unicode
9459             # versions...
9460             if (DEBUG && $compare_versions) {
9461                 if ($property_name eq 'Age') {
9462
9463                     # Only allow code points at least as old as the version
9464                     # specified.
9465                     my $age = pack "C*", split(/\./, $map);        # v string
9466                     next LINE if $age gt $compare_versions;
9467                 }
9468                 else {
9469
9470                     # Again, we throw out code points younger than those of
9471                     # the specified version.  By now, the Age property is
9472                     # populated.  We use the intersection of each input range
9473                     # with this property to find what code points in it are
9474                     # valid.   To do the intersection, we have to convert the
9475                     # Age property map to a Range_list.  We only have to do
9476                     # this once.
9477                     if (! defined $compare_versions_range_list) {
9478                         my $age = property_ref('Age');
9479                         if (! -e 'DAge.txt') {
9480                             croak "Need to have 'DAge.txt' file to do version comparison";
9481                         }
9482                         elsif ($age->count == 0) {
9483                             croak "The 'Age' table is empty, but its file exists";
9484                         }
9485                         $compare_versions_range_list
9486                                         = Range_List->new(Initialize => $age);
9487                     }
9488
9489                     # An undefined map is always 'Y'
9490                     $map = 'Y' if ! defined $map;
9491
9492                     # Calculate the intersection of the input range with the
9493                     # code points that are known in the specified version
9494                     my @ranges = ($compare_versions_range_list
9495                                   & Range->new($low, $high))->ranges;
9496
9497                     # If the intersection is empty, throw away this range
9498                     next LINE unless @ranges;
9499
9500                     # Only examine the first range this time through the loop.
9501                     my $this_range = shift @ranges;
9502
9503                     # Put any remaining ranges in the queue to be processed
9504                     # later.  Note that there is unnecessary work here, as we
9505                     # will do the intersection again for each of these ranges
9506                     # during some future iteration of the LINE loop, but this
9507                     # code is not used in production.  The later intersections
9508                     # are guaranteed to not splinter, so this will not become
9509                     # an infinite loop.
9510                     my $line = join ';', $property_name, $map;
9511                     foreach my $range (@ranges) {
9512                         $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9513                                                             $range->start,
9514                                                             $range->end,
9515                                                             $line));
9516                     }
9517
9518                     # And process the first range, like any other.
9519                     $low = $this_range->start;
9520                     $high = $this_range->end;
9521                 }
9522             } # End of $compare_versions
9523
9524             # If changing to a new property, get the things constant per
9525             # property
9526             if ($previous_property_name ne $property_name) {
9527
9528                 $property_object = property_ref($property_name);
9529                 if (! defined $property_object) {
9530                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
9531                     next LINE;
9532                 }
9533                 { no overloading; $property_addr = pack 'J', $property_object; }
9534
9535                 # Defer changing names until have a line that is acceptable
9536                 # (the 'next' statement above means is unacceptable)
9537                 $previous_property_name = $property_name;
9538
9539                 # If not the first time for this property, retrieve info about
9540                 # it from the cache
9541                 if (defined ($property_info{$property_addr}{$TYPE})) {
9542                     $property_type = $property_info{$property_addr}{$TYPE};
9543                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
9544                     $map_type
9545                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
9546                     $default_table
9547                             = $property_info{$property_addr}{$DEFAULT_TABLE};
9548                 }
9549                 else {
9550
9551                     # Here, is the first time for this property.  Set up the
9552                     # cache.
9553                     $property_type = $property_info{$property_addr}{$TYPE}
9554                                    = $property_object->type;
9555                     $map_type
9556                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
9557                         = $property_object->pseudo_map_type;
9558
9559                     # The Unicode files are set up so that if the map is not
9560                     # defined, it is a binary property
9561                     if (! defined $map && $property_type != $BINARY) {
9562                         if ($property_type != $UNKNOWN
9563                             && $property_type != $NON_STRING)
9564                         {
9565                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
9566                         }
9567                         else {
9568                             $property_object->set_type($BINARY);
9569                             $property_type
9570                                 = $property_info{$property_addr}{$TYPE}
9571                                 = $BINARY;
9572                         }
9573                     }
9574
9575                     # Get any @missings default for this property.  This
9576                     # should precede the first entry for the property in the
9577                     # input file, and is located in a comment that has been
9578                     # stored by the Input_file class until we access it here.
9579                     # It's possible that there is more than one such line
9580                     # waiting for us; collect them all, and parse
9581                     my @missings_list = $file->get_missings
9582                                             if $file->has_missings_defaults;
9583                     foreach my $default_ref (@missings_list) {
9584                         my $default = $default_ref->[0];
9585                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
9586
9587                         # For string properties, the default is just what the
9588                         # file says, but non-string properties should already
9589                         # have set up a table for the default property value;
9590                         # use the table for these, so can resolve synonyms
9591                         # later to a single standard one.
9592                         if ($property_type == $STRING
9593                             || $property_type == $UNKNOWN)
9594                         {
9595                             $property_info{$addr}{$MISSINGS} = $default;
9596                         }
9597                         else {
9598                             $property_info{$addr}{$MISSINGS}
9599                                         = $property_object->table($default);
9600                         }
9601                     }
9602
9603                     # Finished storing all the @missings defaults in the input
9604                     # file so far.  Get the one for the current property.
9605                     my $missings = $property_info{$property_addr}{$MISSINGS};
9606
9607                     # But we likely have separately stored what the default
9608                     # should be.  (This is to accommodate versions of the
9609                     # standard where the @missings lines are absent or
9610                     # incomplete.)  Hopefully the two will match.  But check
9611                     # it out.
9612                     $default_map = $property_object->default_map;
9613
9614                     # If the map is a ref, it means that the default won't be
9615                     # processed until later, so undef it, so next few lines
9616                     # will redefine it to something that nothing will match
9617                     undef $default_map if ref $default_map;
9618
9619                     # Create a $default_map if don't have one; maybe a dummy
9620                     # that won't match anything.
9621                     if (! defined $default_map) {
9622
9623                         # Use any @missings line in the file.
9624                         if (defined $missings) {
9625                             if (ref $missings) {
9626                                 $default_map = $missings->full_name;
9627                                 $default_table = $missings;
9628                             }
9629                             else {
9630                                 $default_map = $missings;
9631                             }
9632
9633                             # And store it with the property for outside use.
9634                             $property_object->set_default_map($default_map);
9635                         }
9636                         else {
9637
9638                             # Neither an @missings nor a default map.  Create
9639                             # a dummy one, so won't have to test definedness
9640                             # in the main loop.
9641                             $default_map = '_Perl This will never be in a file
9642                                             from Unicode';
9643                         }
9644                     }
9645
9646                     # Here, we have $default_map defined, possibly in terms of
9647                     # $missings, but maybe not, and possibly is a dummy one.
9648                     if (defined $missings) {
9649
9650                         # Make sure there is no conflict between the two.
9651                         # $missings has priority.
9652                         if (ref $missings) {
9653                             $default_table
9654                                         = $property_object->table($default_map);
9655                             if (! defined $default_table
9656                                 || $default_table != $missings)
9657                             {
9658                                 if (! defined $default_table) {
9659                                     $default_table = $UNDEF;
9660                                 }
9661                                 $file->carp_bad_line(<<END
9662 The \@missings line for $property_name in $file says that missings default to
9663 $missings, but we expect it to be $default_table.  $missings used.
9664 END
9665                                 );
9666                                 $default_table = $missings;
9667                                 $default_map = $missings->full_name;
9668                             }
9669                             $property_info{$property_addr}{$DEFAULT_TABLE}
9670                                                         = $default_table;
9671                         }
9672                         elsif ($default_map ne $missings) {
9673                             $file->carp_bad_line(<<END
9674 The \@missings line for $property_name in $file says that missings default to
9675 $missings, but we expect it to be $default_map.  $missings used.
9676 END
9677                             );
9678                             $default_map = $missings;
9679                         }
9680                     }
9681
9682                     $property_info{$property_addr}{$DEFAULT_MAP}
9683                                                     = $default_map;
9684
9685                     # If haven't done so already, find the table corresponding
9686                     # to this map for non-string properties.
9687                     if (! defined $default_table
9688                         && $property_type != $STRING
9689                         && $property_type != $UNKNOWN)
9690                     {
9691                         $default_table = $property_info{$property_addr}
9692                                                         {$DEFAULT_TABLE}
9693                                     = $property_object->table($default_map);
9694                     }
9695                 } # End of is first time for this property
9696             } # End of switching properties.
9697
9698             # Ready to process the line.
9699             # The Unicode files are set up so that if the map is not defined,
9700             # it is a binary property with value 'Y'
9701             if (! defined $map) {
9702                 $map = 'Y';
9703             }
9704             else {
9705
9706                 # If the map begins with a special command to us (enclosed in
9707                 # delimiters), extract the command(s).
9708                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9709                     my $command = $1;
9710                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
9711                         $replace = $1;
9712                     }
9713                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
9714                         $map_type = $1;
9715                     }
9716                     else {
9717                         $file->carp_bad_line("Unknown command line: '$1'");
9718                         next LINE;
9719                     }
9720                 }
9721             }
9722
9723             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9724             {
9725
9726                 # Here, we have a map to a particular code point, and the
9727                 # default map is to a code point itself.  If the range
9728                 # includes the particular code point, change that portion of
9729                 # the range to the default.  This makes sure that in the final
9730                 # table only the non-defaults are listed.
9731                 my $decimal_map = hex $map;
9732                 if ($low <= $decimal_map && $decimal_map <= $high) {
9733
9734                     # If the range includes stuff before or after the map
9735                     # we're changing, split it and process the split-off parts
9736                     # later.
9737                     if ($low < $decimal_map) {
9738                         $file->insert_adjusted_lines(
9739                                             sprintf("%04X..%04X; %s; %s",
9740                                                     $low,
9741                                                     $decimal_map - 1,
9742                                                     $property_name,
9743                                                     $map));
9744                     }
9745                     if ($high > $decimal_map) {
9746                         $file->insert_adjusted_lines(
9747                                             sprintf("%04X..%04X; %s; %s",
9748                                                     $decimal_map + 1,
9749                                                     $high,
9750                                                     $property_name,
9751                                                     $map));
9752                     }
9753                     $low = $high = $decimal_map;
9754                     $map = $CODE_POINT;
9755                 }
9756             }
9757
9758             # If we can tell that this is a synonym for the default map, use
9759             # the default one instead.
9760             if ($property_type != $STRING
9761                 && $property_type != $UNKNOWN)
9762             {
9763                 my $table = $property_object->table($map);
9764                 if (defined $table && $table == $default_table) {
9765                     $map = $default_map;
9766                 }
9767             }
9768
9769             # And figure out the map type if not known.
9770             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9771                 if ($map eq "") {   # Nulls are always $NULL map type
9772                     $map_type = $NULL;
9773                 } # Otherwise, non-strings, and those that don't allow
9774                   # $MULTI_CP, and those that aren't multiple code points are
9775                   # 0
9776                 elsif
9777                    (($property_type != $STRING && $property_type != $UNKNOWN)
9778                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9779                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
9780                 {
9781                     $map_type = 0;
9782                 }
9783                 else {
9784                     $map_type = $MULTI_CP;
9785                 }
9786             }
9787
9788             $property_object->add_map($low, $high,
9789                                         $map,
9790                                         Type => $map_type,
9791                                         Replace => $replace);
9792         } # End of loop through file's lines
9793
9794         return;
9795     }
9796 }
9797
9798 { # Closure for UnicodeData.txt handling
9799
9800     # This file was the first one in the UCD; its design leads to some
9801     # awkwardness in processing.  Here is a sample line:
9802     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9803     # The fields in order are:
9804     my $i = 0;            # The code point is in field 0, and is shifted off.
9805     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
9806     my $CATEGORY = $i++;  # category (e.g. "Lu")
9807     my $CCC = $i++;       # Canonical combining class (e.g. "230")
9808     my $BIDI = $i++;      # directional class (e.g. "L")
9809     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
9810     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
9811     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9812                                          # Dual-use in this program; see below
9813     my $NUMERIC = $i++;   # numeric value
9814     my $MIRRORED = $i++;  # ? mirrored
9815     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9816     my $COMMENT = $i++;   # iso comment
9817     my $UPPER = $i++;     # simple uppercase mapping
9818     my $LOWER = $i++;     # simple lowercase mapping
9819     my $TITLE = $i++;     # simple titlecase mapping
9820     my $input_field_count = $i;
9821
9822     # This routine in addition outputs these extra fields:
9823     my $DECOMP_TYPE = $i++; # Decomposition type
9824
9825     # These fields are modifications of ones above, and are usually
9826     # suppressed; they must come last, as for speed, the loop upper bound is
9827     # normally set to ignore them
9828     my $NAME = $i++;        # This is the strict name field, not the one that
9829                             # charnames uses.
9830     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
9831                             # by Unicode::Normalize
9832     my $last_field = $i - 1;
9833
9834     # All these are read into an array for each line, with the indices defined
9835     # above.  The empty fields in the example line above indicate that the
9836     # value is defaulted.  The handler called for each line of the input
9837     # changes these to their defaults.
9838
9839     # Here are the official names of the properties, in a parallel array:
9840     my @field_names;
9841     $field_names[$BIDI] = 'Bidi_Class';
9842     $field_names[$CATEGORY] = 'General_Category';
9843     $field_names[$CCC] = 'Canonical_Combining_Class';
9844     $field_names[$CHARNAME] = 'Perl_Charnames';
9845     $field_names[$COMMENT] = 'ISO_Comment';
9846     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9847     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
9848     $field_names[$LOWER] = 'Lowercase_Mapping';
9849     $field_names[$MIRRORED] = 'Bidi_Mirrored';
9850     $field_names[$NAME] = 'Name';
9851     $field_names[$NUMERIC] = 'Numeric_Value';
9852     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9853     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9854     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
9855     $field_names[$TITLE] = 'Titlecase_Mapping';
9856     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
9857     $field_names[$UPPER] = 'Uppercase_Mapping';
9858
9859     # Some of these need a little more explanation:
9860     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9861     #   property, but is used in calculating the Numeric_Type.  Perl however,
9862     #   creates a file from this field, so a Perl property is created from it.
9863     # Similarly, the Other_Digit field is used only for calculating the
9864     #   Numeric_Type, and so it can be safely re-used as the place to store
9865     #   the value for Numeric_Type; hence it is referred to as
9866     #   $NUMERIC_TYPE_OTHER_DIGIT.
9867     # The input field named $PERL_DECOMPOSITION is a combination of both the
9868     #   decomposition mapping and its type.  Perl creates a file containing
9869     #   exactly this field, so it is used for that.  The two properties are
9870     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9871     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
9872     #   output it), as Perl doesn't use it directly.
9873     # The input field named here $CHARNAME is used to construct the
9874     #   Perl_Charnames property, which is a combination of the Name property
9875     #   (which the input field contains), and the Unicode_1_Name property, and
9876     #   others from other files.  Since, the strict Name property is not used
9877     #   by Perl, this field is used for the table that Perl does use.  The
9878     #   strict Name property table is usually suppressed (unless the lists are
9879     #   changed to output it), so it is accumulated in a separate field,
9880     #   $NAME, which to save time is discarded unless the table is actually to
9881     #   be output
9882
9883     # This file is processed like most in this program.  Control is passed to
9884     # process_generic_property_file() which calls filter_UnicodeData_line()
9885     # for each input line.  This filter converts the input into line(s) that
9886     # process_generic_property_file() understands.  There is also a setup
9887     # routine called before any of the file is processed, and a handler for
9888     # EOF processing, all in this closure.
9889
9890     # A huge speed-up occurred at the cost of some added complexity when these
9891     # routines were altered to buffer the outputs into ranges.  Almost all the
9892     # lines of the input file apply to just one code point, and for most
9893     # properties, the map for the next code point up is the same as the
9894     # current one.  So instead of creating a line for each property for each
9895     # input line, filter_UnicodeData_line() remembers what the previous map
9896     # of a property was, and doesn't generate a line to pass on until it has
9897     # to, as when the map changes; and that passed-on line encompasses the
9898     # whole contiguous range of code points that have the same map for that
9899     # property.  This means a slight amount of extra setup, and having to
9900     # flush these buffers on EOF, testing if the maps have changed, plus
9901     # remembering state information in the closure.  But it means a lot less
9902     # real time in not having to change the data base for each property on
9903     # each line.
9904
9905     # Another complication is that there are already a few ranges designated
9906     # in the input.  There are two lines for each, with the same maps except
9907     # the code point and name on each line.  This was actually the hardest
9908     # thing to design around.  The code points in those ranges may actually
9909     # have real maps not given by these two lines.  These maps will either
9910     # be algorithmically determinable, or be in the extracted files furnished
9911     # with the UCD.  In the event of conflicts between these extracted files,
9912     # and this one, Unicode says that this one prevails.  But it shouldn't
9913     # prevail for conflicts that occur in these ranges.  The data from the
9914     # extracted files prevails in those cases.  So, this program is structured
9915     # so that those files are processed first, storing maps.  Then the other
9916     # files are processed, generally overwriting what the extracted files
9917     # stored.  But just the range lines in this input file are processed
9918     # without overwriting.  This is accomplished by adding a special string to
9919     # the lines output to tell process_generic_property_file() to turn off the
9920     # overwriting for just this one line.
9921     # A similar mechanism is used to tell it that the map is of a non-default
9922     # type.
9923
9924     sub setup_UnicodeData { # Called before any lines of the input are read
9925         my $file = shift;
9926         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9927
9928         # Create a new property specially located that is a combination of the
9929         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9930         # Name_Alias properties.  (The final duplicates elements of the
9931         # first.)  A comment for it will later be constructed based on the
9932         # actual properties present and used
9933         $perl_charname = Property->new('Perl_Charnames',
9934                        Default_Map => "",
9935                        Directory => File::Spec->curdir(),
9936                        File => 'Name',
9937                        Fate => $INTERNAL_ONLY,
9938                        Perl_Extension => 1,
9939                        Range_Size_1 => \&output_perl_charnames_line,
9940                        Type => $STRING,
9941                        );
9942         $perl_charname->set_proxy_for('Name', 'Name_Alias');
9943
9944         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
9945                                         Directory => File::Spec->curdir(),
9946                                         File => 'Decomposition',
9947                                         Format => $DECOMP_STRING_FORMAT,
9948                                         Fate => $INTERNAL_ONLY,
9949                                         Perl_Extension => 1,
9950                                         Default_Map => $CODE_POINT,
9951
9952                                         # normalize.pm can't cope with these
9953                                         Output_Range_Counts => 0,
9954
9955                                         # This is a specially formatted table
9956                                         # explicitly for normalize.pm, which
9957                                         # is expecting a particular format,
9958                                         # which means that mappings containing
9959                                         # multiple code points are in the main
9960                                         # body of the table
9961                                         Map_Type => $COMPUTE_NO_MULTI_CP,
9962                                         Type => $STRING,
9963                                         );
9964         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
9965         $Perl_decomp->add_comment(join_lines(<<END
9966 This mapping is a combination of the Unicode 'Decomposition_Type' and
9967 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
9968 identical to the official Unicode 'Decomposition_Mapping' property except for
9969 two things:
9970  1) It omits the algorithmically determinable Hangul syllable decompositions,
9971 which normalize.pm handles algorithmically.
9972  2) It contains the decomposition type as well.  Non-canonical decompositions
9973 begin with a word in angle brackets, like <super>, which denotes the
9974 compatible decomposition type.  If the map does not begin with the <angle
9975 brackets>, the decomposition is canonical.
9976 END
9977         ));
9978
9979         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9980                                         Default_Map => "",
9981                                         Perl_Extension => 1,
9982                                         File => 'Digit',    # Trad. location
9983                                         Directory => $map_directory,
9984                                         Type => $STRING,
9985                                         Range_Size_1 => 1,
9986                                         );
9987         $Decimal_Digit->add_comment(join_lines(<<END
9988 This file gives the mapping of all code points which represent a single
9989 decimal digit [0-9] to their respective digits.  For example, the code point
9990 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
9991 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9992 numerals.
9993 END
9994         ));
9995
9996         # These properties are not used for generating anything else, and are
9997         # usually not output.  By making them last in the list, we can just
9998         # change the high end of the loop downwards to avoid the work of
9999         # generating a table(s) that is/are just going to get thrown away.
10000         if (! property_ref('Decomposition_Mapping')->to_output_map
10001             && ! property_ref('Name')->to_output_map)
10002         {
10003             $last_field = min($NAME, $DECOMP_MAP) - 1;
10004         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10005             $last_field = $DECOMP_MAP;
10006         } elsif (property_ref('Name')->to_output_map) {
10007             $last_field = $NAME;
10008         }
10009         return;
10010     }
10011
10012     my $first_time = 1;                 # ? Is this the first line of the file
10013     my $in_range = 0;                   # ? Are we in one of the file's ranges
10014     my $previous_cp;                    # hex code point of previous line
10015     my $decimal_previous_cp = -1;       # And its decimal equivalent
10016     my @start;                          # For each field, the current starting
10017                                         # code point in hex for the range
10018                                         # being accumulated.
10019     my @fields;                         # The input fields;
10020     my @previous_fields;                # And those from the previous call
10021
10022     sub filter_UnicodeData_line {
10023         # Handle a single input line from UnicodeData.txt; see comments above
10024         # Conceptually this takes a single line from the file containing N
10025         # properties, and converts it into N lines with one property per line,
10026         # which is what the final handler expects.  But there are
10027         # complications due to the quirkiness of the input file, and to save
10028         # time, it accumulates ranges where the property values don't change
10029         # and only emits lines when necessary.  This is about an order of
10030         # magnitude fewer lines emitted.
10031
10032         my $file = shift;
10033         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10034
10035         # $_ contains the input line.
10036         # -1 in split means retain trailing null fields
10037         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10038
10039         #local $to_trace = 1 if main::DEBUG;
10040         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10041         if (@fields > $input_field_count) {
10042             $file->carp_bad_line('Extra fields');
10043             $_ = "";
10044             return;
10045         }
10046
10047         my $decimal_cp = hex $cp;
10048
10049         # We have to output all the buffered ranges when the next code point
10050         # is not exactly one after the previous one, which means there is a
10051         # gap in the ranges.
10052         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10053
10054         # The decomposition mapping field requires special handling.  It looks
10055         # like either:
10056         #
10057         # <compat> 0032 0020
10058         # 0041 0300
10059         #
10060         # The decomposition type is enclosed in <brackets>; if missing, it
10061         # means the type is canonical.  There are two decomposition mapping
10062         # tables: the one for use by Perl's normalize.pm has a special format
10063         # which is this field intact; the other, for general use is of
10064         # standard format.  In either case we have to find the decomposition
10065         # type.  Empty fields have None as their type, and map to the code
10066         # point itself
10067         if ($fields[$PERL_DECOMPOSITION] eq "") {
10068             $fields[$DECOMP_TYPE] = 'None';
10069             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10070         }
10071         else {
10072             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10073                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
10074             if (! defined $fields[$DECOMP_TYPE]) {
10075                 $fields[$DECOMP_TYPE] = 'Canonical';
10076                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10077             }
10078             else {
10079                 $fields[$DECOMP_MAP] = $map;
10080             }
10081         }
10082
10083         # The 3 numeric fields also require special handling.  The 2 digit
10084         # fields must be either empty or match the number field.  This means
10085         # that if it is empty, they must be as well, and the numeric type is
10086         # None, and the numeric value is 'Nan'.
10087         # The decimal digit field must be empty or match the other digit
10088         # field.  If the decimal digit field is non-empty, the code point is
10089         # a decimal digit, and the other two fields will have the same value.
10090         # If it is empty, but the other digit field is non-empty, the code
10091         # point is an 'other digit', and the number field will have the same
10092         # value as the other digit field.  If the other digit field is empty,
10093         # but the number field is non-empty, the code point is a generic
10094         # numeric type.
10095         if ($fields[$NUMERIC] eq "") {
10096             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10097                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10098             ) {
10099                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
10100             }
10101             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10102             $fields[$NUMERIC] = 'NaN';
10103         }
10104         else {
10105             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10106             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10107                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10108                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10109             }
10110             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10111                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10112                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10113             }
10114             else {
10115                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10116
10117                 # Rationals require extra effort.
10118                 register_fraction($fields[$NUMERIC])
10119                                                 if $fields[$NUMERIC] =~ qr{/};
10120             }
10121         }
10122
10123         # For the properties that have empty fields in the file, and which
10124         # mean something different from empty, change them to that default.
10125         # Certain fields just haven't been empty so far in any Unicode
10126         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10127         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
10128         # the defaults; which are very unlikely to ever change.
10129         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10130         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10131
10132         # UAX44 says that if title is empty, it is the same as whatever upper
10133         # is,
10134         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10135
10136         # There are a few pairs of lines like:
10137         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10138         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10139         # that define ranges.  These should be processed after the fields are
10140         # adjusted above, as they may override some of them; but mostly what
10141         # is left is to possibly adjust the $CHARNAME field.  The names of all the
10142         # paired lines start with a '<', but this is also true of '<control>,
10143         # which isn't one of these special ones.
10144         if ($fields[$CHARNAME] eq '<control>') {
10145
10146             # Some code points in this file have the pseudo-name
10147             # '<control>', but the official name for such ones is the null
10148             # string.  For charnames.pm, we use the Unicode version 1 name
10149             $fields[$NAME] = "";
10150             $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
10151
10152             # We had better not be in between range lines.
10153             if ($in_range) {
10154                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10155                 $in_range = 0;
10156             }
10157         }
10158         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10159
10160             # Here is a non-range line.  We had better not be in between range
10161             # lines.
10162             if ($in_range) {
10163                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
10164                 $in_range = 0;
10165             }
10166             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10167
10168                 # These are code points whose names end in their code points,
10169                 # which means the names are algorithmically derivable from the
10170                 # code points.  To shorten the output Name file, the algorithm
10171                 # for deriving these is placed in the file instead of each
10172                 # code point, so they have map type $CP_IN_NAME
10173                 $fields[$CHARNAME] = $CMD_DELIM
10174                                  . $MAP_TYPE_CMD
10175                                  . '='
10176                                  . $CP_IN_NAME
10177                                  . $CMD_DELIM
10178                                  . $fields[$CHARNAME];
10179             }
10180             $fields[$NAME] = $fields[$CHARNAME];
10181         }
10182         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10183             $fields[$CHARNAME] = $fields[$NAME] = $1;
10184
10185             # Here we are at the beginning of a range pair.
10186             if ($in_range) {
10187                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
10188             }
10189             $in_range = 1;
10190
10191             # Because the properties in the range do not overwrite any already
10192             # in the db, we must flush the buffers of what's already there, so
10193             # they get handled in the normal scheme.
10194             $force_output = 1;
10195
10196         }
10197         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10198             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
10199             $_ = "";
10200             return;
10201         }
10202         else { # Here, we are at the last line of a range pair.
10203
10204             if (! $in_range) {
10205                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
10206                 $_ = "";
10207                 return;
10208             }
10209             $in_range = 0;
10210
10211             $fields[$NAME] = $fields[$CHARNAME];
10212
10213             # Check that the input is valid: that the closing of the range is
10214             # the same as the beginning.
10215             foreach my $i (0 .. $last_field) {
10216                 next if $fields[$i] eq $previous_fields[$i];
10217                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
10218             }
10219
10220             # The processing differs depending on the type of range,
10221             # determined by its $CHARNAME
10222             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10223
10224                 # Check that the data looks right.
10225                 if ($decimal_previous_cp != $SBase) {
10226                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
10227                 }
10228                 if ($decimal_cp != $SBase + $SCount - 1) {
10229                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
10230                 }
10231
10232                 # The Hangul syllable range has a somewhat complicated name
10233                 # generation algorithm.  Each code point in it has a canonical
10234                 # decomposition also computable by an algorithm.  The
10235                 # perl decomposition map table built from these is used only
10236                 # by normalize.pm, which has the algorithm built in it, so the
10237                 # decomposition maps are not needed, and are large, so are
10238                 # omitted from it.  If the full decomposition map table is to
10239                 # be output, the decompositions are generated for it, in the
10240                 # EOF handling code for this input file.
10241
10242                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10243
10244                 # This range is stored in our internal structure with its
10245                 # own map type, different from all others.
10246                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10247                                         = $CMD_DELIM
10248                                           . $MAP_TYPE_CMD
10249                                           . '='
10250                                           . $HANGUL_SYLLABLE
10251                                           . $CMD_DELIM
10252                                           . $fields[$CHARNAME];
10253             }
10254             elsif ($fields[$CHARNAME] =~ /^CJK/) {
10255
10256                 # The name for these contains the code point itself, and all
10257                 # are defined to have the same base name, regardless of what
10258                 # is in the file.  They are stored in our internal structure
10259                 # with a map type of $CP_IN_NAME
10260                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10261                                         = $CMD_DELIM
10262                                            . $MAP_TYPE_CMD
10263                                            . '='
10264                                            . $CP_IN_NAME
10265                                            . $CMD_DELIM
10266                                            . 'CJK UNIFIED IDEOGRAPH';
10267
10268             }
10269             elsif ($fields[$CATEGORY] eq 'Co'
10270                      || $fields[$CATEGORY] eq 'Cs')
10271             {
10272                 # The names of all the code points in these ranges are set to
10273                 # null, as there are no names for the private use and
10274                 # surrogate code points.
10275
10276                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10277             }
10278             else {
10279                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
10280             }
10281
10282             # The first line of the range caused everything else to be output,
10283             # and then its values were stored as the beginning values for the
10284             # next set of ranges, which this one ends.  Now, for each value,
10285             # add a command to tell the handler that these values should not
10286             # replace any existing ones in our database.
10287             foreach my $i (0 .. $last_field) {
10288                 $previous_fields[$i] = $CMD_DELIM
10289                                         . $REPLACE_CMD
10290                                         . '='
10291                                         . $NO
10292                                         . $CMD_DELIM
10293                                         . $previous_fields[$i];
10294             }
10295
10296             # And change things so it looks like the entire range has been
10297             # gone through with this being the final part of it.  Adding the
10298             # command above to each field will cause this range to be flushed
10299             # during the next iteration, as it guaranteed that the stored
10300             # field won't match whatever value the next one has.
10301             $previous_cp = $cp;
10302             $decimal_previous_cp = $decimal_cp;
10303
10304             # We are now set up for the next iteration; so skip the remaining
10305             # code in this subroutine that does the same thing, but doesn't
10306             # know about these ranges.
10307             $_ = "";
10308
10309             return;
10310         }
10311
10312         # On the very first line, we fake it so the code below thinks there is
10313         # nothing to output, and initialize so that when it does get output it
10314         # uses the first line's values for the lowest part of the range.
10315         # (One could avoid this by using peek(), but then one would need to
10316         # know the adjustments done above and do the same ones in the setup
10317         # routine; not worth it)
10318         if ($first_time) {
10319             $first_time = 0;
10320             @previous_fields = @fields;
10321             @start = ($cp) x scalar @fields;
10322             $decimal_previous_cp = $decimal_cp - 1;
10323         }
10324
10325         # For each field, output the stored up ranges that this code point
10326         # doesn't fit in.  Earlier we figured out if all ranges should be
10327         # terminated because of changing the replace or map type styles, or if
10328         # there is a gap between this new code point and the previous one, and
10329         # that is stored in $force_output.  But even if those aren't true, we
10330         # need to output the range if this new code point's value for the
10331         # given property doesn't match the stored range's.
10332         #local $to_trace = 1 if main::DEBUG;
10333         foreach my $i (0 .. $last_field) {
10334             my $field = $fields[$i];
10335             if ($force_output || $field ne $previous_fields[$i]) {
10336
10337                 # Flush the buffer of stored values.
10338                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10339
10340                 # Start a new range with this code point and its value
10341                 $start[$i] = $cp;
10342                 $previous_fields[$i] = $field;
10343             }
10344         }
10345
10346         # Set the values for the next time.
10347         $previous_cp = $cp;
10348         $decimal_previous_cp = $decimal_cp;
10349
10350         # The input line has generated whatever adjusted lines are needed, and
10351         # should not be looked at further.
10352         $_ = "";
10353         return;
10354     }
10355
10356     sub EOF_UnicodeData {
10357         # Called upon EOF to flush the buffers, and create the Hangul
10358         # decomposition mappings if needed.
10359
10360         my $file = shift;
10361         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10362
10363         # Flush the buffers.
10364         foreach my $i (1 .. $last_field) {
10365             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10366         }
10367
10368         if (-e 'Jamo.txt') {
10369
10370             # The algorithm is published by Unicode, based on values in
10371             # Jamo.txt, (which should have been processed before this
10372             # subroutine), and the results left in %Jamo
10373             unless (%Jamo) {
10374                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
10375                 return;
10376             }
10377
10378             # If the full decomposition map table is being output, insert
10379             # into it the Hangul syllable mappings.  This is to avoid having
10380             # to publish a subroutine in it to compute them.  (which would
10381             # essentially be this code.)  This uses the algorithm published by
10382             # Unicode.
10383             if (property_ref('Decomposition_Mapping')->to_output_map) {
10384                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10385                     use integer;
10386                     my $SIndex = $S - $SBase;
10387                     my $L = $LBase + $SIndex / $NCount;
10388                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
10389                     my $T = $TBase + $SIndex % $TCount;
10390
10391                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10392                     my $decomposition = sprintf("%04X %04X", $L, $V);
10393                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10394                     $file->insert_adjusted_lines(
10395                                 sprintf("%04X; Decomposition_Mapping; %s",
10396                                         $S,
10397                                         $decomposition));
10398                 }
10399             }
10400         }
10401
10402         return;
10403     }
10404
10405     sub filter_v1_ucd {
10406         # Fix UCD lines in version 1.  This is probably overkill, but this
10407         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
10408         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
10409         #       removed.  This program retains them
10410         # 2)    didn't include ranges, which it should have, and which are now
10411         #       added in @corrected_lines below.  It was hand populated by
10412         #       taking the data from Version 2, verified by analyzing
10413         #       DAge.txt.
10414         # 3)    There is a syntax error in the entry for U+09F8 which could
10415         #       cause problems for utf8_heavy, and so is changed.  It's
10416         #       numeric value was simply a minus sign, without any number.
10417         #       (Eventually Unicode changed the code point to non-numeric.)
10418         # 4)    The decomposition types often don't match later versions
10419         #       exactly, and the whole syntax of that field is different; so
10420         #       the syntax is changed as well as the types to their later
10421         #       terminology.  Otherwise normalize.pm would be very unhappy
10422         # 5)    Many ccc classes are different.  These are left intact.
10423         # 6)    U+FF10 - U+FF19 are missing their numeric values in all three
10424         #       fields.  These are unchanged because it doesn't really cause
10425         #       problems for Perl.
10426         # 7)    A number of code points, such as controls, don't have their
10427         #       Unicode Version 1 Names in this file.  These are unchanged.
10428
10429         my @corrected_lines = split /\n/, <<'END';
10430 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10431 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10432 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10433 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10434 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10435 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10436 END
10437
10438         my $file = shift;
10439         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10440
10441         #local $to_trace = 1 if main::DEBUG;
10442         trace $_ if main::DEBUG && $to_trace;
10443
10444         # -1 => retain trailing null fields
10445         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10446
10447         # At the first place that is wrong in the input, insert all the
10448         # corrections, replacing the wrong line.
10449         if ($code_point eq '4E00') {
10450             my @copy = @corrected_lines;
10451             $_ = shift @copy;
10452             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10453
10454             $file->insert_lines(@copy);
10455         }
10456
10457
10458         if ($fields[$NUMERIC] eq '-') {
10459             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
10460         }
10461
10462         if  ($fields[$PERL_DECOMPOSITION] ne "") {
10463
10464             # Several entries have this change to superscript 2 or 3 in the
10465             # middle.  Convert these to the modern version, which is to use
10466             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10467             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10468             # 'HHHH HHHH 00B3 HHHH'.
10469             # It turns out that all of these that don't have another
10470             # decomposition defined at the beginning of the line have the
10471             # <square> decomposition in later releases.
10472             if ($code_point ne '00B2' && $code_point ne '00B3') {
10473                 if  ($fields[$PERL_DECOMPOSITION]
10474                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10475                 {
10476                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10477                         $fields[$PERL_DECOMPOSITION] = '<square> '
10478                         . $fields[$PERL_DECOMPOSITION];
10479                     }
10480                 }
10481             }
10482
10483             # If is like '<+circled> 0052 <-circled>', convert to
10484             # '<circled> 0052'
10485             $fields[$PERL_DECOMPOSITION] =~
10486                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10487
10488             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10489             $fields[$PERL_DECOMPOSITION] =~
10490                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10491             or $fields[$PERL_DECOMPOSITION] =~
10492                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10493             or $fields[$PERL_DECOMPOSITION] =~
10494                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10495             or $fields[$PERL_DECOMPOSITION] =~
10496                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10497
10498             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10499             $fields[$PERL_DECOMPOSITION] =~
10500                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10501
10502             # Change names to modern form.
10503             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10504             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10505             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10506             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10507
10508             # One entry has weird braces
10509             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10510         }
10511
10512         $_ = join ';', $code_point, @fields;
10513         trace $_ if main::DEBUG && $to_trace;
10514         return;
10515     }
10516
10517     sub filter_v2_1_5_ucd {
10518         # A dozen entries in this 2.1.5 file had the mirrored and numeric
10519         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
10520         # column appears to be N, swap it back.
10521
10522         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10523         if ($fields[$NUMERIC] eq 'N') {
10524             $fields[$NUMERIC] = $fields[$MIRRORED];
10525             $fields[$MIRRORED] = 'N';
10526             $_ = join ';', $code_point, @fields;
10527         }
10528         return;
10529     }
10530
10531     sub filter_v6_ucd {
10532
10533         # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10534         # accepted that yet to allow for some deprecation cycles.
10535
10536         return if $_ !~ /^(?:0007|1F514|070F);/;
10537
10538         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10539         if ($code_point eq '0007') {
10540             $fields[$CHARNAME] = "";
10541         }
10542         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10543                             # http://www.unicode.org/versions/corrigendum8.html
10544             $fields[$BIDI] = "AL";
10545         }
10546         elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
10547             $fields[$CHARNAME] = "";
10548         }
10549
10550         $_ = join ';', $code_point, @fields;
10551
10552         return;
10553     }
10554 } # End closure for UnicodeData
10555
10556 sub process_GCB_test {
10557
10558     my $file = shift;
10559     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10560
10561     while ($file->next_line) {
10562         push @backslash_X_tests, $_;
10563     }
10564
10565     return;
10566 }
10567
10568 sub process_NamedSequences {
10569     # NamedSequences.txt entries are just added to an array.  Because these
10570     # don't look like the other tables, they have their own handler.
10571     # An example:
10572     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10573     #
10574     # This just adds the sequence to an array for later handling
10575
10576     my $file = shift;
10577     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10578
10579     while ($file->next_line) {
10580         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10581         if (@remainder) {
10582             $file->carp_bad_line(
10583                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10584             next;
10585         }
10586
10587         # Note single \t in keeping with special output format of
10588         # Perl_charnames.  But it turns out that the code points don't have to
10589         # be 5 digits long, like the rest, based on the internal workings of
10590         # charnames.pm.  This could be easily changed for consistency.
10591         push @named_sequences, "$sequence\t$name";
10592     }
10593     return;
10594 }
10595
10596 { # Closure
10597
10598     my $first_range;
10599
10600     sub  filter_early_ea_lb {
10601         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
10602         # third field be the name of the code point, which can be ignored in
10603         # most cases.  But it can be meaningful if it marks a range:
10604         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10605         # 3400;W;<CJK Ideograph Extension A, First>
10606         #
10607         # We need to see the First in the example above to know it's a range.
10608         # They did not use the later range syntaxes.  This routine changes it
10609         # to use the modern syntax.
10610         # $1 is the Input_file object.
10611
10612         my @fields = split /\s*;\s*/;
10613         if ($fields[2] =~ /^<.*, First>/) {
10614             $first_range = $fields[0];
10615             $_ = "";
10616         }
10617         elsif ($fields[2] =~ /^<.*, Last>/) {
10618             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10619         }
10620         else {
10621             undef $first_range;
10622             $_ = "$fields[0]; $fields[1]";
10623         }
10624
10625         return;
10626     }
10627 }
10628
10629 sub filter_old_style_arabic_shaping {
10630     # Early versions used a different term for the later one.
10631
10632     my @fields = split /\s*;\s*/;
10633     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10634     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
10635     $_ = join ';', @fields;
10636     return;
10637 }
10638
10639 sub filter_arabic_shaping_line {
10640     # ArabicShaping.txt has entries that look like:
10641     # 062A; TEH; D; BEH
10642     # The field containing 'TEH' is not used.  The next field is Joining_Type
10643     # and the last is Joining_Group
10644     # This generates two lines to pass on, one for each property on the input
10645     # line.
10646
10647     my $file = shift;
10648     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10649
10650     my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10651
10652     if (@fields > 4) {
10653         $file->carp_bad_line('Extra fields');
10654         $_ = "";
10655         return;
10656     }
10657
10658     $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10659     $_ = "$fields[0]; Joining_Type; $fields[2]";
10660
10661     return;
10662 }
10663
10664 { # Closure
10665     my $lc; # Table for lowercase mapping
10666     my $tc;
10667     my $uc;
10668
10669     sub setup_special_casing {
10670         # SpecialCasing.txt contains the non-simple case change mappings.  The
10671         # simple ones are in UnicodeData.txt, which should already have been
10672         # read in to the full property data structures, so as to initialize
10673         # these with the simple ones.  Then the SpecialCasing.txt entries
10674         # overwrite the ones which have different full mappings.
10675
10676         # This routine sees if the simple mappings are to be output, and if
10677         # so, copies what has already been put into the full mapping tables,
10678         # while they still contain only the simple mappings.
10679
10680         # The reason it is done this way is that the simple mappings are
10681         # probably not going to be output, so it saves work to initialize the
10682         # full tables with the simple mappings, and then overwrite those
10683         # relatively few entries in them that have different full mappings,
10684         # and thus skip the simple mapping tables altogether.
10685
10686         # New tables with just the simple mappings that are overridden by the
10687         # full ones are constructed.  These are for Unicode::UCD, which
10688         # requires the simple mappings.  The Case_Folding table is a combined
10689         # table of both the simple and full mappings, with the full ones being
10690         # in the hash, and the simple ones, even those overridden by the hash,
10691         # being in the base table.  That same mechanism could have been
10692         # employed here, except that the docs have said that the generated
10693         # files are usuable directly by programs, so we dare not change the
10694         # format in any way.
10695
10696         my $file= shift;
10697         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10698
10699         $lc = property_ref('lc');
10700         $tc = property_ref('tc');
10701         $uc = property_ref('uc');
10702
10703         # For each of the case change mappings...
10704         foreach my $case_table ($lc, $tc, $uc) {
10705             my $case = $case_table->name;
10706             my $full = property_ref($case);
10707             unless (defined $full && ! $full->is_empty) {
10708                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
10709             }
10710
10711             # The simple version's name in each mapping merely has an 's' in
10712             # front of the full one's
10713             my $simple_name = 's' . $case;
10714             my $simple = property_ref($simple_name);
10715             $simple->initialize($full) if $simple->to_output_map();
10716
10717             my $simple_only = Property->new("_s$case",
10718                     Type => $STRING,
10719                     Default_Map => $CODE_POINT,
10720                     Perl_Extension => 1,
10721                     Fate => $INTERNAL_ONLY,
10722                     Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
10723             $simple_only->set_to_output_map($INTERNAL_MAP);
10724             $simple_only->add_comment(join_lines( <<END
10725 This file is for UCD.pm so that it can construct simple mappings that would
10726 otherwise be lost because they are overridden by full mappings.
10727 END
10728             ));
10729
10730             unless ($simple->to_output_map()) {
10731                 $simple_only->set_proxy_for($simple_name);
10732             }
10733         }
10734
10735         return;
10736     }
10737
10738     sub filter_special_casing_line {
10739         # Change the format of $_ from SpecialCasing.txt into something that
10740         # the generic handler understands.  Each input line contains three
10741         # case mappings.  This will generate three lines to pass to the
10742         # generic handler for each of those.
10743
10744         # The input syntax (after stripping comments and trailing white space
10745         # is like one of the following (with the final two being entries that
10746         # we ignore):
10747         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10748         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10749         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10750         # Note the trailing semi-colon, unlike many of the input files.  That
10751         # means that there will be an extra null field generated by the split
10752
10753         my $file = shift;
10754         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10755
10756         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10757                                               # fields
10758
10759         # field #4 is when this mapping is conditional.  If any of these get
10760         # implemented, it would be by hard-coding in the casing functions in
10761         # the Perl core, not through tables.  But if there is a new condition
10762         # we don't know about, output a warning.  We know about all the
10763         # conditions through 6.0
10764         if ($fields[4] ne "") {
10765             my @conditions = split ' ', $fields[4];
10766             if ($conditions[0] ne 'tr'  # We know that these languages have
10767                                         # conditions, and some are multiple
10768                 && $conditions[0] ne 'az'
10769                 && $conditions[0] ne 'lt'
10770
10771                 # And, we know about a single condition Final_Sigma, but
10772                 # nothing else.
10773                 && ($v_version gt v5.2.0
10774                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10775             {
10776                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10777             }
10778             elsif ($conditions[0] ne 'Final_Sigma') {
10779
10780                     # Don't print out a message for Final_Sigma, because we
10781                     # have hard-coded handling for it.  (But the standard
10782                     # could change what the rule should be, but it wouldn't
10783                     # show up here anyway.
10784
10785                     print "# SKIPPING Special Casing: $_\n"
10786                                                     if $verbosity >= $VERBOSE;
10787             }
10788             $_ = "";
10789             return;
10790         }
10791         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10792             $file->carp_bad_line('Extra fields');
10793             $_ = "";
10794             return;
10795         }
10796
10797         $_ = "$fields[0]; lc; $fields[1]";
10798         $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10799         $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
10800
10801         # Copy any simple case change to the special tables constructed if
10802         # being overridden by a multi-character case change.
10803         if ($fields[1] ne $fields[0]
10804             && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10805         {
10806             $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10807         }
10808         if ($fields[2] ne $fields[0]
10809             && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10810         {
10811             $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10812         }
10813         if ($fields[3] ne $fields[0]
10814             && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10815         {
10816             $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10817         }
10818
10819         return;
10820     }
10821 }
10822
10823 sub filter_old_style_case_folding {
10824     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
10825     # and later style.  Different letters were used in the earlier.
10826
10827     my $file = shift;
10828     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10829
10830     my @fields = split /\s*;\s*/;
10831     if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10832         $fields[1] = 'I';
10833     }
10834     elsif ($fields[1] eq 'L') {
10835         $fields[1] = 'C';             # L => C always
10836     }
10837     elsif ($fields[1] eq 'E') {
10838         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
10839             $fields[1] = 'F'
10840         }
10841         else {
10842             $fields[1] = 'C'
10843         }
10844     }
10845     else {
10846         $file->carp_bad_line("Expecting L or E in second field");
10847         $_ = "";
10848         return;
10849     }
10850     $_ = join("; ", @fields) . ';';
10851     return;
10852 }
10853
10854 { # Closure for case folding
10855
10856     # Create the map for simple only if are going to output it, for otherwise
10857     # it takes no part in anything we do.
10858     my $to_output_simple;
10859
10860     sub setup_case_folding($) {
10861         # Read in the case foldings in CaseFolding.txt.  This handles both
10862         # simple and full case folding.
10863
10864         $to_output_simple
10865                         = property_ref('Simple_Case_Folding')->to_output_map;
10866
10867         if (! $to_output_simple) {
10868             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
10869         }
10870
10871         # If we ever wanted to show that these tables were combined, a new
10872         # property method could be created, like set_combined_props()
10873         property_ref('Case_Folding')->add_comment(join_lines( <<END
10874 This file includes both the simple and full case folding maps.  The simple
10875 ones are in the main body of the table below, and the full ones adding to or
10876 overriding them are in the hash.
10877 END
10878         ));
10879         return;
10880     }
10881
10882     sub filter_case_folding_line {
10883         # Called for each line in CaseFolding.txt
10884         # Input lines look like:
10885         # 0041; C; 0061; # LATIN CAPITAL LETTER A
10886         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10887         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10888         #
10889         # 'C' means that folding is the same for both simple and full
10890         # 'F' that it is only for full folding
10891         # 'S' that it is only for simple folding
10892         # 'T' is locale-dependent, and ignored
10893         # 'I' is a type of 'F' used in some early releases.
10894         # Note the trailing semi-colon, unlike many of the input files.  That
10895         # means that there will be an extra null field generated by the split
10896         # below, which we ignore and hence is not an error.
10897
10898         my $file = shift;
10899         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10900
10901         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10902         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10903             $file->carp_bad_line('Extra fields');
10904             $_ = "";
10905             return;
10906         }
10907
10908         if ($type eq 'T') {   # Skip Turkic case folding, is locale dependent
10909             $_ = "";
10910             return;
10911         }
10912
10913         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10914         # I are all full foldings; S is single-char.  For S, there is always
10915         # an F entry, so we must allow multiple values for the same code
10916         # point.  Fortunately this table doesn't need further manipulation
10917         # which would preclude using multiple-values.  The S is now included
10918         # so that _swash_inversion_hash() is able to construct closures
10919         # without having to worry about F mappings.
10920         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10921             $_ = "$range; Case_Folding; "
10922                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
10923         }
10924         else {
10925             $_ = "";
10926             $file->carp_bad_line('Expecting C F I S or T in second field');
10927         }
10928
10929         # C and S are simple foldings, but simple case folding is not needed
10930         # unless we explicitly want its map table output.
10931         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10932             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10933         }
10934
10935         return;
10936     }
10937
10938 } # End case fold closure
10939
10940 sub filter_jamo_line {
10941     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
10942     # from this file that is used in generating the Name property for Jamo
10943     # code points.  But, it also is used to convert early versions' syntax
10944     # into the modern form.  Here are two examples:
10945     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
10946     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
10947     #
10948     # The input is $_, the output is $_ filtered.
10949
10950     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
10951
10952     # Let the caller handle unexpected input.  In earlier versions, there was
10953     # a third field which is supposed to be a comment, but did not have a '#'
10954     # before it.
10955     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10956
10957     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
10958                                 # beginning.
10959
10960     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
10961     $fields[1] = 'R' if $fields[0] eq '1105';
10962
10963     # Add to structure so can generate Names from it.
10964     my $cp = hex $fields[0];
10965     my $short_name = $fields[1];
10966     $Jamo{$cp} = $short_name;
10967     if ($cp <= $LBase + $LCount) {
10968         $Jamo_L{$short_name} = $cp - $LBase;
10969     }
10970     elsif ($cp <= $VBase + $VCount) {
10971         $Jamo_V{$short_name} = $cp - $VBase;
10972     }
10973     elsif ($cp <= $TBase + $TCount) {
10974         $Jamo_T{$short_name} = $cp - $TBase;
10975     }
10976     else {
10977         Carp::my_carp_bug("Unexpected Jamo code point in $_");
10978     }
10979
10980
10981     # Reassemble using just the first two fields to look like a typical
10982     # property file line
10983     $_ = "$fields[0]; $fields[1]";
10984
10985     return;
10986 }
10987
10988 sub register_fraction($) {
10989     # This registers the input rational number so that it can be passed on to
10990     # utf8_heavy.pl, both in rational and floating forms.
10991
10992     my $rational = shift;
10993
10994     my $float = eval $rational;
10995     $nv_floating_to_rational{$float} = $rational;
10996     return;
10997 }
10998
10999 sub filter_numeric_value_line {
11000     # DNumValues contains lines of a different syntax than the typical
11001     # property file:
11002     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
11003     #
11004     # This routine transforms $_ containing the anomalous syntax to the
11005     # typical, by filtering out the extra columns, and convert early version
11006     # decimal numbers to strings that look like rational numbers.
11007
11008     my $file = shift;
11009     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11010
11011     # Starting in 5.1, there is a rational field.  Just use that, omitting the
11012     # extra columns.  Otherwise convert the decimal number in the second field
11013     # to a rational, and omit extraneous columns.
11014     my @fields = split /\s*;\s*/, $_, -1;
11015     my $rational;
11016
11017     if ($v_version ge v5.1.0) {
11018         if (@fields != 4) {
11019             $file->carp_bad_line('Not 4 semi-colon separated fields');
11020             $_ = "";
11021             return;
11022         }
11023         $rational = $fields[3];
11024         $_ = join '; ', @fields[ 0, 3 ];
11025     }
11026     else {
11027
11028         # Here, is an older Unicode file, which has decimal numbers instead of
11029         # rationals in it.  Use the fraction to calculate the denominator and
11030         # convert to rational.
11031
11032         if (@fields != 2 && @fields != 3) {
11033             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11034             $_ = "";
11035             return;
11036         }
11037
11038         my $codepoints = $fields[0];
11039         my $decimal = $fields[1];
11040         if ($decimal =~ s/\.0+$//) {
11041
11042             # Anything ending with a decimal followed by nothing but 0's is an
11043             # integer
11044             $_ = "$codepoints; $decimal";
11045             $rational = $decimal;
11046         }
11047         else {
11048
11049             my $denominator;
11050             if ($decimal =~ /\.50*$/) {
11051                 $denominator = 2;
11052             }
11053
11054             # Here have the hardcoded repeating decimals in the fraction, and
11055             # the denominator they imply.  There were only a few denominators
11056             # in the older Unicode versions of this file which this code
11057             # handles, so it is easy to convert them.
11058
11059             # The 4 is because of a round-off error in the Unicode 3.2 files
11060             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11061                 $denominator = 3;
11062             }
11063             elsif ($decimal =~ /\.[27]50*$/) {
11064                 $denominator = 4;
11065             }
11066             elsif ($decimal =~ /\.[2468]0*$/) {
11067                 $denominator = 5;
11068             }
11069             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11070                 $denominator = 6;
11071             }
11072             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11073                 $denominator = 8;
11074             }
11075             if ($denominator) {
11076                 my $sign = ($decimal < 0) ? "-" : "";
11077                 my $numerator = int((abs($decimal) * $denominator) + .5);
11078                 $rational = "$sign$numerator/$denominator";
11079                 $_ = "$codepoints; $rational";
11080             }
11081             else {
11082                 $file->carp_bad_line("Can't cope with number '$decimal'.");
11083                 $_ = "";
11084                 return;
11085             }
11086         }
11087     }
11088
11089     register_fraction($rational) if $rational =~ qr{/};
11090     return;
11091 }
11092
11093 { # Closure
11094     my %unihan_properties;
11095
11096     sub setup_unihan {
11097         # Do any special setup for Unihan properties.
11098
11099         # This property gives the wrong computed type, so override.
11100         my $usource = property_ref('kIRG_USource');
11101         $usource->set_type($STRING) if defined $usource;
11102
11103         # This property is to be considered binary (it says so in
11104         # http://www.unicode.org/reports/tr38/)
11105         my $iicore = property_ref('kIICore');
11106         if (defined $iicore) {
11107             $iicore->set_type($FORCED_BINARY);
11108             $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11109
11110             # Unicode doesn't include the maps for this property, so don't
11111             # warn that they are missing.
11112             $iicore->set_pre_declared_maps(0);
11113             $iicore->add_comment(join_lines( <<END
11114 This property contains enum values, but Unicode UAX #38 says it should be
11115 interpreted as binary, so Perl creates tables for both 1) its enum values,
11116 plus 2) true/false tables in which it is considered true for all code points
11117 that have a non-null value
11118 END
11119             ));
11120         }
11121
11122         return;
11123     }
11124
11125     sub filter_unihan_line {
11126         # Change unihan db lines to look like the others in the db.  Here is
11127         # an input sample:
11128         #   U+341C        kCangjie        IEKN
11129
11130         # Tabs are used instead of semi-colons to separate fields; therefore
11131         # they may have semi-colons embedded in them.  Change these to periods
11132         # so won't screw up the rest of the code.
11133         s/;/./g;
11134
11135         # Remove lines that don't look like ones we accept.
11136         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11137             $_ = "";
11138             return;
11139         }
11140
11141         # Extract the property, and save a reference to its object.
11142         my $property = $1;
11143         if (! exists $unihan_properties{$property}) {
11144             $unihan_properties{$property} = property_ref($property);
11145         }
11146
11147         # Don't do anything unless the property is one we're handling, which
11148         # we determine by seeing if there is an object defined for it or not
11149         if (! defined $unihan_properties{$property}) {
11150             $_ = "";
11151             return;
11152         }
11153
11154         # Convert the tab separators to our standard semi-colons, and convert
11155         # the U+HHHH notation to the rest of the standard's HHHH
11156         s/\t/;/g;
11157         s/\b U \+ (?= $code_point_re )//xg;
11158
11159         #local $to_trace = 1 if main::DEBUG;
11160         trace $_ if main::DEBUG && $to_trace;
11161
11162         return;
11163     }
11164 }
11165
11166 sub filter_blocks_lines {
11167     # In the Blocks.txt file, the names of the blocks don't quite match the
11168     # names given in PropertyValueAliases.txt, so this changes them so they
11169     # do match:  Blanks and hyphens are changed into underscores.  Also makes
11170     # early release versions look like later ones
11171     #
11172     # $_ is transformed to the correct value.
11173
11174     my $file = shift;
11175         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11176
11177     if ($v_version lt v3.2.0) {
11178         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11179             $_ = "";
11180             return;
11181         }
11182
11183         # Old versions used a different syntax to mark the range.
11184         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11185     }
11186
11187     my @fields = split /\s*;\s*/, $_, -1;
11188     if (@fields != 2) {
11189         $file->carp_bad_line("Expecting exactly two fields");
11190         $_ = "";
11191         return;
11192     }
11193
11194     # Change hyphens and blanks in the block name field only
11195     $fields[1] =~ s/[ -]/_/g;
11196     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g;   # Capitalize first letter of word
11197
11198     $_ = join("; ", @fields);
11199     return;
11200 }
11201
11202 { # Closure
11203     my $current_property;
11204
11205     sub filter_old_style_proplist {
11206         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
11207         # was in a completely different syntax.  Ken Whistler of Unicode says
11208         # that it was something he used as an aid for his own purposes, but
11209         # was never an official part of the standard.  However, comments in
11210         # DAge.txt indicate that non-character code points were available in
11211         # the UCD as of 3.1.  It is unclear to me (khw) how they could be
11212         # there except through this file (but on the other hand, they first
11213         # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11214         # not.  But the claim is that it was published as an aid to others who
11215         # might want some more information than was given in the official UCD
11216         # of the time.  Many of the properties in it were incorporated into
11217         # the later PropList.txt, but some were not.  This program uses this
11218         # early file to generate property tables that are otherwise not
11219         # accessible in the early UCD's, and most were probably not really
11220         # official at that time, so one could argue that it should be ignored,
11221         # and you can easily modify things to skip this.  And there are bugs
11222         # in this file in various versions.  (For example, the 2.1.9 version
11223         # removes from Alphabetic the CJK range starting at 4E00, and they
11224         # weren't added back in until 3.1.0.)  Many of this file's properties
11225         # were later sanctioned, so this code generates tables for those
11226         # properties that aren't otherwise in the UCD of the time but
11227         # eventually did become official, and throws away the rest.  Here is a
11228         # list of all the ones that are thrown away:
11229         #   Bidi=*                       duplicates UnicodeData.txt
11230         #   Combining                    never made into official property;
11231         #                                is \P{ccc=0}
11232         #   Composite                    never made into official property.
11233         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
11234         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
11235         #   Delimiter                    never made into official property;
11236         #                                removed in 3.0.1
11237         #   Format Control               never made into official property;
11238         #                                similar to gc=cf
11239         #   High Surrogate               duplicates Blocks.txt
11240         #   Ignorable Control            never made into official property;
11241         #                                similar to di=y
11242         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
11243         #   Left of Pair                 never made into official property;
11244         #   Line Separator               duplicates UnicodeData.txt: gc=zl
11245         #   Low Surrogate                duplicates Blocks.txt
11246         #   Non-break                    was actually listed as a property
11247         #                                in 3.2, but without any code
11248         #                                points.  Unicode denies that this
11249         #                                was ever an official property
11250         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
11251         #   Numeric                      duplicates UnicodeData.txt: gc=cc
11252         #   Paired Punctuation           never made into official property;
11253         #                                appears to be gc=ps + gc=pe
11254         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
11255         #   Private Use                  duplicates UnicodeData.txt: gc=co
11256         #   Private Use High Surrogate   duplicates Blocks.txt
11257         #   Punctuation                  duplicates UnicodeData.txt: gc=p
11258         #   Space                        different definition than eventual
11259         #                                one.
11260         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
11261         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
11262         #   Zero-width                   never made into official property;
11263         #                                subset of gc=cf
11264         # Most of the properties have the same names in this file as in later
11265         # versions, but a couple do not.
11266         #
11267         # This subroutine filters $_, converting it from the old style into
11268         # the new style.  Here's a sample of the old-style
11269         #
11270         #   *******************************************
11271         #
11272         #   Property dump for: 0x100000A0 (Join Control)
11273         #
11274         #   200C..200D  (2 chars)
11275         #
11276         # In the example, the property is "Join Control".  It is kept in this
11277         # closure between calls to the subroutine.  The numbers beginning with
11278         # 0x were internal to Ken's program that generated this file.
11279
11280         # If this line contains the property name, extract it.
11281         if (/^Property dump for: [^(]*\((.*)\)/) {
11282             $_ = $1;
11283
11284             # Convert white space to underscores.
11285             s/ /_/g;
11286
11287             # Convert the few properties that don't have the same name as
11288             # their modern counterparts
11289             s/Identifier_Part/ID_Continue/
11290             or s/Not_a_Character/NChar/;
11291
11292             # If the name matches an existing property, use it.
11293             if (defined property_ref($_)) {
11294                 trace "new property=", $_ if main::DEBUG && $to_trace;
11295                 $current_property = $_;
11296             }
11297             else {        # Otherwise discard it
11298                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11299                 undef $current_property;
11300             }
11301             $_ = "";    # The property is saved for the next lines of the
11302                         # file, but this defining line is of no further use,
11303                         # so clear it so that the caller won't process it
11304                         # further.
11305         }
11306         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11307
11308             # Here, the input line isn't a header defining a property for the
11309             # following section, and either we aren't in such a section, or
11310             # the line doesn't look like one that defines the code points in
11311             # such a section.  Ignore this line.
11312             $_ = "";
11313         }
11314         else {
11315
11316             # Here, we have a line defining the code points for the current
11317             # stashed property.  Anything starting with the first blank is
11318             # extraneous.  Otherwise, it should look like a normal range to
11319             # the caller.  Append the property name so that it looks just like
11320             # a modern PropList entry.
11321
11322             $_ =~ s/\s.*//;
11323             $_ .= "; $current_property";
11324         }
11325         trace $_ if main::DEBUG && $to_trace;
11326         return;
11327     }
11328 } # End closure for old style proplist
11329
11330 sub filter_old_style_normalization_lines {
11331     # For early releases of Unicode, the lines were like:
11332     #        74..2A76    ; NFKD_NO
11333     # For later releases this became:
11334     #        74..2A76    ; NFKD_QC; N
11335     # Filter $_ to look like those in later releases.
11336     # Similarly for MAYBEs
11337
11338     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11339
11340     # Also, the property FC_NFKC was abbreviated to FNC
11341     s/FNC/FC_NFKC/;
11342     return;
11343 }
11344
11345 sub setup_script_extensions {
11346     # The Script_Extensions property starts out with a clone of the Script
11347     # property.
11348
11349     my $scx = property_ref("Script_Extensions");
11350     $scx = Property->new("scx", Full_Name => "Script_Extensions")
11351             if ! defined $scx;
11352     $scx->_set_format($STRING_WHITE_SPACE_LIST);
11353     $scx->initialize($script);
11354     $scx->set_default_map($script->default_map);
11355     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
11356     $scx->add_comment(join_lines( <<END
11357 The values for code points that appear in one script are just the same as for
11358 the 'Script' property.  Likewise the values for those that appear in many
11359 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
11360 values of code points that appear in a few scripts are a space separated list
11361 of those scripts.
11362 END
11363     ));
11364
11365     # Initialize scx's tables and the aliases for them to be the same as sc's
11366     foreach my $table ($script->tables) {
11367         my $scx_table = $scx->add_match_table($table->name,
11368                                 Full_Name => $table->full_name);
11369         foreach my $alias ($table->aliases) {
11370             $scx_table->add_alias($alias->name);
11371         }
11372     }
11373 }
11374
11375 sub  filter_script_extensions_line {
11376     # The Scripts file comes with the full name for the scripts; the
11377     # ScriptExtensions, with the short name.  The final mapping file is a
11378     # combination of these, and without adjustment, would have inconsistent
11379     # entries.  This filters the latter file to convert to full names.
11380     # Entries look like this:
11381     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11382
11383     my @fields = split /\s*;\s*/;
11384     my @full_names;
11385     foreach my $short_name (split " ", $fields[1]) {
11386         push @full_names, $script->table($short_name)->full_name;
11387     }
11388     $fields[1] = join " ", @full_names;
11389     $_ = join "; ", @fields;
11390
11391     return;
11392 }
11393
11394 sub setup_v6_name_alias {
11395         property_ref('Name_Alias')->add_map(7, 7, "ALERT");
11396 }
11397
11398 sub finish_Unicode() {
11399     # This routine should be called after all the Unicode files have been read
11400     # in.  It:
11401     # 1) Adds the mappings for code points missing from the files which have
11402     #    defaults specified for them.
11403     # 2) At this this point all mappings are known, so it computes the type of
11404     #    each property whose type hasn't been determined yet.
11405     # 3) Calculates all the regular expression match tables based on the
11406     #    mappings.
11407     # 3) Calculates and adds the tables which are defined by Unicode, but
11408     #    which aren't derived by them
11409
11410     # For each property, fill in any missing mappings, and calculate the re
11411     # match tables.  If a property has more than one missing mapping, the
11412     # default is a reference to a data structure, and requires data from other
11413     # properties to resolve.  The sort is used to cause these to be processed
11414     # last, after all the other properties have been calculated.
11415     # (Fortunately, the missing properties so far don't depend on each other.)
11416     foreach my $property
11417         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11418         property_ref('*'))
11419     {
11420         # $perl has been defined, but isn't one of the Unicode properties that
11421         # need to be finished up.
11422         next if $property == $perl;
11423
11424         # Nor do we need to do anything with properties that aren't going to
11425         # be output.
11426         next if $property->fate == $SUPPRESSED;
11427
11428         # Handle the properties that have more than one possible default
11429         if (ref $property->default_map) {
11430             my $default_map = $property->default_map;
11431
11432             # These properties have stored in the default_map:
11433             # One or more of:
11434             #   1)  A default map which applies to all code points in a
11435             #       certain class
11436             #   2)  an expression which will evaluate to the list of code
11437             #       points in that class
11438             # And
11439             #   3) the default map which applies to every other missing code
11440             #      point.
11441             #
11442             # Go through each list.
11443             while (my ($default, $eval) = $default_map->get_next_defaults) {
11444
11445                 # Get the class list, and intersect it with all the so-far
11446                 # unspecified code points yielding all the code points
11447                 # in the class that haven't been specified.
11448                 my $list = eval $eval;
11449                 if ($@) {
11450                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11451                     last;
11452                 }
11453
11454                 # Narrow down the list to just those code points we don't have
11455                 # maps for yet.
11456                 $list = $list & $property->inverse_list;
11457
11458                 # Add mappings to the property for each code point in the list
11459                 foreach my $range ($list->ranges) {
11460                     $property->add_map($range->start, $range->end, $default,
11461                     Replace => $CROAK);
11462                 }
11463             }
11464
11465             # All remaining code points have the other mapping.  Set that up
11466             # so the normal single-default mapping code will work on them
11467             $property->set_default_map($default_map->other_default);
11468
11469             # And fall through to do that
11470         }
11471
11472         # We should have enough data now to compute the type of the property.
11473         $property->compute_type;
11474         my $property_type = $property->type;
11475
11476         next if ! $property->to_create_match_tables;
11477
11478         # Here want to create match tables for this property
11479
11480         # The Unicode db always (so far, and they claim into the future) have
11481         # the default for missing entries in binary properties be 'N' (unless
11482         # there is a '@missing' line that specifies otherwise)
11483         if ($property_type == $BINARY && ! defined $property->default_map) {
11484             $property->set_default_map('N');
11485         }
11486
11487         # Add any remaining code points to the mapping, using the default for
11488         # missing code points.
11489         my $default_table;
11490         if (defined (my $default_map = $property->default_map)) {
11491
11492             # Make sure there is a match table for the default
11493             if (! defined ($default_table = $property->table($default_map))) {
11494                 $default_table = $property->add_match_table($default_map);
11495             }
11496
11497             # And, if the property is binary, the default table will just
11498             # be the complement of the other table.
11499             if ($property_type == $BINARY) {
11500                 my $non_default_table;
11501
11502                 # Find the non-default table.
11503                 for my $table ($property->tables) {
11504                     next if $table == $default_table;
11505                     $non_default_table = $table;
11506                 }
11507                 $default_table->set_complement($non_default_table);
11508             }
11509             else {
11510
11511                 # This fills in any missing values with the default.  It's not
11512                 # necessary to do this with binary properties, as the default
11513                 # is defined completely in terms of the Y table.
11514                 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
11515                                    $default_map, Replace => $NO);
11516             }
11517         }
11518
11519         # Have all we need to populate the match tables.
11520         my $property_name = $property->name;
11521         my $maps_should_be_defined = $property->pre_declared_maps;
11522         foreach my $range ($property->ranges) {
11523             my $map = $range->value;
11524             my $table = $property->table($map);
11525             if (! defined $table) {
11526
11527                 # Integral and rational property values are not necessarily
11528                 # defined in PropValueAliases, but whether all the other ones
11529                 # should be depends on the property.
11530                 if ($maps_should_be_defined
11531                     && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11532                 {
11533                     Carp::my_carp("Table '$property_name=$map' should have been defined.  Defining it now.")
11534                 }
11535                 $table = $property->add_match_table($map);
11536             }
11537
11538             next if $table->complement != 0;    # Don't need to populate these
11539             $table->add_range($range->start, $range->end);
11540         }
11541
11542         # A forced binary property has additional true/false tables which
11543         # should have been set up when it was forced into binary.  The false
11544         # table matches exactly the same set as the property's default table.
11545         # The true table matches the complement of that.  The false table is
11546         # not the same as an additional set of aliases on top of the default
11547         # table, so use 'set_equivalent_to'.  If it were implemented as
11548         # additional aliases, various things would have to be adjusted, but
11549         # especially, if the user wants to get a list of names for the table
11550         # using Unicode::UCD::prop_value_aliases(), s/he should get a
11551         # different set depending on whether they want the default table or
11552         # the false table.
11553         if ($property_type == $FORCED_BINARY) {
11554             $property->table('N')->set_equivalent_to($default_table,
11555                                                      Related => 1);
11556             $property->table('Y')->set_complement($default_table);
11557         }
11558
11559         # For Perl 5.6 compatibility, all properties matchable in regexes can
11560         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
11561         # But warn if this creates a conflict with a (new) Unicode property
11562         # name, although it appears that Unicode has made a decision never to
11563         # begin a property name with 'Is_', so this shouldn't happen.
11564         foreach my $alias ($property->aliases) {
11565             my $Is_name = 'Is_' . $alias->name;
11566             if (defined (my $pre_existing = property_ref($Is_name))) {
11567                 Carp::my_carp(<<END
11568 There is already an alias named $Is_name (from " . $pre_existing . "), so
11569 creating one for $property won't work.  This is bad news.  If it is not too
11570 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
11571 from the git blame log for this area of the code that suppressed individual
11572 aliases that conflict with the new Unicode names.  Proceeding anyway.
11573 END
11574                 );
11575             }
11576         } # End of loop through aliases for this property
11577     } # End of loop through all Unicode properties.
11578
11579     # Fill in the mappings that Unicode doesn't completely furnish.  First the
11580     # single letter major general categories.  If Unicode were to start
11581     # delivering the values, this would be redundant, but better that than to
11582     # try to figure out if should skip and not get it right.  Ths could happen
11583     # if a new major category were to be introduced, and the hard-coded test
11584     # wouldn't know about it.
11585     # This routine depends on the standard names for the general categories
11586     # being what it thinks they are, like 'Cn'.  The major categories are the
11587     # union of all the general category tables which have the same first
11588     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11589     foreach my $minor_table ($gc->tables) {
11590         my $minor_name = $minor_table->name;
11591         next if length $minor_name == 1;
11592         if (length $minor_name != 2) {
11593             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
11594             next;
11595         }
11596
11597         my $major_name = uc(substr($minor_name, 0, 1));
11598         my $major_table = $gc->table($major_name);
11599         $major_table += $minor_table;
11600     }
11601
11602     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
11603     # defines it as LC)
11604     my $LC = $gc->table('LC');
11605     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
11606     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
11607
11608
11609     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11610                          # deliver the correct values in it
11611         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11612
11613         # Lt not in release 1.
11614         if (defined $gc->table('Lt')) {
11615             $LC += $gc->table('Lt');
11616             $gc->table('Lt')->set_caseless_equivalent($LC);
11617         }
11618     }
11619     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11620
11621     $gc->table('Ll')->set_caseless_equivalent($LC);
11622     $gc->table('Lu')->set_caseless_equivalent($LC);
11623
11624     my $Cs = $gc->table('Cs');
11625
11626
11627     # Folding information was introduced later into Unicode data.  To get
11628     # Perl's case ignore (/i) to work at all in releases that don't have
11629     # folding, use the best available alternative, which is lower casing.
11630     my $fold = property_ref('Simple_Case_Folding');
11631     if ($fold->is_empty) {
11632         $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11633         $fold->add_note(join_lines(<<END
11634 WARNING: This table uses lower case as a substitute for missing fold
11635 information
11636 END
11637         ));
11638     }
11639
11640     # Multiple-character mapping was introduced later into Unicode data.  If
11641     # missing, use the single-characters maps as best available alternative
11642     foreach my $map (qw {   Uppercase_Mapping
11643                             Lowercase_Mapping
11644                             Titlecase_Mapping
11645                             Case_Folding
11646                         } ) {
11647         my $full = property_ref($map);
11648         if ($full->is_empty) {
11649             my $simple = property_ref('Simple_' . $map);
11650             $full->initialize($simple);
11651             $full->add_comment($simple->comment) if ($simple->comment);
11652             $full->add_note(join_lines(<<END
11653 WARNING: This table uses simple mapping (single-character only) as a
11654 substitute for missing multiple-character information
11655 END
11656             ));
11657         }
11658     }
11659
11660     # The Script_Extensions property started out as a clone of the Script
11661     # property.  But processing its data file caused some elements to be
11662     # replaced with different data.  (These elements were for the Common and
11663     # Inherited properties.)  This data is a qw() list of all the scripts that
11664     # the code points in the given range are in.  An example line is:
11665     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
11666     #
11667     # The code above has created a new match table named "Arab Syrc Thaa"
11668     # which contains 060C.  (The cloned table started out with this code point
11669     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
11670     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
11671     # match table.  This is repeated for all these tables and ranges.  The map
11672     # data is retained in the map table for reference, but the spurious match
11673     # tables are deleted.
11674
11675     my $scx = property_ref("Script_Extensions");
11676     if (defined $scx) {
11677         foreach my $table ($scx->tables) {
11678             next unless $table->name =~ /\s/;   # All the new and only the new
11679                                                 # tables have a space in their
11680                                                 # names
11681             my @scripts = split /\s+/, $table->name;
11682             foreach my $script (@scripts) {
11683                 my $script_table = $scx->table($script);
11684                 $script_table += $table;
11685             }
11686             $scx->delete_match_table($table);
11687         }
11688     }
11689
11690     return;
11691 }
11692
11693 sub compile_perl() {
11694     # Create perl-defined tables.  Almost all are part of the pseudo-property
11695     # named 'perl' internally to this program.  Many of these are recommended
11696     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11697     # on those found there.
11698     # Almost all of these are equivalent to some Unicode property.
11699     # A number of these properties have equivalents restricted to the ASCII
11700     # range, with their names prefaced by 'Posix', to signify that these match
11701     # what the Posix standard says they should match.  A couple are
11702     # effectively this, but the name doesn't have 'Posix' in it because there
11703     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
11704     # to the full Unicode range, by our guesses as to what is appropriate.
11705
11706     # 'Any' is all code points.  As an error check, instead of just setting it
11707     # to be that, construct it to be the union of all the major categories
11708     $Any = $perl->add_match_table('Any',
11709             Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
11710             Matches_All => 1);
11711
11712     foreach my $major_table ($gc->tables) {
11713
11714         # Major categories are the ones with single letter names.
11715         next if length($major_table->name) != 1;
11716
11717         $Any += $major_table;
11718     }
11719
11720     if ($Any->max != $MAX_UNICODE_CODEPOINT) {
11721         Carp::my_carp_bug("Generated highest code point ("
11722            . sprintf("%X", $Any->max)
11723            . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
11724     }
11725     if ($Any->range_count != 1 || $Any->min != 0) {
11726      Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11727     }
11728
11729     $Any->add_alias('All');
11730
11731     # Assigned is the opposite of gc=unassigned
11732     my $Assigned = $perl->add_match_table('Assigned',
11733                                 Description  => "All assigned code points",
11734                                 Initialize => ~ $gc->table('Unassigned'),
11735                                 );
11736
11737     # Our internal-only property should be treated as more than just a
11738     # synonym; grandfather it in to the pod.
11739     $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
11740                             Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
11741             ->set_equivalent_to(property_ref('ccc')->table('Above'),
11742                                                                 Related => 1);
11743
11744     my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11745     if (defined $block) {   # This is equivalent to the block if have it.
11746         my $Unicode_ASCII = $block->table('Basic_Latin');
11747         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11748             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11749         }
11750     }
11751
11752     # Very early releases didn't have blocks, so initialize ASCII ourselves if
11753     # necessary
11754     if ($ASCII->is_empty) {
11755         $ASCII->initialize([ 0..127 ]);
11756     }
11757
11758     # Get the best available case definitions.  Early Unicode versions didn't
11759     # have Uppercase and Lowercase defined, so use the general category
11760     # instead for them.
11761     my $Lower = $perl->add_match_table('Lower');
11762     my $Unicode_Lower = property_ref('Lowercase');
11763     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11764         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
11765         $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11766         $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11767         $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11768
11769     }
11770     else {
11771         $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11772                                                                 Related => 1);
11773     }
11774     $Lower->add_alias('XPosixLower');
11775     my $Posix_Lower = $perl->add_match_table("PosixLower",
11776                             Description => "[a-z]",
11777                             Initialize => $Lower & $ASCII,
11778                             );
11779
11780     my $Upper = $perl->add_match_table('Upper');
11781     my $Unicode_Upper = property_ref('Uppercase');
11782     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11783         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
11784         $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11785         $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11786         $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11787     }
11788     else {
11789         $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11790                                                                 Related => 1);
11791     }
11792     $Upper->add_alias('XPosixUpper');
11793     my $Posix_Upper = $perl->add_match_table("PosixUpper",
11794                             Description => "[A-Z]",
11795                             Initialize => $Upper & $ASCII,
11796                             );
11797
11798     # Earliest releases didn't have title case.  Initialize it to empty if not
11799     # otherwise present
11800     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11801                                        Description => '(= \p{Gc=Lt})');
11802     my $lt = $gc->table('Lt');
11803
11804     # Earlier versions of mktables had this related to $lt since they have
11805     # identical code points, but their caseless equivalents are not the same,
11806     # one being 'Cased' and the other being 'LC', and so now must be kept as
11807     # separate entities.
11808     $Title += $lt if defined $lt;
11809
11810     # If this Unicode version doesn't have Cased, set up our own.  From
11811     # Unicode 5.1: Definition D120: A character C is defined to be cased if
11812     # and only if C has the Lowercase or Uppercase property or has a
11813     # General_Category value of Titlecase_Letter.
11814     my $Unicode_Cased = property_ref('Cased');
11815     unless (defined $Unicode_Cased) {
11816         my $cased = $perl->add_match_table('Cased',
11817                         Initialize => $Lower + $Upper + $Title,
11818                         Description => 'Uppercase or Lowercase or Titlecase',
11819                         );
11820         $Unicode_Cased = $cased;
11821     }
11822     $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
11823
11824     # Similarly, set up our own Case_Ignorable property if this Unicode
11825     # version doesn't have it.  From Unicode 5.1: Definition D121: A character
11826     # C is defined to be case-ignorable if C has the value MidLetter or the
11827     # value MidNumLet for the Word_Break property or its General_Category is
11828     # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11829     # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11830
11831     # Perl has long had an internal-only alias for this property; grandfather
11832     # it in to the pod, but discourage its use.
11833     my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
11834                                                      Re_Pod_Entry => 1,
11835                                                      Fate => $INTERNAL_ONLY,
11836                                                      Status => $DISCOURAGED);
11837     my $case_ignorable = property_ref('Case_Ignorable');
11838     if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11839         $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11840                                                                 Related => 1);
11841     }
11842     else {
11843
11844         $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11845
11846         # The following three properties are not in early releases
11847         $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11848         $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11849         $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11850
11851         # For versions 4.1 - 5.0, there is no MidNumLet property, and
11852         # correspondingly the case-ignorable definition lacks that one.  For
11853         # 4.0, it appears that it was meant to be the same definition, but was
11854         # inadvertently omitted from the standard's text, so add it if the
11855         # property actually is there
11856         my $wb = property_ref('Word_Break');
11857         if (defined $wb) {
11858             my $midlet = $wb->table('MidLetter');
11859             $perl_case_ignorable += $midlet if defined $midlet;
11860             my $midnumlet = $wb->table('MidNumLet');
11861             $perl_case_ignorable += $midnumlet if defined $midnumlet;
11862         }
11863         else {
11864
11865             # In earlier versions of the standard, instead of the above two
11866             # properties , just the following characters were used:
11867             $perl_case_ignorable +=  0x0027  # APOSTROPHE
11868                                 +   0x00AD  # SOFT HYPHEN (SHY)
11869                                 +   0x2019; # RIGHT SINGLE QUOTATION MARK
11870         }
11871     }
11872
11873     # The remaining perl defined tables are mostly based on Unicode TR 18,
11874     # "Annex C: Compatibility Properties".  All of these have two versions,
11875     # one whose name generally begins with Posix that is posix-compliant, and
11876     # one that matches Unicode characters beyond the Posix, ASCII range
11877
11878     my $Alpha = $perl->add_match_table('Alpha');
11879
11880     # Alphabetic was not present in early releases
11881     my $Alphabetic = property_ref('Alphabetic');
11882     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11883         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11884     }
11885     else {
11886
11887         # For early releases, we don't get it exactly right.  The below
11888         # includes more than it should, which in 5.2 terms is: L + Nl +
11889         # Other_Alphabetic.  Other_Alphabetic contains many characters from
11890         # Mn and Mc.  It's better to match more than we should, than less than
11891         # we should.
11892         $Alpha->initialize($gc->table('Letter')
11893                             + $gc->table('Mn')
11894                             + $gc->table('Mc'));
11895         $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
11896         $Alpha->add_description('Alphabetic');
11897     }
11898     $Alpha->add_alias('XPosixAlpha');
11899     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
11900                             Description => "[A-Za-z]",
11901                             Initialize => $Alpha & $ASCII,
11902                             );
11903     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11904     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
11905
11906     my $Alnum = $perl->add_match_table('Alnum',
11907                         Description => 'Alphabetic and (decimal) Numeric',
11908                         Initialize => $Alpha + $gc->table('Decimal_Number'),
11909                         );
11910     $Alnum->add_alias('XPosixAlnum');
11911     $perl->add_match_table("PosixAlnum",
11912                             Description => "[A-Za-z0-9]",
11913                             Initialize => $Alnum & $ASCII,
11914                             );
11915
11916     my $Word = $perl->add_match_table('Word',
11917                                 Description => '\w, including beyond ASCII;'
11918                                             . ' = \p{Alnum} + \pM + \p{Pc}',
11919                                 Initialize => $Alnum + $gc->table('Mark'),
11920                                 );
11921     $Word->add_alias('XPosixWord');
11922     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11923     $Word += $Pc if defined $Pc;
11924
11925     # This is a Perl extension, so the name doesn't begin with Posix.
11926     my $PerlWord = $perl->add_match_table('PerlWord',
11927                     Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11928                     Initialize => $Word & $ASCII,
11929                     );
11930     $PerlWord->add_alias('PosixWord');
11931
11932     my $Blank = $perl->add_match_table('Blank',
11933                                 Description => '\h, Horizontal white space',
11934
11935                                 # 200B is Zero Width Space which is for line
11936                                 # break control, and was listed as
11937                                 # Space_Separator in early releases
11938                                 Initialize => $gc->table('Space_Separator')
11939                                             +   0x0009  # TAB
11940                                             -   0x200B, # ZWSP
11941                                 );
11942     $Blank->add_alias('HorizSpace');        # Another name for it.
11943     $Blank->add_alias('XPosixBlank');
11944     $perl->add_match_table("PosixBlank",
11945                             Description => "\\t and ' '",
11946                             Initialize => $Blank & $ASCII,
11947                             );
11948
11949     my $VertSpace = $perl->add_match_table('VertSpace',
11950                             Description => '\v',
11951                             Initialize => $gc->table('Line_Separator')
11952                                         + $gc->table('Paragraph_Separator')
11953                                         + 0x000A  # LINE FEED
11954                                         + 0x000B  # VERTICAL TAB
11955                                         + 0x000C  # FORM FEED
11956                                         + 0x000D  # CARRIAGE RETURN
11957                                         + 0x0085, # NEL
11958                             );
11959     # No Posix equivalent for vertical space
11960
11961     my $Space = $perl->add_match_table('Space',
11962                 Description => '\s including beyond ASCII plus vertical tab',
11963                 Initialize => $Blank + $VertSpace,
11964     );
11965     $Space->add_alias('XPosixSpace');
11966     $perl->add_match_table("PosixSpace",
11967                             Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
11968                             Initialize => $Space & $ASCII,
11969                             );
11970
11971     # Perl's traditional space doesn't include Vertical Tab
11972     my $XPerlSpace = $perl->add_match_table('XPerlSpace',
11973                                   Description => '\s, including beyond ASCII',
11974                                   Initialize => $Space - 0x000B,
11975                                 );
11976     $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
11977     my $PerlSpace = $perl->add_match_table('PerlSpace',
11978                         Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11979                         Initialize => $XPerlSpace & $ASCII,
11980                             );
11981
11982
11983     my $Cntrl = $perl->add_match_table('Cntrl',
11984                                         Description => 'Control characters');
11985     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
11986     $Cntrl->add_alias('XPosixCntrl');
11987     $perl->add_match_table("PosixCntrl",
11988                             Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
11989                             Initialize => $Cntrl & $ASCII,
11990                             );
11991
11992     # $controls is a temporary used to construct Graph.
11993     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11994                                                 + $gc->table('Control'));
11995     # Cs not in release 1
11996     $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11997
11998     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
11999     my $Graph = $perl->add_match_table('Graph',
12000                         Description => 'Characters that are graphical',
12001                         Initialize => ~ ($Space + $controls),
12002                         );
12003     $Graph->add_alias('XPosixGraph');
12004     $perl->add_match_table("PosixGraph",
12005                             Description =>
12006                                 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
12007                             Initialize => $Graph & $ASCII,
12008                             );
12009
12010     $print = $perl->add_match_table('Print',
12011                         Description => 'Characters that are graphical plus space characters (but no controls)',
12012                         Initialize => $Blank + $Graph - $gc->table('Control'),
12013                         );
12014     $print->add_alias('XPosixPrint');
12015     $perl->add_match_table("PosixPrint",
12016                             Description =>
12017                               '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12018                             Initialize => $print & $ASCII,
12019                             );
12020
12021     my $Punct = $perl->add_match_table('Punct');
12022     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
12023
12024     # \p{punct} doesn't include the symbols, which posix does
12025     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
12026                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
12027                     Initialize => $gc->table('Punctuation')
12028                                 + ($ASCII & $gc->table('Symbol')),
12029                                 Perl_Extension => 1
12030         );
12031     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
12032         Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
12033         Initialize => $ASCII & $XPosixPunct,
12034         );
12035
12036     my $Digit = $perl->add_match_table('Digit',
12037                             Description => '[0-9] + all other decimal digits');
12038     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
12039     $Digit->add_alias('XPosixDigit');
12040     my $PosixDigit = $perl->add_match_table("PosixDigit",
12041                                             Description => '[0-9]',
12042                                             Initialize => $Digit & $ASCII,
12043                                             );
12044
12045     # Hex_Digit was not present in first release
12046     my $Xdigit = $perl->add_match_table('XDigit');
12047     $Xdigit->add_alias('XPosixXDigit');
12048     my $Hex = property_ref('Hex_Digit');
12049     if (defined $Hex && ! $Hex->is_empty) {
12050         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
12051     }
12052     else {
12053         # (Have to use hex instead of e.g. '0', because could be running on an
12054         # non-ASCII machine, and we want the Unicode (ASCII) values)
12055         $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
12056                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
12057         $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
12058     }
12059
12060     # AHex was not present in early releases
12061     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
12062     my $AHex = property_ref('ASCII_Hex_Digit');
12063     if (defined $AHex && ! $AHex->is_empty) {
12064         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
12065     }
12066     else {
12067         $PosixXDigit->initialize($Xdigit & $ASCII);
12068     }
12069     $PosixXDigit->add_description('[0-9A-Fa-f]');
12070
12071     my $dt = property_ref('Decomposition_Type');
12072     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
12073         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
12074         Perl_Extension => 1,
12075         Note => 'Union of all non-canonical decompositions',
12076         );
12077
12078     # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
12079     # than SD appeared, construct it ourselves, based on the first release SD
12080     # was in.  A pod entry is grandfathered in for it
12081     my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
12082                                            Perl_Extension => 1,
12083                                            Fate => $INTERNAL_ONLY,
12084                                            Status => $DISCOURAGED);
12085     my $soft_dotted = property_ref('Soft_Dotted');
12086     if (defined $soft_dotted && ! $soft_dotted->is_empty) {
12087         $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
12088     }
12089     else {
12090
12091         # This list came from 3.2 Soft_Dotted.
12092         $CanonDCIJ->initialize([ 0x0069,
12093                                  0x006A,
12094                                  0x012F,
12095                                  0x0268,
12096                                  0x0456,
12097                                  0x0458,
12098                                  0x1E2D,
12099                                  0x1ECB,
12100                                ]);
12101         $CanonDCIJ = $CanonDCIJ & $Assigned;
12102     }
12103
12104     # These are used in Unicode's definition of \X
12105     my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
12106                                        Fate => $INTERNAL_ONLY);
12107     my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
12108                                         Fate => $INTERNAL_ONLY);
12109
12110     # For backward compatibility, Perl has its own definition for IDStart
12111     # First, we include the underscore, and then the regular XID_Start also
12112     # have to be Words
12113     $perl->add_match_table('_Perl_IDStart',
12114                            Perl_Extension => 1,
12115                            Fate => $INTERNAL_ONLY,
12116                            Initialize =>
12117                              ord('_')
12118                              + (property_ref('XID_Start')->table('Y') & $Word)
12119                            );
12120
12121     my $gcb = property_ref('Grapheme_Cluster_Break');
12122
12123     # The 'extended' grapheme cluster came in 5.1.  The non-extended
12124     # definition differs too much from the traditional Perl one to use.
12125     if (defined $gcb && defined $gcb->table('SpacingMark')) {
12126
12127         # Note that assumes HST is defined; it came in an earlier release than
12128         # GCB.  In the line below, two negatives means: yes hangul
12129         $begin += ~ property_ref('Hangul_Syllable_Type')
12130                                                     ->table('Not_Applicable')
12131                + ~ ($gcb->table('Control')
12132                     + $gcb->table('CR')
12133                     + $gcb->table('LF'));
12134         $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12135
12136         $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12137         $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
12138     }
12139     else {    # Old definition, used on early releases.
12140         $extend += $gc->table('Mark')
12141                 + 0x200C    # ZWNJ
12142                 + 0x200D;   # ZWJ
12143         $begin += ~ $extend;
12144
12145         # Here we may have a release that has the regular grapheme cluster
12146         # defined, or a release that doesn't have anything defined.
12147         # We set things up so the Perl core degrades gracefully, possibly with
12148         # placeholders that match nothing.
12149
12150         if (! defined $gcb) {
12151             $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12152         }
12153         my $hst = property_ref('HST');
12154         if (!defined $hst) {
12155             $hst = Property->new('HST', Status => $PLACEHOLDER);
12156             $hst->add_match_table('Not_Applicable',
12157                                 Initialize => $Any,
12158                                 Matches_All => 1);
12159         }
12160
12161         # On some releases, here we may not have the needed tables for the
12162         # perl core, in some releases we may.
12163         foreach my $name (qw{ L LV LVT T V prepend }) {
12164             my $table = $gcb->table($name);
12165             if (! defined $table) {
12166                 $table = $gcb->add_match_table($name);
12167                 push @tables_that_may_be_empty, $table->complete_name;
12168             }
12169
12170             # The HST property predates the GCB one, and has identical tables
12171             # for some of them, so use it if we can.
12172             if ($table->is_empty
12173                 && defined $hst
12174                 && defined $hst->table($name))
12175             {
12176                 $table += $hst->table($name);
12177             }
12178         }
12179     }
12180
12181     # More GCB.  If we found some hangul syllables, populate a combined
12182     # table.
12183     my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12184                                           Perl_Extension => 1,
12185                                           Fate => $INTERNAL_ONLY);
12186     my $LV = $gcb->table('LV');
12187     if ($LV->is_empty) {
12188         push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12189     } else {
12190         $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12191         $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
12192     }
12193
12194     # Was previously constructed to contain both Name and Unicode_1_Name
12195     my @composition = ('Name', 'Unicode_1_Name');
12196
12197     if (@named_sequences) {
12198         push @composition, 'Named_Sequence';
12199         foreach my $sequence (@named_sequences) {
12200             $perl_charname->add_anomalous_entry($sequence);
12201         }
12202     }
12203
12204     my $alias_sentence = "";
12205     my $alias = property_ref('Name_Alias');
12206     if (defined $alias) {
12207         push @composition, 'Name_Alias';
12208         $alias->reset_each_range;
12209         while (my ($range) = $alias->each_range) {
12210             next if $range->value eq "";
12211             if ($range->start != $range->end) {
12212                 Carp::my_carp("Expecting only one code point in the range $range.  Just to keep going, using just the first code point;");
12213             }
12214             $perl_charname->add_duplicate($range->start, $range->value);
12215         }
12216         $alias_sentence = <<END;
12217 The Name_Alias property adds duplicate code point entries with a corrected
12218 name.  The original (less correct, but still valid) name will be physically
12219 last.
12220 END
12221     }
12222     my $comment;
12223     if (@composition <= 2) { # Always at least 2
12224         $comment = join " and ", @composition;
12225     }
12226     else {
12227         $comment = join ", ", @composition[0 .. scalar @composition - 2];
12228         $comment .= ", and $composition[-1]";
12229     }
12230
12231     $perl_charname->add_comment(join_lines( <<END
12232 This file is for charnames.pm.  It is the union of the $comment properties.
12233 Unicode_1_Name entries are used only for otherwise nameless code
12234 points.
12235 $alias_sentence
12236 This file doesn't include the algorithmically determinable names.  For those,
12237 use 'unicore/Name.pm'
12238 END
12239     ));
12240     property_ref('Name')->add_comment(join_lines( <<END
12241 This file doesn't include the algorithmically determinable names.  For those,
12242 use 'unicore/Name.pm'
12243 END
12244     ));
12245
12246     # Construct the Present_In property from the Age property.
12247     if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12248         my $default_map = $age->default_map;
12249         my $in = Property->new('In',
12250                                 Default_Map => $default_map,
12251                                 Full_Name => "Present_In",
12252                                 Perl_Extension => 1,
12253                                 Type => $ENUM,
12254                                 Initialize => $age,
12255                                 );
12256         $in->add_comment(join_lines(<<END
12257 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
12258 same as for $age, and not for what $in really means.  This is because anything
12259 defined in a given release should have multiple values: that release and all
12260 higher ones.  But only one value per code point can be represented in a table
12261 like this.
12262 END
12263         ));
12264
12265         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
12266         # lowest numbered (earliest) come first, with the non-numeric one
12267         # last.
12268         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12269                                             ? 1
12270                                             : ($b->name !~ /^[\d.]*$/)
12271                                                 ? -1
12272                                                 : $a->name <=> $b->name
12273                                             } $age->tables;
12274
12275         # The Present_In property is the cumulative age properties.  The first
12276         # one hence is identical to the first age one.
12277         my $previous_in = $in->add_match_table($first_age->name);
12278         $previous_in->set_equivalent_to($first_age, Related => 1);
12279
12280         my $description_start = "Code point's usage introduced in version ";
12281         $first_age->add_description($description_start . $first_age->name);
12282
12283         # To construct the accumulated values, for each of the age tables
12284         # starting with the 2nd earliest, merge the earliest with it, to get
12285         # all those code points existing in the 2nd earliest.  Repeat merging
12286         # the new 2nd earliest with the 3rd earliest to get all those existing
12287         # in the 3rd earliest, and so on.
12288         foreach my $current_age (@rest_ages) {
12289             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
12290
12291             my $current_in = $in->add_match_table(
12292                                     $current_age->name,
12293                                     Initialize => $current_age + $previous_in,
12294                                     Description => $description_start
12295                                                     . $current_age->name
12296                                                     . ' or earlier',
12297                                     );
12298             $previous_in = $current_in;
12299
12300             # Add clarifying material for the corresponding age file.  This is
12301             # in part because of the confusing and contradictory information
12302             # given in the Standard's documentation itself, as of 5.2.
12303             $current_age->add_description(
12304                             "Code point's usage was introduced in version "
12305                             . $current_age->name);
12306             $current_age->add_note("See also $in");
12307
12308         }
12309
12310         # And finally the code points whose usages have yet to be decided are
12311         # the same in both properties.  Note that permanently unassigned code
12312         # points actually have their usage assigned (as being permanently
12313         # unassigned), so that these tables are not the same as gc=cn.
12314         my $unassigned = $in->add_match_table($default_map);
12315         my $age_default = $age->table($default_map);
12316         $age_default->add_description(<<END
12317 Code point's usage has not been assigned in any Unicode release thus far.
12318 END
12319         );
12320         $unassigned->set_equivalent_to($age_default, Related => 1);
12321     }
12322
12323
12324     # Finished creating all the perl properties.  All non-internal non-string
12325     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
12326     # an underscore.)  These do not get a separate entry in the pod file
12327     foreach my $table ($perl->tables) {
12328         foreach my $alias ($table->aliases) {
12329             next if $alias->name =~ /^_/;
12330             $table->add_alias('Is_' . $alias->name,
12331                                Re_Pod_Entry => 0,
12332                                UCD => 0,
12333                                Status => $alias->status,
12334                                OK_as_Filename => 0);
12335         }
12336     }
12337
12338     # Here done with all the basic stuff.  Ready to populate the information
12339     # about each character if annotating them.
12340     if ($annotate) {
12341
12342         # See comments at its declaration
12343         $annotate_ranges = Range_Map->new;
12344
12345         # This separates out the non-characters from the other unassigneds, so
12346         # can give different annotations for each.
12347         $unassigned_sans_noncharacters = Range_List->new(
12348          Initialize => $gc->table('Unassigned')
12349                        & property_ref('Noncharacter_Code_Point')->table('N'));
12350
12351         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
12352             $i = populate_char_info($i);    # Note sets $i so may cause skips
12353         }
12354     }
12355
12356     return;
12357 }
12358
12359 sub add_perl_synonyms() {
12360     # A number of Unicode tables have Perl synonyms that are expressed in
12361     # the single-form, \p{name}.  These are:
12362     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12363     #       \p{Is_Name} as synonyms
12364     #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12365     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12366     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12367     #       conflict, \p{Value} and \p{Is_Value} as well
12368     #
12369     # This routine generates these synonyms, warning of any unexpected
12370     # conflicts.
12371
12372     # Construct the list of tables to get synonyms for.  Start with all the
12373     # binary and the General_Category ones.
12374     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
12375                                                             property_ref('*');
12376     push @tables, $gc->tables;
12377
12378     # If the version of Unicode includes the Script property, add its tables
12379     push @tables, $script->tables if defined $script;
12380
12381     # The Block tables are kept separate because they are treated differently.
12382     # And the earliest versions of Unicode didn't include them, so add only if
12383     # there are some.
12384     my @blocks;
12385     push @blocks, $block->tables if defined $block;
12386
12387     # Here, have the lists of tables constructed.  Process blocks last so that
12388     # if there are name collisions with them, blocks have lowest priority.
12389     # Should there ever be other collisions, manual intervention would be
12390     # required.  See the comments at the beginning of the program for a
12391     # possible way to handle those semi-automatically.
12392     foreach my $table (@tables,  @blocks) {
12393
12394         # For non-binary properties, the synonym is just the name of the
12395         # table, like Greek, but for binary properties the synonym is the name
12396         # of the property, and means the code points in its 'Y' table.
12397         my $nominal = $table;
12398         my $nominal_property = $nominal->property;
12399         my $actual;
12400         if (! $nominal->isa('Property')) {
12401             $actual = $table;
12402         }
12403         else {
12404
12405             # Here is a binary property.  Use the 'Y' table.  Verify that is
12406             # there
12407             my $yes = $nominal->table('Y');
12408             unless (defined $yes) {  # Must be defined, but is permissible to
12409                                      # be empty.
12410                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
12411                 next;
12412             }
12413             $actual = $yes;
12414         }
12415
12416         foreach my $alias ($nominal->aliases) {
12417
12418             # Attempt to create a table in the perl directory for the
12419             # candidate table, using whatever aliases in it that don't
12420             # conflict.  Also add non-conflicting aliases for all these
12421             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12422             PREFIX:
12423             foreach my $prefix ("", 'Is_', 'In_') {
12424
12425                 # Only Block properties can have added 'In_' aliases.
12426                 next if $prefix eq 'In_' and $nominal_property != $block;
12427
12428                 my $proposed_name = $prefix . $alias->name;
12429
12430                 # No Is_Is, In_In, nor combinations thereof
12431                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12432                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12433
12434                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12435
12436                 # Get a reference to any existing table in the perl
12437                 # directory with the desired name.
12438                 my $pre_existing = $perl->table($proposed_name);
12439
12440                 if (! defined $pre_existing) {
12441
12442                     # No name collision, so ok to add the perl synonym.
12443
12444                     my $make_re_pod_entry;
12445                     my $ok_as_filename;
12446                     my $status = $alias->status;
12447                     if ($nominal_property == $block) {
12448
12449                         # For block properties, the 'In' form is preferred for
12450                         # external use; the pod file contains wild cards for
12451                         # this and the 'Is' form so no entries for those; and
12452                         # we don't want people using the name without the
12453                         # 'In', so discourage that.
12454                         if ($prefix eq "") {
12455                             $make_re_pod_entry = 1;
12456                             $status = $status || $DISCOURAGED;
12457                             $ok_as_filename = 0;
12458                         }
12459                         elsif ($prefix eq 'In_') {
12460                             $make_re_pod_entry = 0;
12461                             $status = $status || $NORMAL;
12462                             $ok_as_filename = 1;
12463                         }
12464                         else {
12465                             $make_re_pod_entry = 0;
12466                             $status = $status || $DISCOURAGED;
12467                             $ok_as_filename = 0;
12468                         }
12469                     }
12470                     elsif ($prefix ne "") {
12471
12472                         # The 'Is' prefix is handled in the pod by a wild
12473                         # card, and we won't use it for an external name
12474                         $make_re_pod_entry = 0;
12475                         $status = $status || $NORMAL;
12476                         $ok_as_filename = 0;
12477                     }
12478                     else {
12479
12480                         # Here, is an empty prefix, non block.  This gets its
12481                         # own pod entry and can be used for an external name.
12482                         $make_re_pod_entry = 1;
12483                         $status = $status || $NORMAL;
12484                         $ok_as_filename = 1;
12485                     }
12486
12487                     # Here, there isn't a perl pre-existing table with the
12488                     # name.  Look through the list of equivalents of this
12489                     # table to see if one is a perl table.
12490                     foreach my $equivalent ($actual->leader->equivalents) {
12491                         next if $equivalent->property != $perl;
12492
12493                         # Here, have found a table for $perl.  Add this alias
12494                         # to it, and are done with this prefix.
12495                         $equivalent->add_alias($proposed_name,
12496                                         Re_Pod_Entry => $make_re_pod_entry,
12497
12498                                         # Currently don't output these in the
12499                                         # ucd pod, as are strongly discouraged
12500                                         # from being used
12501                                         UCD => 0,
12502
12503                                         Status => $status,
12504                                         OK_as_Filename => $ok_as_filename);
12505                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12506                         next PREFIX;
12507                     }
12508
12509                     # Here, $perl doesn't already have a table that is a
12510                     # synonym for this property, add one.
12511                     my $added_table = $perl->add_match_table($proposed_name,
12512                                             Re_Pod_Entry => $make_re_pod_entry,
12513
12514                                             # See UCD comment just above
12515                                             UCD => 0,
12516
12517                                             Status => $status,
12518                                             OK_as_Filename => $ok_as_filename);
12519                     # And it will be related to the actual table, since it is
12520                     # based on it.
12521                     $added_table->set_equivalent_to($actual, Related => 1);
12522                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12523                     next;
12524                 } # End of no pre-existing.
12525
12526                 # Here, there is a pre-existing table that has the proposed
12527                 # name.  We could be in trouble, but not if this is just a
12528                 # synonym for another table that we have already made a child
12529                 # of the pre-existing one.
12530                 if ($pre_existing->is_set_equivalent_to($actual)) {
12531                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12532                     $pre_existing->add_alias($proposed_name);
12533                     next;
12534                 }
12535
12536                 # Here, there is a name collision, but it still could be ok if
12537                 # the tables match the identical set of code points, in which
12538                 # case, we can combine the names.  Compare each table's code
12539                 # point list to see if they are identical.
12540                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12541                 if ($pre_existing->matches_identically_to($actual)) {
12542
12543                     # Here, they do match identically.  Not a real conflict.
12544                     # Make the perl version a child of the Unicode one, except
12545                     # in the non-obvious case of where the perl name is
12546                     # already a synonym of another Unicode property.  (This is
12547                     # excluded by the test for it being its own parent.)  The
12548                     # reason for this exclusion is that then the two Unicode
12549                     # properties become related; and we don't really know if
12550                     # they are or not.  We generate documentation based on
12551                     # relatedness, and this would be misleading.  Code
12552                     # later executed in the process will cause the tables to
12553                     # be represented by a single file anyway, without making
12554                     # it look in the pod like they are necessarily related.
12555                     if ($pre_existing->parent == $pre_existing
12556                         && ($pre_existing->property == $perl
12557                             || $actual->property == $perl))
12558                     {
12559                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12560                         $pre_existing->set_equivalent_to($actual, Related => 1);
12561                     }
12562                     elsif (main::DEBUG && $to_trace) {
12563                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12564                         trace $pre_existing->parent;
12565                     }
12566                     next PREFIX;
12567                 }
12568
12569                 # Here they didn't match identically, there is a real conflict
12570                 # between our new name and a pre-existing property.
12571                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12572                 $pre_existing->add_conflicting($nominal->full_name,
12573                                                'p',
12574                                                $actual);
12575
12576                 # Don't output a warning for aliases for the block
12577                 # properties (unless they start with 'In_') as it is
12578                 # expected that there will be conflicts and the block
12579                 # form loses.
12580                 if ($verbosity >= $NORMAL_VERBOSITY
12581                     && ($actual->property != $block || $prefix eq 'In_'))
12582                 {
12583                     print simple_fold(join_lines(<<END
12584 There is already an alias named $proposed_name (from " . $pre_existing . "),
12585 so not creating this alias for " . $actual
12586 END
12587                     ), "", 4);
12588                 }
12589
12590                 # Keep track for documentation purposes.
12591                 $has_In_conflicts++ if $prefix eq 'In_';
12592                 $has_Is_conflicts++ if $prefix eq 'Is_';
12593             }
12594         }
12595     }
12596
12597     # There are some properties which have No and Yes (and N and Y) as
12598     # property values, but aren't binary, and could possibly be confused with
12599     # binary ones.  So create caveats for them.  There are tables that are
12600     # named 'No', and tables that are named 'N', but confusion is not likely
12601     # unless they are the same table.  For example, N meaning Number or
12602     # Neutral is not likely to cause confusion, so don't add caveats to things
12603     # like them.
12604     foreach my $property (grep { $_->type != $BINARY
12605                                  && $_->type != $FORCED_BINARY }
12606                                                             property_ref('*'))
12607     {
12608         my $yes = $property->table('Yes');
12609         if (defined $yes) {
12610             my $y = $property->table('Y');
12611             if (defined $y && $yes == $y) {
12612                 foreach my $alias ($property->aliases) {
12613                     $yes->add_conflicting($alias->name);
12614                 }
12615             }
12616         }
12617         my $no = $property->table('No');
12618         if (defined $no) {
12619             my $n = $property->table('N');
12620             if (defined $n && $no == $n) {
12621                 foreach my $alias ($property->aliases) {
12622                     $no->add_conflicting($alias->name, 'P');
12623                 }
12624             }
12625         }
12626     }
12627
12628     return;
12629 }
12630
12631 sub register_file_for_name($$$) {
12632     # Given info about a table and a datafile that it should be associated
12633     # with, register that association
12634
12635     my $table = shift;
12636     my $directory_ref = shift;   # Array of the directory path for the file
12637     my $file = shift;            # The file name in the final directory.
12638     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12639
12640     trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12641
12642     if ($table->isa('Property')) {
12643         $table->set_file_path(@$directory_ref, $file);
12644         push @map_properties, $table;
12645
12646         # No swash means don't do the rest of this.
12647         return if $table->fate != $ORDINARY;
12648
12649         # Get the path to the file
12650         my @path = $table->file_path;
12651
12652         # Use just the file name if no subdirectory.
12653         shift @path if $path[0] eq File::Spec->curdir();
12654
12655         my $file = join '/', @path;
12656
12657         # Create a hash entry for utf8_heavy to get the file that stores this
12658         # property's map table
12659         foreach my $alias ($table->aliases) {
12660             my $name = $alias->name;
12661             $loose_property_to_file_of{standardize($name)} = $file;
12662         }
12663
12664         # And a way for utf8_heavy to find the proper key in the SwashInfo
12665         # hash for this property.
12666         $file_to_swash_name{$file} = "To" . $table->swash_name;
12667         return;
12668     }
12669
12670     # Do all of the work for all equivalent tables when called with the leader
12671     # table, so skip if isn't the leader.
12672     return if $table->leader != $table;
12673
12674     # If this is a complement of another file, use that other file instead,
12675     # with a ! prepended to it.
12676     my $complement;
12677     if (($complement = $table->complement) != 0) {
12678         my @directories = $complement->file_path;
12679
12680         # This assumes that the 0th element is something like 'lib',
12681         # the 1th element the property name (in its own directory), like
12682         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12683         # appended to it later.
12684         $directories[1] =~ s/^/!/;
12685         $file = pop @directories;
12686         $directory_ref =\@directories;
12687     }
12688
12689     # Join all the file path components together, using slashes.
12690     my $full_filename = join('/', @$directory_ref, $file);
12691
12692     # All go in the same subdirectory of unicore
12693     if ($directory_ref->[0] ne $matches_directory) {
12694         Carp::my_carp("Unexpected directory in "
12695                 .  join('/', @{$directory_ref}, $file));
12696     }
12697
12698     # For this table and all its equivalents ...
12699     foreach my $table ($table, $table->equivalents) {
12700
12701         # Associate it with its file internally.  Don't include the
12702         # $matches_directory first component
12703         $table->set_file_path(@$directory_ref, $file);
12704
12705         # No swash means don't do the rest of this.
12706         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
12707
12708         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12709
12710         my $property = $table->property;
12711         my $property_name = ($property == $perl)
12712                              ? ""  # 'perl' is never explicitly stated
12713                              : standardize($property->name) . '=';
12714
12715         my $is_default = 0; # Is this table the default one for the property?
12716
12717         # To calculate $is_default, we find if this table is the same as the
12718         # default one for the property.  But this is complicated by the
12719         # possibility that there is a master table for this one, and the
12720         # information is stored there instead of here.
12721         my $parent = $table->parent;
12722         my $leader_prop = $parent->property;
12723         my $default_map = $leader_prop->default_map;
12724         if (defined $default_map) {
12725             my $default_table = $leader_prop->table($default_map);
12726             $is_default = 1 if defined $default_table && $parent == $default_table;
12727         }
12728
12729         # Calculate the loose name for this table.  Mostly it's just its name,
12730         # standardized.  But in the case of Perl tables that are single-form
12731         # equivalents to Unicode properties, it is the latter's name.
12732         my $loose_table_name =
12733                         ($property != $perl || $leader_prop == $perl)
12734                         ? standardize($table->name)
12735                         : standardize($parent->name);
12736
12737         my $deprecated = ($table->status eq $DEPRECATED)
12738                          ? $table->status_info
12739                          : "";
12740         my $caseless_equivalent = $table->caseless_equivalent;
12741
12742         # And for each of the table's aliases...  This inner loop eventually
12743         # goes through all aliases in the UCD that we generate regex match
12744         # files for
12745         foreach my $alias ($table->aliases) {
12746             my $standard = utf8_heavy_name($table, $alias);
12747
12748             # Generate an entry in either the loose or strict hashes, which
12749             # will translate the property and alias names combination into the
12750             # file where the table for them is stored.
12751             if ($alias->loose_match) {
12752                 if (exists $loose_to_file_of{$standard}) {
12753                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12754                 }
12755                 else {
12756                     $loose_to_file_of{$standard} = $sub_filename;
12757                 }
12758             }
12759             else {
12760                 if (exists $stricter_to_file_of{$standard}) {
12761                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12762                 }
12763                 else {
12764                     $stricter_to_file_of{$standard} = $sub_filename;
12765
12766                     # Tightly coupled with how utf8_heavy.pl works, for a
12767                     # floating point number that is a whole number, get rid of
12768                     # the trailing decimal point and 0's, so that utf8_heavy
12769                     # will work.  Also note that this assumes that such a
12770                     # number is matched strictly; so if that were to change,
12771                     # this would be wrong.
12772                     if ((my $integer_name = $alias->name)
12773                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12774                     {
12775                         $stricter_to_file_of{$property_name . $integer_name}
12776                                                             = $sub_filename;
12777                     }
12778                 }
12779             }
12780
12781             # For Unicode::UCD, create a mapping of the prop=value to the
12782             # canonical =value for that property.
12783             if ($standard =~ /=/) {
12784
12785                 # This could happen if a strict name mapped into an existing
12786                 # loose name.  In that event, the strict names would have to
12787                 # be moved to a new hash.
12788                 if (exists($loose_to_standard_value{$standard})) {
12789                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
12790                 }
12791                 $loose_to_standard_value{$standard} = $loose_table_name;
12792             }
12793
12794             # Keep a list of the deprecated properties and their filenames
12795             if ($deprecated && $complement == 0) {
12796                 $utf8::why_deprecated{$sub_filename} = $deprecated;
12797             }
12798
12799             # And a substitute table, if any, for case-insensitive matching
12800             if ($caseless_equivalent != 0) {
12801                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12802             }
12803
12804             # Add to defaults list if the table this alias belongs to is the
12805             # default one
12806             $loose_defaults{$standard} = 1 if $is_default;
12807         }
12808     }
12809
12810     return;
12811 }
12812
12813 {   # Closure
12814     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
12815                      # conflicts
12816     my %full_dir_name_of;   # Full length names of directories used.
12817
12818     sub construct_filename($$$) {
12819         # Return a file name for a table, based on the table name, but perhaps
12820         # changed to get rid of non-portable characters in it, and to make
12821         # sure that it is unique on a file system that allows the names before
12822         # any period to be at most 8 characters (DOS).  While we're at it
12823         # check and complain if there are any directory conflicts.
12824
12825         my $name = shift;       # The name to start with
12826         my $mutable = shift;    # Boolean: can it be changed?  If no, but
12827                                 # yet it must be to work properly, a warning
12828                                 # is given
12829         my $directories_ref = shift;  # A reference to an array containing the
12830                                 # path to the file, with each element one path
12831                                 # component.  This is used because the same
12832                                 # name can be used in different directories.
12833         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12834
12835         my $warn = ! defined wantarray;  # If true, then if the name is
12836                                 # changed, a warning is issued as well.
12837
12838         if (! defined $name) {
12839             Carp::my_carp("Undefined name in directory "
12840                           . File::Spec->join(@$directories_ref)
12841                           . ". '_' used");
12842             return '_';
12843         }
12844
12845         # Make sure that no directory names conflict with each other.  Look at
12846         # each directory in the input file's path.  If it is already in use,
12847         # assume it is correct, and is merely being re-used, but if we
12848         # truncate it to 8 characters, and find that there are two directories
12849         # that are the same for the first 8 characters, but differ after that,
12850         # then that is a problem.
12851         foreach my $directory (@$directories_ref) {
12852             my $short_dir = substr($directory, 0, 8);
12853             if (defined $full_dir_name_of{$short_dir}) {
12854                 next if $full_dir_name_of{$short_dir} eq $directory;
12855                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
12856             }
12857             else {
12858                 $full_dir_name_of{$short_dir} = $directory;
12859             }
12860         }
12861
12862         my $path = join '/', @$directories_ref;
12863         $path .= '/' if $path;
12864
12865         # Remove interior underscores.
12866         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12867
12868         # Change any non-word character into an underscore, and truncate to 8.
12869         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
12870         substr($filename, 8) = "" if length($filename) > 8;
12871
12872         # Make sure the basename doesn't conflict with something we
12873         # might have already written. If we have, say,
12874         #     InGreekExtended1
12875         #     InGreekExtended2
12876         # they become
12877         #     InGreekE
12878         #     InGreek2
12879         my $warned = 0;
12880         while (my $num = $base_names{$path}{lc $filename}++) {
12881             $num++; # so basenames with numbers start with '2', which
12882                     # just looks more natural.
12883
12884             # Want to append $num, but if it'll make the basename longer
12885             # than 8 characters, pre-truncate $filename so that the result
12886             # is acceptable.
12887             my $delta = length($filename) + length($num) - 8;
12888             if ($delta > 0) {
12889                 substr($filename, -$delta) = $num;
12890             }
12891             else {
12892                 $filename .= $num;
12893             }
12894             if ($warn && ! $warned) {
12895                 $warned = 1;
12896                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
12897             }
12898         }
12899
12900         return $filename if $mutable;
12901
12902         # If not changeable, must return the input name, but warn if needed to
12903         # change it beyond shortening it.
12904         if ($name ne $filename
12905             && substr($name, 0, length($filename)) ne $filename) {
12906             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
12907         }
12908         return $name;
12909     }
12910 }
12911
12912 # The pod file contains a very large table.  Many of the lines in that table
12913 # would exceed a typical output window's size, and so need to be wrapped with
12914 # a hanging indent to make them look good.  The pod language is really
12915 # insufficient here.  There is no general construct to do that in pod, so it
12916 # is done here by beginning each such line with a space to cause the result to
12917 # be output without formatting, and doing all the formatting here.  This leads
12918 # to the result that if the eventual display window is too narrow it won't
12919 # look good, and if the window is too wide, no advantage is taken of that
12920 # extra width.  A further complication is that the output may be indented by
12921 # the formatter so that there is less space than expected.  What I (khw) have
12922 # done is to assume that that indent is a particular number of spaces based on
12923 # what it is in my Linux system;  people can always resize their windows if
12924 # necessary, but this is obviously less than desirable, but the best that can
12925 # be expected.
12926 my $automatic_pod_indent = 8;
12927
12928 # Try to format so that uses fewest lines, but few long left column entries
12929 # slide into the right column.  An experiment on 5.1 data yielded the
12930 # following percentages that didn't cut into the other side along with the
12931 # associated first-column widths
12932 # 69% = 24
12933 # 80% not too bad except for a few blocks
12934 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12935 # 95% = 37;
12936 my $indent_info_column = 27;    # 75% of lines didn't have overlap
12937
12938 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
12939                     # The 3 is because of:
12940                     #   1   for the leading space to tell the pod formatter to
12941                     #       output as-is
12942                     #   1   for the flag
12943                     #   1   for the space between the flag and the main data
12944
12945 sub format_pod_line ($$$;$$) {
12946     # Take a pod line and return it, formatted properly
12947
12948     my $first_column_width = shift;
12949     my $entry = shift;  # Contents of left column
12950     my $info = shift;   # Contents of right column
12951
12952     my $status = shift || "";   # Any flag
12953
12954     my $loose_match = shift;    # Boolean.
12955     $loose_match = 1 unless defined $loose_match;
12956
12957     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12958
12959     my $flags = "";
12960     $flags .= $STRICTER if ! $loose_match;
12961
12962     $flags .= $status if $status;
12963
12964     # There is a blank in the left column to cause the pod formatter to
12965     # output the line as-is.
12966     return sprintf " %-*s%-*s %s\n",
12967                     # The first * in the format is replaced by this, the -1 is
12968                     # to account for the leading blank.  There isn't a
12969                     # hard-coded blank after this to separate the flags from
12970                     # the rest of the line, so that in the unlikely event that
12971                     # multiple flags are shown on the same line, they both
12972                     # will get displayed at the expense of that separation,
12973                     # but since they are left justified, a blank will be
12974                     # inserted in the normal case.
12975                     $FILLER - 1,
12976                     $flags,
12977
12978                     # The other * in the format is replaced by this number to
12979                     # cause the first main column to right fill with blanks.
12980                     # The -1 is for the guaranteed blank following it.
12981                     $first_column_width - $FILLER - 1,
12982                     $entry,
12983                     $info;
12984 }
12985
12986 my @zero_match_tables;  # List of tables that have no matches in this release
12987
12988 sub make_re_pod_entries($) {
12989     # This generates the entries for the pod file for a given table.
12990     # Also done at this time are any children tables.  The output looks like:
12991     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
12992
12993     my $input_table = shift;        # Table the entry is for
12994     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12995
12996     # Generate parent and all its children at the same time.
12997     return if $input_table->parent != $input_table;
12998
12999     my $property = $input_table->property;
13000     my $type = $property->type;
13001     my $full_name = $property->full_name;
13002
13003     my $count = $input_table->count;
13004     my $string_count = clarify_number($count);
13005     my $status = $input_table->status;
13006     my $status_info = $input_table->status_info;
13007     my $caseless_equivalent = $input_table->caseless_equivalent;
13008
13009     my $entry_for_first_table; # The entry for the first table output.
13010                            # Almost certainly, it is the parent.
13011
13012     # For each related table (including itself), we will generate a pod entry
13013     # for each name each table goes by
13014     foreach my $table ($input_table, $input_table->children) {
13015
13016         # utf8_heavy.pl cannot deal with null string property values, so skip
13017         # any tables that have no non-null names.
13018         next if ! grep { $_->name ne "" } $table->aliases;
13019
13020         # First, gather all the info that applies to this table as a whole.
13021
13022         push @zero_match_tables, $table if $count == 0;
13023
13024         my $table_property = $table->property;
13025
13026         # The short name has all the underscores removed, while the full name
13027         # retains them.  Later, we decide whether to output a short synonym
13028         # for the full one, we need to compare apples to apples, so we use the
13029         # short name's length including underscores.
13030         my $table_property_short_name_length;
13031         my $table_property_short_name
13032             = $table_property->short_name(\$table_property_short_name_length);
13033         my $table_property_full_name = $table_property->full_name;
13034
13035         # Get how much savings there is in the short name over the full one
13036         # (delta will always be <= 0)
13037         my $table_property_short_delta = $table_property_short_name_length
13038                                          - length($table_property_full_name);
13039         my @table_description = $table->description;
13040         my @table_note = $table->note;
13041
13042         # Generate an entry for each alias in this table.
13043         my $entry_for_first_alias;  # saves the first one encountered.
13044         foreach my $alias ($table->aliases) {
13045
13046             # Skip if not to go in pod.
13047             next unless $alias->make_re_pod_entry;
13048
13049             # Start gathering all the components for the entry
13050             my $name = $alias->name;
13051
13052             # Skip if name is empty, as can't be accessed by regexes.
13053             next if $name eq "";
13054
13055             my $entry;      # Holds the left column, may include extras
13056             my $entry_ref;  # To refer to the left column's contents from
13057                             # another entry; has no extras
13058
13059             # First the left column of the pod entry.  Tables for the $perl
13060             # property always use the single form.
13061             if ($table_property == $perl) {
13062                 $entry = "\\p{$name}";
13063                 $entry_ref = "\\p{$name}";
13064             }
13065             else {    # Compound form.
13066
13067                 # Only generate one entry for all the aliases that mean true
13068                 # or false in binary properties.  Append a '*' to indicate
13069                 # some are missing.  (The heading comment notes this.)
13070                 my $rhs;
13071                 if ($type == $BINARY) {
13072                     next if $name ne 'N' && $name ne 'Y';
13073                     $rhs = "$name*";
13074                 }
13075                 elsif ($type != $FORCED_BINARY) {
13076                     $rhs = $name;
13077                 }
13078                 else {
13079
13080                     # Forced binary properties require special handling.  It
13081                     # has two sets of tables, one set is true/false; and the
13082                     # other set is everything else.  Entries are generated for
13083                     # each set.  Use the Bidi_Mirrored property (which appears
13084                     # in all Unicode versions) to get a list of the aliases
13085                     # for the true/false tables.  Of these, only output the N
13086                     # and Y ones, the same as, a regular binary property.  And
13087                     # output all the rest, same as a non-binary property.
13088                     my $bm = property_ref("Bidi_Mirrored");
13089                     if ($name eq 'N' || $name eq 'Y') {
13090                         $rhs = "$name*";
13091                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
13092                                                         $bm->table("N")->aliases)
13093                     {
13094                         next;
13095                     }
13096                     else {
13097                         $rhs = $name;
13098                     }
13099                 }
13100
13101                 # Colon-space is used to give a little more space to be easier
13102                 # to read;
13103                 $entry = "\\p{"
13104                         . $table_property_full_name
13105                         . ": $rhs}";
13106
13107                 # But for the reference to this entry, which will go in the
13108                 # right column, where space is at a premium, use equals
13109                 # without a space
13110                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
13111             }
13112
13113             # Then the right (info) column.  This is stored as components of
13114             # an array for the moment, then joined into a string later.  For
13115             # non-internal only properties, begin the info with the entry for
13116             # the first table we encountered (if any), as things are ordered
13117             # so that that one is the most descriptive.  This leads to the
13118             # info column of an entry being a more descriptive version of the
13119             # name column
13120             my @info;
13121             if ($name =~ /^_/) {
13122                 push @info,
13123                         '(For internal use by Perl, not necessarily stable)';
13124             }
13125             elsif ($entry_for_first_alias) {
13126                 push @info, $entry_for_first_alias;
13127             }
13128
13129             # If this entry is equivalent to another, add that to the info,
13130             # using the first such table we encountered
13131             if ($entry_for_first_table) {
13132                 if (@info) {
13133                     push @info, "(= $entry_for_first_table)";
13134                 }
13135                 else {
13136                     push @info, $entry_for_first_table;
13137                 }
13138             }
13139
13140             # If the name is a large integer, add an equivalent with an
13141             # exponent for better readability
13142             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
13143                 push @info, sprintf "(= %.1e)", $name
13144             }
13145
13146             my $parenthesized = "";
13147             if (! $entry_for_first_alias) {
13148
13149                 # This is the first alias for the current table.  The alias
13150                 # array is ordered so that this is the fullest, most
13151                 # descriptive alias, so it gets the fullest info.  The other
13152                 # aliases are mostly merely pointers to this one, using the
13153                 # information already added above.
13154
13155                 # Display any status message, but only on the parent table
13156                 if ($status && ! $entry_for_first_table) {
13157                     push @info, $status_info;
13158                 }
13159
13160                 # Put out any descriptive info
13161                 if (@table_description || @table_note) {
13162                     push @info, join "; ", @table_description, @table_note;
13163                 }
13164
13165                 # Look to see if there is a shorter name we can point people
13166                 # at
13167                 my $standard_name = standardize($name);
13168                 my $short_name;
13169                 my $proposed_short = $table->short_name;
13170                 if (defined $proposed_short) {
13171                     my $standard_short = standardize($proposed_short);
13172
13173                     # If the short name is shorter than the standard one, or
13174                     # even it it's not, but the combination of it and its
13175                     # short property name (as in \p{prop=short} ($perl doesn't
13176                     # have this form)) saves at least two characters, then,
13177                     # cause it to be listed as a shorter synonym.
13178                     if (length $standard_short < length $standard_name
13179                         || ($table_property != $perl
13180                             && (length($standard_short)
13181                                 - length($standard_name)
13182                                 + $table_property_short_delta)  # (<= 0)
13183                                 < -2))
13184                     {
13185                         $short_name = $proposed_short;
13186                         if ($table_property != $perl) {
13187                             $short_name = $table_property_short_name
13188                                           . "=$short_name";
13189                         }
13190                         $short_name = "\\p{$short_name}";
13191                     }
13192                 }
13193
13194                 # And if this is a compound form name, see if there is a
13195                 # single form equivalent
13196                 my $single_form;
13197                 if ($table_property != $perl) {
13198
13199                     # Special case the binary N tables, so that will print
13200                     # \P{single}, but use the Y table values to populate
13201                     # 'single', as we haven't likewise populated the N table.
13202                     # For forced binary tables, we can't just look at the N
13203                     # table, but must see if this table is equivalent to the N
13204                     # one, as there are two equivalent beasts in these
13205                     # properties.
13206                     my $test_table;
13207                     my $p;
13208                     if (   ($type == $BINARY
13209                             && $input_table == $property->table('No'))
13210                         || ($type == $FORCED_BINARY
13211                             && $property->table('No')->
13212                                         is_set_equivalent_to($input_table)))
13213                     {
13214                         $test_table = $property->table('Yes');
13215                         $p = 'P';
13216                     }
13217                     else {
13218                         $test_table = $input_table;
13219                         $p = 'p';
13220                     }
13221
13222                     # Look for a single form amongst all the children.
13223                     foreach my $table ($test_table->children) {
13224                         next if $table->property != $perl;
13225                         my $proposed_name = $table->short_name;
13226                         next if ! defined $proposed_name;
13227
13228                         # Don't mention internal-only properties as a possible
13229                         # single form synonym
13230                         next if substr($proposed_name, 0, 1) eq '_';
13231
13232                         $proposed_name = "\\$p\{$proposed_name}";
13233                         if (! defined $single_form
13234                             || length($proposed_name) < length $single_form)
13235                         {
13236                             $single_form = $proposed_name;
13237
13238                             # The goal here is to find a single form; not the
13239                             # shortest possible one.  We've already found a
13240                             # short name.  So, stop at the first single form
13241                             # found, which is likely to be closer to the
13242                             # original.
13243                             last;
13244                         }
13245                     }
13246                 }
13247
13248                 # Ouput both short and single in the same parenthesized
13249                 # expression, but with only one of 'Single', 'Short' if there
13250                 # are both items.
13251                 if ($short_name || $single_form || $table->conflicting) {
13252                     $parenthesized .= "Short: $short_name" if $short_name;
13253                     if ($short_name && $single_form) {
13254                         $parenthesized .= ', ';
13255                     }
13256                     elsif ($single_form) {
13257                         $parenthesized .= 'Single: ';
13258                     }
13259                     $parenthesized .= $single_form if $single_form;
13260                 }
13261             }
13262
13263             if ($caseless_equivalent != 0) {
13264                 $parenthesized .=  '; ' if $parenthesized ne "";
13265                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13266             }
13267
13268
13269             # Warn if this property isn't the same as one that a
13270             # semi-casual user might expect.  The other components of this
13271             # parenthesized structure are calculated only for the first entry
13272             # for this table, but the conflicting is deemed important enough
13273             # to go on every entry.
13274             my $conflicting = join " NOR ", $table->conflicting;
13275             if ($conflicting) {
13276                 $parenthesized .=  '; ' if $parenthesized ne "";
13277                 $parenthesized .= "NOT $conflicting";
13278             }
13279
13280             push @info, "($parenthesized)" if $parenthesized;
13281
13282             if ($name =~ /_$/ && $alias->loose_match) {
13283                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13284             }
13285
13286             if ($table_property != $perl && $table->perl_extension) {
13287                 push @info, '(Perl extension)';
13288             }
13289             push @info, "($string_count)";
13290
13291             # Now, we have both the entry and info so add them to the
13292             # list of all the properties.
13293             push @match_properties,
13294                 format_pod_line($indent_info_column,
13295                                 $entry,
13296                                 join( " ", @info),
13297                                 $alias->status,
13298                                 $alias->loose_match);
13299
13300             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13301         } # End of looping through the aliases for this table.
13302
13303         if (! $entry_for_first_table) {
13304             $entry_for_first_table = $entry_for_first_alias;
13305         }
13306     } # End of looping through all the related tables
13307     return;
13308 }
13309
13310 sub make_ucd_table_pod_entries {
13311     my $table = shift;
13312
13313     # Generate the entries for the UCD section of the pod for $table.  This
13314     # also calculates if names are ambiguous, so has to be called even if the
13315     # pod is not being output
13316
13317     my $short_name = $table->name;
13318     my $standard_short_name = standardize($short_name);
13319     my $full_name = $table->full_name;
13320     my $standard_full_name = standardize($full_name);
13321
13322     my $full_info = "";     # Text of info column for full-name entries
13323     my $other_info = "";    # Text of info column for short-name entries
13324     my $short_info = "";    # Text of info column for other entries
13325     my $meaning = "";       # Synonym of this table
13326
13327     my $property = ($table->isa('Property'))
13328                    ? $table
13329                    : $table->parent->property;
13330
13331     my $perl_extension = $table->perl_extension;
13332
13333     # Get the more official name for for perl extensions that aren't
13334     # stand-alone properties
13335     if ($perl_extension && $property != $table) {
13336         if ($property == $perl ||$property->type == $BINARY) {
13337             $meaning = $table->complete_name;
13338         }
13339         else {
13340             $meaning = $property->full_name . "=$full_name";
13341         }
13342     }
13343
13344     # There are three types of info column.  One for the short name, one for
13345     # the full name, and one for everything else.  They mostly are the same,
13346     # so initialize in the same loop.
13347     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
13348         if ($perl_extension && $property != $table) {
13349
13350             # Add the synonymous name for the non-full name entries; and to
13351             # the full-name entry if it adds extra information
13352             if ($info_ref == \$other_info
13353                 || ($info_ref == \$short_info
13354                     && $standard_short_name ne $standard_full_name)
13355                 || standardize($meaning) ne $standard_full_name
13356             ) {
13357                 $$info_ref .= "$meaning.";
13358             }
13359         }
13360         elsif ($info_ref != \$full_info) {
13361
13362             # Otherwise, the non-full name columns include the full name
13363             $$info_ref .= $full_name;
13364         }
13365
13366         # And the full-name entry includes the short name, if different
13367         if ($info_ref == \$full_info
13368             && $standard_short_name ne $standard_full_name)
13369         {
13370             $full_info =~ s/\.\Z//;
13371             $full_info .= "  " if $full_info;
13372             $full_info .= "(Short: $short_name)";
13373         }
13374
13375         if ($table->perl_extension) {
13376             $$info_ref =~ s/\.\Z//;
13377             $$info_ref .= ".  " if $$info_ref;
13378             $$info_ref .= "(Perl extension)";
13379         }
13380     }
13381
13382     # Add any extra annotations to the full name entry
13383     foreach my $more_info ($table->description,
13384                             $table->note,
13385                             $table->status_info)
13386     {
13387         next unless $more_info;
13388         $full_info =~ s/\.\Z//;
13389         $full_info .= ".  " if $full_info;
13390         $full_info .= $more_info;
13391     }
13392
13393     # These keep track if have created full and short name pod entries for the
13394     # property
13395     my $done_full = 0;
13396     my $done_short = 0;
13397
13398     # Every possible name is kept track of, even those that aren't going to be
13399     # output.  This way we can be sure to find the ambiguities.
13400     foreach my $alias ($table->aliases) {
13401         my $name = $alias->name;
13402         my $standard = standardize($name);
13403         my $info;
13404         my $output_this = $alias->ucd;
13405
13406         # If the full and short names are the same, we want to output the full
13407         # one's entry, so it has priority.
13408         if ($standard eq $standard_full_name) {
13409             next if $done_full;
13410             $done_full = 1;
13411             $info = $full_info;
13412         }
13413         elsif ($standard eq $standard_short_name) {
13414             next if $done_short;
13415             $done_short = 1;
13416             next if $standard_short_name eq $standard_full_name;
13417             $info = $short_info;
13418         }
13419         else {
13420             $info = $other_info;
13421         }
13422
13423         # Here, we have set up the two columns for this entry.  But if an
13424         # entry already exists for this name, we have to decide which one
13425         # we're going to later output.
13426         if (exists $ucd_pod{$standard}) {
13427
13428             # If the two entries refer to the same property, it's not going to
13429             # be ambiguous.  (Likely it's because the names when standardized
13430             # are the same.)  But that means if they are different properties,
13431             # there is ambiguity.
13432             if ($ucd_pod{$standard}->{'property'} != $property) {
13433
13434                 # Here, we have an ambiguity.  This code assumes that one is
13435                 # scheduled to be output and one not and that one is a perl
13436                 # extension (which is not to be output) and the other isn't.
13437                 # If those assumptions are wrong, things have to be rethought.
13438                 if ($ucd_pod{$standard}{'output_this'} == $output_this
13439                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
13440                     || $output_this == $perl_extension)
13441                 {
13442                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations.  Proceeding anyway.");
13443                 }
13444
13445                 # We modifiy the info column of the one being output to
13446                 # indicate the ambiguity.  Set $which to point to that one's
13447                 # info.
13448                 my $which;
13449                 if ($ucd_pod{$standard}{'output_this'}) {
13450                     $which = \$ucd_pod{$standard}->{'info'};
13451                 }
13452                 else {
13453                     $which = \$info;
13454                     $meaning = $ucd_pod{$standard}{'meaning'};
13455                 }
13456
13457                 chomp $$which;
13458                 $$which =~ s/\.\Z//;
13459                 $$which .= "; NOT '$standard' meaning '$meaning'";
13460
13461                 $ambiguous_names{$standard} = 1;
13462             }
13463
13464             # Use the non-perl-extension variant
13465             next unless $ucd_pod{$standard}{'perl_extension'};
13466         }
13467
13468         # Store enough information about this entry that we can later look for
13469         # ambiguities, and output it properly.
13470         $ucd_pod{$standard} = { 'name' => $name,
13471                                 'info' => $info,
13472                                 'meaning' => $meaning,
13473                                 'output_this' => $output_this,
13474                                 'perl_extension' => $perl_extension,
13475                                 'property' => $property,
13476                                 'status' => $alias->status,
13477         };
13478     } # End of looping through all this table's aliases
13479
13480     return;
13481 }
13482
13483 sub pod_alphanumeric_sort {
13484     # Sort pod entries alphanumerically.
13485
13486     # The first few character columns are filler, plus the '\p{'; and get rid
13487     # of all the trailing stuff, starting with the trailing '}', so as to sort
13488     # on just 'Name=Value'
13489     (my $a = lc $a) =~ s/^ .*? { //x;
13490     $a =~ s/}.*//;
13491     (my $b = lc $b) =~ s/^ .*? { //x;
13492     $b =~ s/}.*//;
13493
13494     # Determine if the two operands are both internal only or both not.
13495     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13496     # should be the underscore that begins internal only
13497     my $a_is_internal = (substr($a, 0, 1) eq '_');
13498     my $b_is_internal = (substr($b, 0, 1) eq '_');
13499
13500     # Sort so the internals come last in the table instead of first (which the
13501     # leading underscore would otherwise indicate).
13502     if ($a_is_internal != $b_is_internal) {
13503         return 1 if $a_is_internal;
13504         return -1
13505     }
13506
13507     # Determine if the two operands are numeric property values or not.
13508     # A numeric property will look like xyz: 3.  But the number
13509     # can begin with an optional minus sign, and may have a
13510     # fraction or rational component, like xyz: 3/2.  If either
13511     # isn't numeric, use alphabetic sort.
13512     my ($a_initial, $a_number) =
13513         ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13514     return $a cmp $b unless defined $a_number;
13515     my ($b_initial, $b_number) =
13516         ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
13517     return $a cmp $b unless defined $b_number;
13518
13519     # Here they are both numeric, but use alphabetic sort if the
13520     # initial parts don't match
13521     return $a cmp $b if $a_initial ne $b_initial;
13522
13523     # Convert rationals to floating for the comparison.
13524     $a_number = eval $a_number if $a_number =~ qr{/};
13525     $b_number = eval $b_number if $b_number =~ qr{/};
13526
13527     return $a_number <=> $b_number;
13528 }
13529
13530 sub make_pod () {
13531     # Create the .pod file.  This generates the various subsections and then
13532     # combines them in one big HERE document.
13533
13534     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13535
13536     return unless defined $pod_directory;
13537     print "Making pod file\n" if $verbosity >= $PROGRESS;
13538
13539     my $exception_message =
13540     '(Any exceptions are individually noted beginning with the word NOT.)';
13541     my @block_warning;
13542     if (-e 'Blocks.txt') {
13543
13544         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
13545         # if the global $has_In_conflicts indicates we have them.
13546         push @match_properties, format_pod_line($indent_info_column,
13547                                                 '\p{In_*}',
13548                                                 '\p{Block: *}'
13549                                                     . (($has_In_conflicts)
13550                                                       ? " $exception_message"
13551                                                       : ""));
13552         @block_warning = << "END";
13553
13554 Matches in the Block property have shortcuts that begin with "In_".  For
13555 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
13556 backward compatibility, if there is no conflict with another shortcut, these
13557 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
13558 are numerous such conflicting shortcuts.  Use of these forms for Block is
13559 discouraged, and are flagged as such, not only because of the potential
13560 confusion as to what is meant, but also because a later release of Unicode may
13561 preempt the shortcut, and your program would no longer be correct.  Use the
13562 "In_" form instead to avoid this, or even more clearly, use the compound form,
13563 e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
13564 about this.
13565 END
13566     }
13567     my $text = $Is_flags_text;
13568     $text = "$exception_message $text" if $has_Is_conflicts;
13569
13570     # And the 'Is_ line';
13571     push @match_properties, format_pod_line($indent_info_column,
13572                                             '\p{Is_*}',
13573                                             "\\p{*} $text");
13574
13575     # Sort the properties array for output.  It is sorted alphabetically
13576     # except numerically for numeric properties, and only output unique lines.
13577     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13578
13579     my $formatted_properties = simple_fold(\@match_properties,
13580                                         "",
13581                                         # indent succeeding lines by two extra
13582                                         # which looks better
13583                                         $indent_info_column + 2,
13584
13585                                         # shorten the line length by how much
13586                                         # the formatter indents, so the folded
13587                                         # line will fit in the space
13588                                         # presumably available
13589                                         $automatic_pod_indent);
13590     # Add column headings, indented to be a little more centered, but not
13591     # exactly
13592     $formatted_properties =  format_pod_line($indent_info_column,
13593                                                     '    NAME',
13594                                                     '           INFO')
13595                                     . "\n"
13596                                     . $formatted_properties;
13597
13598     # Generate pod documentation lines for the tables that match nothing
13599     my $zero_matches = "";
13600     if (@zero_match_tables) {
13601         @zero_match_tables = uniques(@zero_match_tables);
13602         $zero_matches = join "\n\n",
13603                         map { $_ = '=item \p{' . $_->complete_name . "}" }
13604                             sort { $a->complete_name cmp $b->complete_name }
13605                             @zero_match_tables;
13606
13607         $zero_matches = <<END;
13608
13609 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
13610
13611 Unicode has some property-value pairs that currently don't match anything.
13612 This happens generally either because they are obsolete, or they exist for
13613 symmetry with other forms, but no language has yet been encoded that uses
13614 them.  In this version of Unicode, the following match zero code points:
13615
13616 =over 4
13617
13618 $zero_matches
13619
13620 =back
13621
13622 END
13623     }
13624
13625     # Generate list of properties that we don't accept, grouped by the reasons
13626     # why.  This is so only put out the 'why' once, and then list all the
13627     # properties that have that reason under it.
13628
13629     my %why_list;   # The keys are the reasons; the values are lists of
13630                     # properties that have the key as their reason
13631
13632     # For each property, add it to the list that are suppressed for its reason
13633     # The sort will cause the alphabetically first properties to be added to
13634     # each list first, so each list will be sorted.
13635     foreach my $property (sort keys %why_suppressed) {
13636         push @{$why_list{$why_suppressed{$property}}}, $property;
13637     }
13638
13639     # For each reason (sorted by the first property that has that reason)...
13640     my @bad_re_properties;
13641     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13642                      keys %why_list)
13643     {
13644         # Add to the output, all the properties that have that reason.
13645         my $has_item = 0;   # Flag if actually output anything.
13646         foreach my $name (@{$why_list{$why}}) {
13647
13648             # Split compound names into $property and $table components
13649             my $property = $name;
13650             my $table;
13651             if ($property =~ / (.*) = (.*) /x) {
13652                 $property = $1;
13653                 $table = $2;
13654             }
13655
13656             # This release of Unicode may not have a property that is
13657             # suppressed, so don't reference a non-existent one.
13658             $property = property_ref($property);
13659             next if ! defined $property;
13660
13661             # And since this list is only for match tables, don't list the
13662             # ones that don't have match tables.
13663             next if ! $property->to_create_match_tables;
13664
13665             # Find any abbreviation, and turn it into a compound name if this
13666             # is a property=value pair.
13667             my $short_name = $property->name;
13668             $short_name .= '=' . $property->table($table)->name if $table;
13669
13670             # Start with an empty line.
13671             push @bad_re_properties, "\n\n" unless $has_item;
13672
13673             # And add the property as an item for the reason.
13674             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13675             $has_item = 1;
13676         }
13677
13678         # And add the reason under the list of properties, if such a list
13679         # actually got generated.  Note that the header got added
13680         # unconditionally before.  But pod ignores extra blank lines, so no
13681         # harm.
13682         push @bad_re_properties, "\n$why\n" if $has_item;
13683
13684     } # End of looping through each reason.
13685
13686     if (! @bad_re_properties) {
13687         push @bad_re_properties,
13688                 "*** This installation accepts ALL non-Unihan properties ***";
13689     }
13690     else {
13691         # Add =over only if non-empty to avoid an empty =over/=back section,
13692         # which is considered bad form.
13693         unshift @bad_re_properties, "\n=over 4\n";
13694         push @bad_re_properties, "\n=back\n";
13695     }
13696
13697     # Similiarly, generate a list of files that we don't use, grouped by the
13698     # reasons why.  First, create a hash whose keys are the reasons, and whose
13699     # values are anonymous arrays of all the files that share that reason.
13700     my %grouped_by_reason;
13701     foreach my $file (keys %ignored_files) {
13702         push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
13703     }
13704     foreach my $file (keys %skipped_files) {
13705         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
13706     }
13707
13708     # Then, sort each group.
13709     foreach my $group (keys %grouped_by_reason) {
13710         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
13711                                         @{$grouped_by_reason{$group}} ;
13712     }
13713
13714     # Finally, create the output text.  For each reason (sorted by the
13715     # alphabetically first file that has that reason)...
13716     my @unused_files;
13717     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
13718                                cmp lc $grouped_by_reason{$b}->[0]
13719                               }
13720                          keys %grouped_by_reason)
13721     {
13722         # Add all the files that have that reason to the output.  Start
13723         # with an empty line.
13724         push @unused_files, "\n\n";
13725         push @unused_files, map { "\n=item F<$_> \n" }
13726                             @{$grouped_by_reason{$reason}};
13727         # And add the reason under the list of files
13728         push @unused_files, "\n$reason\n";
13729     }
13730
13731     # Similarly, create the output text for the UCD section of the pod
13732     my @ucd_pod;
13733     foreach my $key (keys %ucd_pod) {
13734         next unless $ucd_pod{$key}->{'output_this'};
13735         push @ucd_pod, format_pod_line($indent_info_column,
13736                                        $ucd_pod{$key}->{'name'},
13737                                        $ucd_pod{$key}->{'info'},
13738                                        $ucd_pod{$key}->{'status'},
13739                                       );
13740     }
13741
13742     # Sort alphabetically, and fold for output
13743     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
13744     my $ucd_pod = simple_fold(\@ucd_pod,
13745                            ' ',
13746                            $indent_info_column,
13747                            $automatic_pod_indent);
13748     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
13749                 . "\n"
13750                 . $ucd_pod;
13751     local $" = "";
13752
13753     # Everything is ready to assemble.
13754     my @OUT = << "END";
13755 =begin comment
13756
13757 $HEADER
13758
13759 To change this file, edit $0 instead.
13760
13761 =end comment
13762
13763 =head1 NAME
13764
13765 $pod_file - Index of Unicode Version $string_version character properties in Perl
13766
13767 =head1 DESCRIPTION
13768
13769 This document provides information about the portion of the Unicode database
13770 that deals with character properties, that is the portion that is defined on
13771 single code points.  (L</Other information in the Unicode data base>
13772 below briefly mentions other data that Unicode provides.)
13773
13774 Perl can provide access to all non-provisional Unicode character properties,
13775 though not all are enabled by default.  The omitted ones are the Unihan
13776 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
13777 deprecated or Unicode-internal properties.  (An installation may choose to
13778 recompile Perl's tables to change this.  See L<Unicode character
13779 properties that are NOT accepted by Perl>.)
13780
13781 For most purposes, access to Unicode properties from the Perl core is through
13782 regular expression matches, as described in the next section.
13783 For some special purposes, and to access the properties that are not suitable
13784 for regular expression matching, all the Unicode character properties that
13785 Perl handles are accessible via the standard L<Unicode::UCD> module, as
13786 described in the section L</Properties accessible through Unicode::UCD>.
13787
13788 Perl also provides some additional extensions and short-cut synonyms
13789 for Unicode properties.
13790
13791 This document merely lists all available properties and does not attempt to
13792 explain what each property really means.  There is a brief description of each
13793 Perl extension; see L<perlunicode/Other Properties> for more information on
13794 these.  There is some detail about Blocks, Scripts, General_Category,
13795 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
13796 official Unicode properties, refer to the Unicode standard.  A good starting
13797 place is L<$unicode_reference_url>.
13798
13799 Note that you can define your own properties; see
13800 L<perlunicode/"User-Defined Character Properties">.
13801
13802 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
13803
13804 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13805 most of the Unicode character properties.  The table below shows all these
13806 constructs, both single and compound forms.
13807
13808 B<Compound forms> consist of two components, separated by an equals sign or a
13809 colon.  The first component is the property name, and the second component is
13810 the particular value of the property to match against, for example,
13811 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
13812 whose Script property is Greek.
13813
13814 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
13815 their equivalent compound forms.  The table shows these equivalences.  (In our
13816 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
13817 There are also a few Perl-defined single forms that are not shortcuts for a
13818 compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
13819
13820 In parsing these constructs, Perl always ignores Upper/lower case differences
13821 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
13822 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
13823 the left brace completely changes the meaning of the construct, from "match"
13824 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
13825 for improved legibility.
13826
13827 Also, white space, hyphens, and underscores are also normally ignored
13828 everywhere between the {braces}, and hence can be freely added or removed
13829 even if the C</x> modifier hasn't been specified on the regular expression.
13830 But $a_bold_stricter at the beginning of an entry in the table below
13831 means that tighter (stricter) rules are used for that entry:
13832
13833 =over 4
13834
13835 =item Single form (C<\\p{name}>) tighter rules:
13836
13837 White space, hyphens, and underscores ARE significant
13838 except for:
13839
13840 =over 4
13841
13842 =item * white space adjacent to a non-word character
13843
13844 =item * underscores separating digits in numbers
13845
13846 =back
13847
13848 That means, for example, that you can freely add or remove white space
13849 adjacent to (but within) the braces without affecting the meaning.
13850
13851 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
13852
13853 The tighter rules given above for the single form apply to everything to the
13854 right of the colon or equals; the looser rules still apply to everything to
13855 the left.
13856
13857 That means, for example, that you can freely add or remove white space
13858 adjacent to (but within) the braces and the colon or equal sign.
13859
13860 =back
13861
13862 Some properties are considered obsolete by Unicode, but still available.
13863 There are several varieties of obsolescence:
13864
13865 =over 4
13866
13867 =item Stabilized
13868
13869 A property may be stabilized.  Such a determination does not indicate
13870 that the property should or should not be used; instead it is a declaration
13871 that the property will not be maintained nor extended for newly encoded
13872 characters.  Such properties are marked with $a_bold_stabilized in the
13873 table.
13874
13875 =item Deprecated
13876
13877 A property may be deprecated, perhaps because its original intent
13878 has been replaced by another property, or because its specification was
13879 somehow defective.  This means that its use is strongly
13880 discouraged, so much so that a warning will be issued if used, unless the
13881 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13882 statement.  $A_bold_deprecated flags each such entry in the table, and
13883 the entry there for the longest, most descriptive version of the property will
13884 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
13885 warning, even for properties that aren't officially deprecated by Unicode,
13886 when there used to be characters or code points that were matched by them, but
13887 no longer.  This is to warn you that your program may not work like it did on
13888 earlier Unicode releases.
13889
13890 A deprecated property may be made unavailable in a future Perl version, so it
13891 is best to move away from them.
13892
13893 A deprecated property may also be stabilized, but this fact is not shown.
13894
13895 =item Obsolete
13896
13897 Properties marked with $a_bold_obsolete in the table are considered (plain)
13898 obsolete.  Generally this designation is given to properties that Unicode once
13899 used for internal purposes (but not any longer).
13900
13901 =back
13902
13903 Some Perl extensions are present for backwards compatibility and are
13904 discouraged from being used, but are not obsolete.  $A_bold_discouraged
13905 flags each such entry in the table.  Future Unicode versions may force
13906 some of these extensions to be removed without warning, replaced by another
13907 property with the same name that means something different.  Use the
13908 equivalent shown instead.
13909
13910 @block_warning
13911
13912 The table below has two columns.  The left column contains the C<\\p{}>
13913 constructs to look up, possibly preceded by the flags mentioned above; and
13914 the right column contains information about them, like a description, or
13915 synonyms.  It shows both the single and compound forms for each property that
13916 has them.  If the left column is a short name for a property, the right column
13917 will give its longer, more descriptive name; and if the left column is the
13918 longest name, the right column will show any equivalent shortest name, in both
13919 single and compound forms if applicable.
13920
13921 The right column will also caution you if a property means something different
13922 than what might normally be expected.
13923
13924 All single forms are Perl extensions; a few compound forms are as well, and
13925 are noted as such.
13926
13927 Numbers in (parentheses) indicate the total number of code points matched by
13928 the property.  For emphasis, those properties that match no code points at all
13929 are listed as well in a separate section following the table.
13930
13931 Most properties match the same code points regardless of whether C<"/i">
13932 case-insensitive matching is specified or not.  But a few properties are
13933 affected.  These are shown with the notation
13934
13935  (/i= other_property)
13936
13937 in the second column.  Under case-insensitive matching they match the
13938 same code pode points as the property "other_property".
13939
13940 There is no description given for most non-Perl defined properties (See
13941 L<$unicode_reference_url> for that).
13942
13943 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13944 combinations.  For example, entries like:
13945
13946  \\p{Gc: *}                                  \\p{General_Category: *}
13947
13948 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13949 for the latter is also valid for the former.  Similarly,
13950
13951  \\p{Is_*}                                   \\p{*}
13952
13953 means that if and only if, for example, C<\\p{Foo}> exists, then
13954 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13955 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13956 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
13957 underscore.
13958
13959 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13960 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
13961 'N*' to indicate this, and doesn't have separate entries for the other
13962 possibilities.  Note that not all properties which have values 'Yes' and 'No'
13963 are binary, and they have all their values spelled out without using this wild
13964 card, and a C<NOT> clause in their description that highlights their not being
13965 binary.  These also require the compound form to match them, whereas true
13966 binary properties have both single and compound forms available.
13967
13968 Note that all non-essential underscores are removed in the display of the
13969 short names below.
13970
13971 B<Legend summary:>
13972
13973 =over 4
13974
13975 =item Z<>B<*> is a wild-card
13976
13977 =item B<(\\d+)> in the info column gives the number of code points matched by
13978 this property.
13979
13980 =item B<$DEPRECATED> means this is deprecated.
13981
13982 =item B<$OBSOLETE> means this is obsolete.
13983
13984 =item B<$STABILIZED> means this is stabilized.
13985
13986 =item B<$STRICTER> means tighter (stricter) name matching applies.
13987
13988 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13989 stable.
13990
13991 =back
13992
13993 $formatted_properties
13994
13995 $zero_matches
13996
13997 =head1 Properties accessible through Unicode::UCD
13998
13999 All the Unicode character properties mentioned above (except for those marked
14000 as for internal use by Perl) are also accessible by
14001 L<Unicode::UCD/prop_invlist()>.
14002
14003 Due to their nature, not all Unicode character properties are suitable for
14004 regular expression matches, nor C<prop_invlist()>.  The remaining
14005 non-provisional, non-internal ones are accessible via
14006 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
14007 hasn't included; see L<below for which those are|/Unicode character properties
14008 that are NOT accepted by Perl>).
14009
14010 For compatibility with other parts of Perl, all the single forms given in the
14011 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
14012 are recognized.  BUT, there are some ambiguities between some Perl extensions
14013 and the Unicode properties, all of which are silently resolved in favor of the
14014 official Unicode property.  To avoid surprises, you should only use
14015 C<prop_invmap()> for forms listed in the table below, which omits the
14016 non-recommended ones.  The affected forms are the Perl single form equivalents
14017 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
14018 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
14019 whose short name is C<sc>.  The table indicates the current ambiguities in the
14020 INFO column, beginning with the word C<"NOT">.
14021
14022 The standard Unicode properties listed below are documented in
14023 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
14024 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
14025 L<perlunicode/Other Properties>;
14026
14027 The first column in the table is a name for the property; the second column is
14028 an alternative name, if any, plus possibly some annotations.  The alternative
14029 name is the property's full name, unless that would simply repeat the first
14030 column, in which case the second column indicates the property's short name
14031 (if different).  The annotations are given only in the entry for the full
14032 name.  If a property is obsolete, etc, the entry will be flagged with the same
14033 characters used in the table in the L<section above|/Properties accessible
14034 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
14035
14036 $ucd_pod
14037
14038 =head1 Properties accessible through other means
14039
14040 Certain properties are accessible also via core function calls.  These are:
14041
14042  Lowercase_Mapping          lc() and lcfirst()
14043  Titlecase_Mapping          ucfirst()
14044  Uppercase_Mapping          uc()
14045
14046 Also, Case_Folding is accessible through the C</i> modifier in regular
14047 expressions.
14048
14049 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
14050 interpolation in double-quoted strings and regular expressions; and functions
14051 C<charnames::viacode()>, C<charnames::vianame()>, and
14052 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
14053 specified.
14054
14055 Finally, most properties related to decomposition are accessible via
14056 L<Unicode::Normalize>.
14057
14058 =head1 Unicode character properties that are NOT accepted by Perl
14059
14060 Perl will generate an error for a few character properties in Unicode when
14061 used in a regular expression.  The non-Unihan ones are listed below, with the
14062 reasons they are not accepted, perhaps with work-arounds.  The short names for
14063 the properties are listed enclosed in (parentheses).
14064 As described after the list, an installation can change the defaults and choose
14065 to accept any of these.  The list is machine generated based on the
14066 choices made for the installation that generated this document.
14067
14068 @bad_re_properties
14069
14070 An installation can choose to allow any of these to be matched by downloading
14071 the Unicode database from L<http://www.unicode.org/Public/> to
14072 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
14073 controlling lists contained in the program
14074 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
14075 (C<\%Config> is available from the Config module).
14076
14077 =head1 Other information in the Unicode data base
14078
14079 The Unicode data base is delivered in two different formats.  The XML version
14080 is valid for more modern Unicode releases.  The other version is a collection
14081 of files.  The two are intended to give equivalent information.  Perl uses the
14082 older form; this allows you to recompile Perl to use early Unicode releases.
14083
14084 The only non-character property that Perl currently supports is Named
14085 Sequences, in which a sequence of code points
14086 is given a name and generally treated as a single entity.  (Perl supports
14087 these via the C<\\N{...}> double-quotish construct,
14088 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
14089
14090 Below is a list of the files in the Unicode data base that Perl doesn't
14091 currently use, along with very brief descriptions of their purposes.
14092 Some of the names of the files have been shortened from those that Unicode
14093 uses, in order to allow them to be distinguishable from similarly named files
14094 on file systems for which only the first 8 characters of a name are
14095 significant.
14096
14097 =over 4
14098
14099 @unused_files
14100
14101 =back
14102
14103 =head1 SEE ALSO
14104
14105 L<$unicode_reference_url>
14106
14107 L<perlrecharclass>
14108
14109 L<perlunicode>
14110
14111 END
14112
14113     # And write it.  The 0 means no utf8.
14114     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
14115     return;
14116 }
14117
14118 sub make_Heavy () {
14119     # Create and write Heavy.pl, which passes info about the tables to
14120     # utf8_heavy.pl
14121
14122     # Stringify structures for output
14123     my $loose_property_name_of
14124                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
14125     chomp $loose_property_name_of;
14126
14127     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
14128     chomp $stricter_to_file_of;
14129
14130     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
14131     chomp $loose_to_file_of;
14132
14133     my $nv_floating_to_rational
14134                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
14135     chomp $nv_floating_to_rational;
14136
14137     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
14138     chomp $why_deprecated;
14139
14140     # We set the key to the file when we associated files with tables, but we
14141     # couldn't do the same for the value then, as we might not have the file
14142     # for the alternate table figured out at that time.
14143     foreach my $cased (keys %caseless_equivalent_to) {
14144         my @path = $caseless_equivalent_to{$cased}->file_path;
14145         my $path = join '/', @path[1, -1];
14146         $caseless_equivalent_to{$cased} = $path;
14147     }
14148     my $caseless_equivalent_to
14149                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
14150     chomp $caseless_equivalent_to;
14151
14152     my $loose_property_to_file_of
14153                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
14154     chomp $loose_property_to_file_of;
14155
14156     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
14157     chomp $file_to_swash_name;
14158
14159     my @heavy = <<END;
14160 $HEADER
14161 $INTERNAL_ONLY_HEADER
14162
14163 # This file is for the use of utf8_heavy.pl and Unicode::UCD
14164
14165 # Maps Unicode (not Perl single-form extensions) property names in loose
14166 # standard form to their corresponding standard names
14167 \%utf8::loose_property_name_of = (
14168 $loose_property_name_of
14169 );
14170
14171 # Maps property, table to file for those using stricter matching
14172 \%utf8::stricter_to_file_of = (
14173 $stricter_to_file_of
14174 );
14175
14176 # Maps property, table to file for those using loose matching
14177 \%utf8::loose_to_file_of = (
14178 $loose_to_file_of
14179 );
14180
14181 # Maps floating point to fractional form
14182 \%utf8::nv_floating_to_rational = (
14183 $nv_floating_to_rational
14184 );
14185
14186 # If a floating point number doesn't have enough digits in it to get this
14187 # close to a fraction, it isn't considered to be that fraction even if all the
14188 # digits it does have match.
14189 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
14190
14191 # Deprecated tables to generate a warning for.  The key is the file containing
14192 # the table, so as to avoid duplication, as many property names can map to the
14193 # file, but we only need one entry for all of them.
14194 \%utf8::why_deprecated = (
14195 $why_deprecated
14196 );
14197
14198 # A few properties have different behavior under /i matching.  This maps
14199 # those to substitute files to use under /i.
14200 \%utf8::caseless_equivalent = (
14201 $caseless_equivalent_to
14202 );
14203
14204 # Property names to mapping files
14205 \%utf8::loose_property_to_file_of = (
14206 $loose_property_to_file_of
14207 );
14208
14209 # Files to the swash names within them.
14210 \%utf8::file_to_swash_name = (
14211 $file_to_swash_name
14212 );
14213
14214 1;
14215 END
14216
14217     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
14218     return;
14219 }
14220
14221 sub make_Name_pm () {
14222     # Create and write Name.pm, which contains subroutines and data to use in
14223     # conjunction with Name.pl
14224
14225     # Maybe there's nothing to do.
14226     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
14227
14228     my @name = <<END;
14229 $HEADER
14230 $INTERNAL_ONLY_HEADER
14231 END
14232
14233     # Convert these structures to output format.
14234     my $code_points_ending_in_code_point =
14235         main::simple_dumper(\@code_points_ending_in_code_point,
14236                             ' ' x 8);
14237     my $names = main::simple_dumper(\%names_ending_in_code_point,
14238                                     ' ' x 8);
14239     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
14240                                     ' ' x 8);
14241
14242     # Do the same with the Hangul names,
14243     my $jamo;
14244     my $jamo_l;
14245     my $jamo_v;
14246     my $jamo_t;
14247     my $jamo_re;
14248     if ($has_hangul_syllables) {
14249
14250         # Construct a regular expression of all the possible
14251         # combinations of the Hangul syllables.
14252         my @L_re;   # Leading consonants
14253         for my $i ($LBase .. $LBase + $LCount - 1) {
14254             push @L_re, $Jamo{$i}
14255         }
14256         my @V_re;   # Middle vowels
14257         for my $i ($VBase .. $VBase + $VCount - 1) {
14258             push @V_re, $Jamo{$i}
14259         }
14260         my @T_re;   # Trailing consonants
14261         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
14262             push @T_re, $Jamo{$i}
14263         }
14264
14265         # The whole re is made up of the L V T combination.
14266         $jamo_re = '('
14267                     . join ('|', sort @L_re)
14268                     . ')('
14269                     . join ('|', sort @V_re)
14270                     . ')('
14271                     . join ('|', sort @T_re)
14272                     . ')?';
14273
14274         # These hashes needed by the algorithm were generated
14275         # during reading of the Jamo.txt file
14276         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
14277         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
14278         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
14279         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
14280     }
14281
14282     push @name, <<END;
14283
14284 package charnames;
14285
14286 # This module contains machine-generated tables and code for the
14287 # algorithmically-determinable Unicode character names.  The following
14288 # routines can be used to translate between name and code point and vice versa
14289
14290 { # Closure
14291
14292     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
14293     # two must be 10; if there are 5, the first must not be a 0.  Written this
14294     # way to decrease backtracking.  The first regex allows the code point to
14295     # be at the end of a word, but to work properly, the word shouldn't end
14296     # with a valid hex character.  The second one won't match a code point at
14297     # the end of a word, and doesn't have the run-on issue
14298     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
14299     my \$code_point_re = qr/$code_point_re/;
14300
14301     # In the following hash, the keys are the bases of names which includes
14302     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The values
14303     # of each key is another hash which is used to get the low and high ends
14304     # for each range of code points that apply to the name.
14305     my %names_ending_in_code_point = (
14306 $names
14307     );
14308
14309     # The following hash is a copy of the previous one, except is for loose
14310     # matching, so each name has blanks and dashes squeezed out
14311     my %loose_names_ending_in_code_point = (
14312 $loose_names
14313     );
14314
14315     # And the following array gives the inverse mapping from code points to
14316     # names.  Lowest code points are first
14317     my \@code_points_ending_in_code_point = (
14318 $code_points_ending_in_code_point
14319     );
14320 END
14321     # Earlier releases didn't have Jamos.  No sense outputting
14322     # them unless will be used.
14323     if ($has_hangul_syllables) {
14324         push @name, <<END;
14325
14326     # Convert from code point to Jamo short name for use in composing Hangul
14327     # syllable names
14328     my %Jamo = (
14329 $jamo
14330     );
14331
14332     # Leading consonant (can be null)
14333     my %Jamo_L = (
14334 $jamo_l
14335     );
14336
14337     # Vowel
14338     my %Jamo_V = (
14339 $jamo_v
14340     );
14341
14342     # Optional trailing consonant
14343     my %Jamo_T = (
14344 $jamo_t
14345     );
14346
14347     # Computed re that splits up a Hangul name into LVT or LV syllables
14348     my \$syllable_re = qr/$jamo_re/;
14349
14350     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
14351     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
14352
14353     # These constants names and values were taken from the Unicode standard,
14354     # version 5.1, section 3.12.  They are used in conjunction with Hangul
14355     # syllables
14356     my \$SBase = $SBase_string;
14357     my \$LBase = $LBase_string;
14358     my \$VBase = $VBase_string;
14359     my \$TBase = $TBase_string;
14360     my \$SCount = $SCount;
14361     my \$LCount = $LCount;
14362     my \$VCount = $VCount;
14363     my \$TCount = $TCount;
14364     my \$NCount = \$VCount * \$TCount;
14365 END
14366     } # End of has Jamos
14367
14368     push @name, << 'END';
14369
14370     sub name_to_code_point_special {
14371         my ($name, $loose) = @_;
14372
14373         # Returns undef if not one of the specially handled names; otherwise
14374         # returns the code point equivalent to the input name
14375         # $loose is non-zero if to use loose matching, 'name' in that case
14376         # must be input as upper case with all blanks and dashes squeezed out.
14377 END
14378     if ($has_hangul_syllables) {
14379         push @name, << 'END';
14380
14381         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
14382             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
14383         {
14384             return if $name !~ qr/^$syllable_re$/;
14385             my $L = $Jamo_L{$1};
14386             my $V = $Jamo_V{$2};
14387             my $T = (defined $3) ? $Jamo_T{$3} : 0;
14388             return ($L * $VCount + $V) * $TCount + $T + $SBase;
14389         }
14390 END
14391     }
14392     push @name, << 'END';
14393
14394         # Name must end in 'code_point' for this to handle.
14395         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
14396                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
14397
14398         my $base = $1;
14399         my $code_point = CORE::hex $2;
14400         my $names_ref;
14401
14402         if ($loose) {
14403             $names_ref = \%loose_names_ending_in_code_point;
14404         }
14405         else {
14406             return if $base !~ s/-$//;
14407             $names_ref = \%names_ending_in_code_point;
14408         }
14409
14410         # Name must be one of the ones which has the code point in it.
14411         return if ! $names_ref->{$base};
14412
14413         # Look through the list of ranges that apply to this name to see if
14414         # the code point is in one of them.
14415         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
14416             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
14417             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
14418
14419             # Here, the code point is in the range.
14420             return $code_point;
14421         }
14422
14423         # Here, looked like the name had a code point number in it, but
14424         # did not match one of the valid ones.
14425         return;
14426     }
14427
14428     sub code_point_to_name_special {
14429         my $code_point = shift;
14430
14431         # Returns the name of a code point if algorithmically determinable;
14432         # undef if not
14433 END
14434     if ($has_hangul_syllables) {
14435         push @name, << 'END';
14436
14437         # If in the Hangul range, calculate the name based on Unicode's
14438         # algorithm
14439         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
14440             use integer;
14441             my $SIndex = $code_point - $SBase;
14442             my $L = $LBase + $SIndex / $NCount;
14443             my $V = $VBase + ($SIndex % $NCount) / $TCount;
14444             my $T = $TBase + $SIndex % $TCount;
14445             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
14446             $name .= $Jamo{$T} if $T != $TBase;
14447             return $name;
14448         }
14449 END
14450     }
14451     push @name, << 'END';
14452
14453         # Look through list of these code points for one in range.
14454         foreach my $hash (@code_points_ending_in_code_point) {
14455             return if $code_point < $hash->{'low'};
14456             if ($code_point <= $hash->{'high'}) {
14457                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
14458             }
14459         }
14460         return;            # None found
14461     }
14462 } # End closure
14463
14464 1;
14465 END
14466
14467     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
14468     return;
14469 }
14470
14471 sub make_UCD () {
14472     # Create and write UCD.pl, which passes info about the tables to
14473     # Unicode::UCD
14474
14475     # Create a mapping from each alias of Perl single-form extensions to all
14476     # its equivalent aliases, for quick look-up.
14477     my %perlprop_to_aliases;
14478     foreach my $table ($perl->tables) {
14479
14480         # First create the list of the aliases of each extension
14481         my @aliases_list;    # List of legal aliases for this extension
14482
14483         my $table_name = $table->name;
14484         my $standard_table_name = standardize($table_name);
14485         my $table_full_name = $table->full_name;
14486         my $standard_table_full_name = standardize($table_full_name);
14487
14488         # Make sure that the list has both the short and full names
14489         push @aliases_list, $table_name, $table_full_name;
14490
14491         my $found_ucd = 0;  # ? Did we actually get an alias that should be
14492                             # output for this table
14493
14494         # Go through all the aliases (including the two just added), and add
14495         # any new unique ones to the list
14496         foreach my $alias ($table->aliases) {
14497
14498             # Skip non-legal names
14499             next unless $alias->ok_as_filename;
14500             next unless $alias->ucd;
14501
14502             $found_ucd = 1;     # have at least one legal name
14503
14504             my $name = $alias->name;
14505             my $standard = standardize($name);
14506
14507             # Don't repeat a name that is equivalent to one already on the
14508             # list
14509             next if $standard eq $standard_table_name;
14510             next if $standard eq $standard_table_full_name;
14511
14512             push @aliases_list, $name;
14513         }
14514
14515         # If there were no legal names, don't output anything.
14516         next unless $found_ucd;
14517
14518         # To conserve memory in the program reading these in, omit full names
14519         # that are identical to the short name, when those are the only two
14520         # aliases for the property.
14521         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
14522             pop @aliases_list;
14523         }
14524
14525         # Here, @aliases_list is the list of all the aliases that this
14526         # extension legally has.  Now can create a map to it from each legal
14527         # standardized alias
14528         foreach my $alias ($table->aliases) {
14529             next unless $alias->ucd;
14530             next unless $alias->ok_as_filename;
14531             push @{$perlprop_to_aliases{standardize($alias->name)}},
14532                  @aliases_list;
14533         }
14534     }
14535
14536     # Make a list of all combinations of properties/values that are suppressed.
14537     my @suppressed;
14538     foreach my $property_name (keys %why_suppressed) {
14539
14540         # Just the value
14541         my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
14542
14543         # The hash may contain properties not in this release of Unicode
14544         next unless defined (my $property = property_ref($property_name));
14545
14546         # Find all combinations
14547         foreach my $prop_alias ($property->aliases) {
14548             my $prop_alias_name = standardize($prop_alias->name);
14549
14550             # If no =value, there's just one combination possibe for this
14551             if (! $value_name) {
14552
14553                 # The property may be suppressed, but there may be a proxy for
14554                 # it, so it shouldn't be listed as suppressed
14555                 next if $prop_alias->ucd;
14556                 push @suppressed, $prop_alias_name;
14557             }
14558             else {  # Otherwise
14559                 foreach my $value_alias ($property->table($value_name)->aliases)
14560                 {
14561                     next if $value_alias->ucd;
14562
14563                     push @suppressed, "$prop_alias_name="
14564                                       .  standardize($value_alias->name);
14565                 }
14566             }
14567         }
14568     }
14569
14570     # Convert the structure below (designed for Name.pm) to a form that UCD
14571     # wants, so it doesn't have to modify it at all; i.e. so that it includes
14572     # an element for the Hangul syllables in the appropriate place, and
14573     # otherwise changes the name to include the "-<code point>" suffix.
14574     my @algorithm_names;
14575     my $done_hangul = 0;
14576
14577     # Copy it linearly.
14578     for my $i (0 .. @code_points_ending_in_code_point - 1) {
14579
14580         # Insert the hanguls in the correct place.
14581         if (! $done_hangul
14582             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
14583         {
14584             $done_hangul = 1;
14585             push @algorithm_names, { low => $SBase,
14586                                      high => $SBase + $SCount - 1,
14587                                      name => '<hangul syllable>',
14588                                     };
14589         }
14590
14591         # Copy the current entry, modified.
14592         push @algorithm_names, {
14593             low => $code_points_ending_in_code_point[$i]->{'low'},
14594             high => $code_points_ending_in_code_point[$i]->{'high'},
14595             name =>
14596                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
14597         };
14598     }
14599
14600     # Serialize these structures for output.
14601     my $loose_to_standard_value
14602                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
14603     chomp $loose_to_standard_value;
14604
14605     my $string_property_loose_to_name
14606                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
14607     chomp $string_property_loose_to_name;
14608
14609     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
14610     chomp $perlprop_to_aliases;
14611
14612     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
14613     chomp $prop_aliases;
14614
14615     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
14616     chomp $prop_value_aliases;
14617
14618     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
14619     chomp $suppressed;
14620
14621     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
14622     chomp $algorithm_names;
14623
14624     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
14625     chomp $ambiguous_names;
14626
14627     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
14628     chomp $loose_defaults;
14629
14630     my @ucd = <<END;
14631 $HEADER
14632 $INTERNAL_ONLY_HEADER
14633
14634 # This file is for the use of Unicode::UCD
14635
14636 # Highest legal Unicode code point
14637 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
14638
14639 # Hangul syllables
14640 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
14641 \$Unicode::UCD::HANGUL_COUNT = $SCount;
14642
14643 # Keys are all the possible "prop=value" combinations, in loose form; values
14644 # are the standard loose name for the 'value' part of the key
14645 \%Unicode::UCD::loose_to_standard_value = (
14646 $loose_to_standard_value
14647 );
14648
14649 # String property loose names to standard loose name
14650 \%Unicode::UCD::string_property_loose_to_name = (
14651 $string_property_loose_to_name
14652 );
14653
14654 # Keys are Perl extensions in loose form; values are each one's list of
14655 # aliases
14656 \%Unicode::UCD::loose_perlprop_to_name = (
14657 $perlprop_to_aliases
14658 );
14659
14660 # Keys are standard property name; values are each one's aliases
14661 \%Unicode::UCD::prop_aliases = (
14662 $prop_aliases
14663 );
14664
14665 # Keys of top level are standard property name; values are keys to another
14666 # hash,  Each one is one of the property's values, in standard form.  The
14667 # values are that prop-val's aliases.  If only one specified, the short and
14668 # long alias are identical.
14669 \%Unicode::UCD::prop_value_aliases = (
14670 $prop_value_aliases
14671 );
14672
14673 # Ordered (by code point ordinal) list of the ranges of code points whose
14674 # names are algorithmically determined.  Each range entry is an anonymous hash
14675 # of the start and end points and a template for the names within it.
14676 \@Unicode::UCD::algorithmic_named_code_points = (
14677 $algorithm_names
14678 );
14679
14680 # The properties that as-is have two meanings, and which must be disambiguated
14681 \%Unicode::UCD::ambiguous_names = (
14682 $ambiguous_names
14683 );
14684
14685 # Keys are the prop-val combinations which are the default values for the
14686 # given property, expressed in standard loose form
14687 \%Unicode::UCD::loose_defaults = (
14688 $loose_defaults
14689 );
14690
14691 # All combinations of names that are suppressed.
14692 # This is actually for UCD.t, so it knows which properties shouldn't have
14693 # entries.  If it got any bigger, would probably want to put it in its own
14694 # file to use memory only when it was needed, in testing.
14695 \@Unicode::UCD::suppressed_properties = (
14696 $suppressed
14697 );
14698
14699 1;
14700 END
14701
14702     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
14703     return;
14704 }
14705
14706 sub write_all_tables() {
14707     # Write out all the tables generated by this program to files, as well as
14708     # the supporting data structures, pod file, and .t file.
14709
14710     my @writables;              # List of tables that actually get written
14711     my %match_tables_to_write;  # Used to collapse identical match tables
14712                                 # into one file.  Each key is a hash function
14713                                 # result to partition tables into buckets.
14714                                 # Each value is an array of the tables that
14715                                 # fit in the bucket.
14716
14717     # For each property ...
14718     # (sort so that if there is an immutable file name, it has precedence, so
14719     # some other property can't come in and take over its file name.  If b's
14720     # file name is defined, will return 1, meaning to take it first; don't
14721     # care if both defined, as they had better be different anyway.  And the
14722     # property named 'Perl' needs to be first (it doesn't have any immutable
14723     # file name) because empty properties are defined in terms of it's table
14724     # named 'Any'.)
14725     PROPERTY:
14726     foreach my $property (sort { return -1 if $a == $perl;
14727                                  return 1 if $b == $perl;
14728                                  return defined $b->file
14729                                 } property_ref('*'))
14730     {
14731         my $type = $property->type;
14732
14733         # And for each table for that property, starting with the mapping
14734         # table for it ...
14735         TABLE:
14736         foreach my $table($property,
14737
14738                         # and all the match tables for it (if any), sorted so
14739                         # the ones with the shortest associated file name come
14740                         # first.  The length sorting prevents problems of a
14741                         # longer file taking a name that might have to be used
14742                         # by a shorter one.  The alphabetic sorting prevents
14743                         # differences between releases
14744                         sort {  my $ext_a = $a->external_name;
14745                                 return 1 if ! defined $ext_a;
14746                                 my $ext_b = $b->external_name;
14747                                 return -1 if ! defined $ext_b;
14748
14749                                 # But return the non-complement table before
14750                                 # the complement one, as the latter is defined
14751                                 # in terms of the former, and needs to have
14752                                 # the information for the former available.
14753                                 return 1 if $a->complement != 0;
14754                                 return -1 if $b->complement != 0;
14755
14756                                 # Similarly, return a subservient table after
14757                                 # a leader
14758                                 return 1 if $a->leader != $a;
14759                                 return -1 if $b->leader != $b;
14760
14761                                 my $cmp = length $ext_a <=> length $ext_b;
14762
14763                                 # Return result if lengths not equal
14764                                 return $cmp if $cmp;
14765
14766                                 # Alphabetic if lengths equal
14767                                 return $ext_a cmp $ext_b
14768                         } $property->tables
14769                     )
14770         {
14771
14772             # Here we have a table associated with a property.  It could be
14773             # the map table (done first for each property), or one of the
14774             # other tables.  Determine which type.
14775             my $is_property = $table->isa('Property');
14776
14777             my $name = $table->name;
14778             my $complete_name = $table->complete_name;
14779
14780             # See if should suppress the table if is empty, but warn if it
14781             # contains something.
14782             my $suppress_if_empty_warn_if_not
14783                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
14784
14785             # Calculate if this table should have any code points associated
14786             # with it or not.
14787             my $expected_empty =
14788
14789                 # $perl should be empty, as well as properties that we just
14790                 # don't do anything with
14791                 ($is_property
14792                     && ($table == $perl
14793                         || grep { $complete_name eq $_ }
14794                                                     @unimplemented_properties
14795                     )
14796                 )
14797
14798                 # Match tables in properties we skipped populating should be
14799                 # empty
14800                 || (! $is_property && ! $property->to_create_match_tables)
14801
14802                 # Tables and properties that are expected to have no code
14803                 # points should be empty
14804                 || $suppress_if_empty_warn_if_not
14805             ;
14806
14807             # Set a boolean if this table is the complement of an empty binary
14808             # table
14809             my $is_complement_of_empty_binary =
14810                 $type == $BINARY &&
14811                 (($table == $property->table('Y')
14812                     && $property->table('N')->is_empty)
14813                 || ($table == $property->table('N')
14814                     && $property->table('Y')->is_empty));
14815
14816             if ($table->is_empty) {
14817
14818                 if ($suppress_if_empty_warn_if_not) {
14819                     $table->set_fate($SUPPRESSED,
14820                                      $suppress_if_empty_warn_if_not);
14821                 }
14822
14823                 # Suppress (by skipping them) expected empty tables.
14824                 next TABLE if $expected_empty;
14825
14826                 # And setup to later output a warning for those that aren't
14827                 # known to be allowed to be empty.  Don't do the warning if
14828                 # this table is a child of another one to avoid duplicating
14829                 # the warning that should come from the parent one.
14830                 if (($table == $property || $table->parent == $table)
14831                     && $table->fate != $SUPPRESSED
14832                     && $table->fate != $MAP_PROXIED
14833                     && ! grep { $complete_name =~ /^$_$/ }
14834                                                     @tables_that_may_be_empty)
14835                 {
14836                     push @unhandled_properties, "$table";
14837                 }
14838
14839                 # An empty table is just the complement of everything.
14840                 $table->set_complement($Any) if $table != $property;
14841             }
14842             elsif ($expected_empty) {
14843                 my $because = "";
14844                 if ($suppress_if_empty_warn_if_not) {
14845                     $because = " because $suppress_if_empty_warn_if_not";
14846                 }
14847
14848                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
14849             }
14850
14851             # Some tables should match everything
14852             my $expected_full =
14853                 ($table->fate == $SUPPRESSED)
14854                 ? 0
14855                 : ($is_property)
14856                   ? # All these types of map tables will be full because
14857                     # they will have been populated with defaults
14858                     ($type == $ENUM || $type == $FORCED_BINARY)
14859
14860                   : # A match table should match everything if its method
14861                     # shows it should
14862                     ($table->matches_all
14863
14864                     # The complement of an empty binary table will match
14865                     # everything
14866                     || $is_complement_of_empty_binary
14867                     )
14868             ;
14869
14870             my $count = $table->count;
14871             if ($expected_full) {
14872                 if ($count != $MAX_UNICODE_CODEPOINTS) {
14873                     Carp::my_carp("$table matches only "
14874                     . clarify_number($count)
14875                     . " Unicode code points but should match "
14876                     . clarify_number($MAX_UNICODE_CODEPOINTS)
14877                     . " (off by "
14878                     .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
14879                     . ").  Proceeding anyway.");
14880                 }
14881
14882                 # Here is expected to be full.  If it is because it is the
14883                 # complement of an (empty) binary table that is to be
14884                 # suppressed, then suppress this one as well.
14885                 if ($is_complement_of_empty_binary) {
14886                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
14887                     my $opposing = $property->table($opposing_name);
14888                     my $opposing_status = $opposing->status;
14889                     if ($opposing_status) {
14890                         $table->set_status($opposing_status,
14891                                            $opposing->status_info);
14892                     }
14893                 }
14894             }
14895             elsif ($count == $MAX_UNICODE_CODEPOINTS) {
14896                 if ($table == $property || $table->leader == $table) {
14897                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
14898                 }
14899             }
14900
14901             if ($table->fate == $SUPPRESSED) {
14902                 if (! $is_property) {
14903                     my @children = $table->children;
14904                     foreach my $child (@children) {
14905                         if ($child->fate != $SUPPRESSED) {
14906                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
14907                         }
14908                     }
14909                 }
14910                 next TABLE;
14911
14912             }
14913
14914             if (! $is_property) {
14915
14916                 make_ucd_table_pod_entries($table) if $table->property == $perl;
14917
14918                 # Several things need to be done just once for each related
14919                 # group of match tables.  Do them on the parent.
14920                 if ($table->parent == $table) {
14921
14922                     # Add an entry in the pod file for the table; it also does
14923                     # the children.
14924                     make_re_pod_entries($table) if defined $pod_directory;
14925
14926                     # See if the the table matches identical code points with
14927                     # something that has already been output.  In that case,
14928                     # no need to have two files with the same code points in
14929                     # them.  We use the table's hash() method to store these
14930                     # in buckets, so that it is quite likely that if two
14931                     # tables are in the same bucket they will be identical, so
14932                     # don't have to compare tables frequently.  The tables
14933                     # have to have the same status to share a file, so add
14934                     # this to the bucket hash.  (The reason for this latter is
14935                     # that Heavy.pl associates a status with a file.)
14936                     # We don't check tables that are inverses of others, as it
14937                     # would lead to some coding complications, and checking
14938                     # all the regular ones should find everything.
14939                     if ($table->complement == 0) {
14940                         my $hash = $table->hash . ';' . $table->status;
14941
14942                         # Look at each table that is in the same bucket as
14943                         # this one would be.
14944                         foreach my $comparison
14945                                             (@{$match_tables_to_write{$hash}})
14946                         {
14947                             if ($table->matches_identically_to($comparison)) {
14948                                 $table->set_equivalent_to($comparison,
14949                                                                 Related => 0);
14950                                 next TABLE;
14951                             }
14952                         }
14953
14954                         # Here, not equivalent, add this table to the bucket.
14955                         push @{$match_tables_to_write{$hash}}, $table;
14956                     }
14957                 }
14958             }
14959             else {
14960
14961                 # Here is the property itself.
14962                 # Don't write out or make references to the $perl property
14963                 next if $table == $perl;
14964
14965                 make_ucd_table_pod_entries($table);
14966
14967                 # There is a mapping stored of the various synonyms to the
14968                 # standardized name of the property for utf8_heavy.pl.
14969                 # Also, the pod file contains entries of the form:
14970                 # \p{alias: *}         \p{full: *}
14971                 # rather than show every possible combination of things.
14972
14973                 my @property_aliases = $property->aliases;
14974
14975                 my $full_property_name = $property->full_name;
14976                 my $property_name = $property->name;
14977                 my $standard_property_name = standardize($property_name);
14978                 my $standard_property_full_name
14979                                         = standardize($full_property_name);
14980
14981                 # We also create for Unicode::UCD a list of aliases for
14982                 # the property.  The list starts with the property name;
14983                 # then its full name.
14984                 my @property_list;
14985                 my @standard_list;
14986                 if ( $property->fate <= $MAP_PROXIED) {
14987                     @property_list = ($property_name, $full_property_name);
14988                     @standard_list = ($standard_property_name,
14989                                         $standard_property_full_name);
14990                 }
14991
14992                 # For each synonym ...
14993                 for my $i (0 .. @property_aliases - 1)  {
14994                     my $alias = $property_aliases[$i];
14995                     my $alias_name = $alias->name;
14996                     my $alias_standard = standardize($alias_name);
14997
14998
14999                     # Add other aliases to the list of property aliases
15000                     if ($property->fate <= $MAP_PROXIED
15001                         && ! grep { $alias_standard eq $_ } @standard_list)
15002                     {
15003                         push @property_list, $alias_name;
15004                         push @standard_list, $alias_standard;
15005                     }
15006
15007                     # For utf8_heavy, set the mapping of the alias to the
15008                     # property
15009                     if ($type == $STRING) {
15010                         if ($property->fate <= $MAP_PROXIED) {
15011                             $string_property_loose_to_name{$alias_standard}
15012                                             = $standard_property_name;
15013                         }
15014                     }
15015                     else {
15016                         if (exists ($loose_property_name_of{$alias_standard}))
15017                         {
15018                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}.  Old name is retained");
15019                         }
15020                         else {
15021                             $loose_property_name_of{$alias_standard}
15022                                                 = $standard_property_name;
15023                         }
15024
15025                         # Now for the re pod entry for this alias.  Skip if not
15026                         # outputting a pod; skip the first one, which is the
15027                         # full name so won't have an entry like: '\p{full: *}
15028                         # \p{full: *}', and skip if don't want an entry for
15029                         # this one.
15030                         next if $i == 0
15031                                 || ! defined $pod_directory
15032                                 || ! $alias->make_re_pod_entry;
15033
15034                         my $rhs = "\\p{$full_property_name: *}";
15035                         if ($property != $perl && $table->perl_extension) {
15036                             $rhs .= ' (Perl extension)';
15037                         }
15038                         push @match_properties,
15039                             format_pod_line($indent_info_column,
15040                                         '\p{' . $alias->name . ': *}',
15041                                         $rhs,
15042                                         $alias->status);
15043                     }
15044                 }
15045
15046                 # The list of all possible names is attached to each alias, so
15047                 # lookup is easy
15048                 if (@property_list) {
15049                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
15050                 }
15051
15052                 if ($property->fate <= $MAP_PROXIED) {
15053
15054                     # Similarly, we create for Unicode::UCD a list of
15055                     # property-value aliases.
15056
15057                     my $property_full_name = $property->full_name;
15058
15059                     # Look at each table in the property...
15060                     foreach my $table ($property->tables) {
15061                         my @values_list;
15062                         my $table_full_name = $table->full_name;
15063                         my $standard_table_full_name
15064                                               = standardize($table_full_name);
15065                         my $table_name = $table->name;
15066                         my $standard_table_name = standardize($table_name);
15067
15068                         # The list starts with the table name and its full
15069                         # name.
15070                         push @values_list, $table_name, $table_full_name;
15071
15072                         # We add to the table each unique alias that isn't
15073                         # discouraged from use.
15074                         foreach my $alias ($table->aliases) {
15075                             next if $alias->status
15076                                  && $alias->status eq $DISCOURAGED;
15077                             my $name = $alias->name;
15078                             my $standard = standardize($name);
15079                             next if $standard eq $standard_table_name;
15080                             next if $standard eq $standard_table_full_name;
15081                             push @values_list, $name;
15082                         }
15083
15084                         # Here @values_list is a list of all the aliases for
15085                         # the table.  That is, all the property-values given
15086                         # by this table.  By agreement with Unicode::UCD,
15087                         # if the name and full name are identical, and there
15088                         # are no other names, drop the duplcate entry to save
15089                         # memory.
15090                         if (@values_list == 2
15091                             && $values_list[0] eq $values_list[1])
15092                         {
15093                             pop @values_list
15094                         }
15095
15096                         # To save memory, unlike the similar list for property
15097                         # aliases above, only the standard forms hve the list.
15098                         # This forces an extra step of converting from input
15099                         # name to standard name, but the savings are
15100                         # considerable.  (There is only marginal savings if we
15101                         # did this with the property aliases.)
15102                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
15103                     }
15104                 }
15105
15106                 # Don't write out a mapping file if not desired.
15107                 next if ! $property->to_output_map;
15108             }
15109
15110             # Here, we know we want to write out the table, but don't do it
15111             # yet because there may be other tables that come along and will
15112             # want to share the file, and the file's comments will change to
15113             # mention them.  So save for later.
15114             push @writables, $table;
15115
15116         } # End of looping through the property and all its tables.
15117     } # End of looping through all properties.
15118
15119     # Now have all the tables that will have files written for them.  Do it.
15120     foreach my $table (@writables) {
15121         my @directory;
15122         my $filename;
15123         my $property = $table->property;
15124         my $is_property = ($table == $property);
15125         if (! $is_property) {
15126
15127             # Match tables for the property go in lib/$subdirectory, which is
15128             # the property's name.  Don't use the standard file name for this,
15129             # as may get an unfamiliar alias
15130             @directory = ($matches_directory, $property->external_name);
15131         }
15132         else {
15133
15134             @directory = $table->directory;
15135             $filename = $table->file;
15136         }
15137
15138         # Use specified filename if available, or default to property's
15139         # shortest name.  We need an 8.3 safe filename (which means "an 8
15140         # safe" filename, since after the dot is only 'pl', which is < 3)
15141         # The 2nd parameter is if the filename shouldn't be changed, and
15142         # it shouldn't iff there is a hard-coded name for this table.
15143         $filename = construct_filename(
15144                                 $filename || $table->external_name,
15145                                 ! $filename,    # mutable if no filename
15146                                 \@directory);
15147
15148         register_file_for_name($table, \@directory, $filename);
15149
15150         # Only need to write one file when shared by more than one
15151         # property
15152         next if ! $is_property
15153                 && ($table->leader != $table || $table->complement != 0);
15154
15155         # Construct a nice comment to add to the file
15156         $table->set_final_comment;
15157
15158         $table->write;
15159     }
15160
15161
15162     # Write out the pod file
15163     make_pod;
15164
15165     # And Heavy.pl, Name.pm, UCD.pl
15166     make_Heavy;
15167     make_Name_pm;
15168     make_UCD;
15169
15170     make_property_test_script() if $make_test_script;
15171     return;
15172 }
15173
15174 my @white_space_separators = ( # This used only for making the test script.
15175                             "",
15176                             ' ',
15177                             "\t",
15178                             '   '
15179                         );
15180
15181 sub generate_separator($) {
15182     # This used only for making the test script.  It generates the colon or
15183     # equal separator between the property and property value, with random
15184     # white space surrounding the separator
15185
15186     my $lhs = shift;
15187
15188     return "" if $lhs eq "";  # No separator if there's only one (the r) side
15189
15190     # Choose space before and after randomly
15191     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
15192     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
15193
15194     # And return the whole complex, half the time using a colon, half the
15195     # equals
15196     return $spaces_before
15197             . (rand() < 0.5) ? '=' : ':'
15198             . $spaces_after;
15199 }
15200
15201 sub generate_tests($$$$$) {
15202     # This used only for making the test script.  It generates test cases that
15203     # are expected to compile successfully in perl.  Note that the lhs and
15204     # rhs are assumed to already be as randomized as the caller wants.
15205
15206     my $lhs = shift;           # The property: what's to the left of the colon
15207                                #  or equals separator
15208     my $rhs = shift;           # The property value; what's to the right
15209     my $valid_code = shift;    # A code point that's known to be in the
15210                                # table given by lhs=rhs; undef if table is
15211                                # empty
15212     my $invalid_code = shift;  # A code point known to not be in the table;
15213                                # undef if the table is all code points
15214     my $warning = shift;
15215
15216     # Get the colon or equal
15217     my $separator = generate_separator($lhs);
15218
15219     # The whole 'property=value'
15220     my $name = "$lhs$separator$rhs";
15221
15222     my @output;
15223     # Create a complete set of tests, with complements.
15224     if (defined $valid_code) {
15225         push @output, <<"EOC"
15226 Expect(1, $valid_code, '\\p{$name}', $warning);
15227 Expect(0, $valid_code, '\\p{^$name}', $warning);
15228 Expect(0, $valid_code, '\\P{$name}', $warning);
15229 Expect(1, $valid_code, '\\P{^$name}', $warning);
15230 EOC
15231     }
15232     if (defined $invalid_code) {
15233         push @output, <<"EOC"
15234 Expect(0, $invalid_code, '\\p{$name}', $warning);
15235 Expect(1, $invalid_code, '\\p{^$name}', $warning);
15236 Expect(1, $invalid_code, '\\P{$name}', $warning);
15237 Expect(0, $invalid_code, '\\P{^$name}', $warning);
15238 EOC
15239     }
15240     return @output;
15241 }
15242
15243 sub generate_error($$$) {
15244     # This used only for making the test script.  It generates test cases that
15245     # are expected to not only not match, but to be syntax or similar errors
15246
15247     my $lhs = shift;                # The property: what's to the left of the
15248                                     # colon or equals separator
15249     my $rhs = shift;                # The property value; what's to the right
15250     my $already_in_error = shift;   # Boolean; if true it's known that the
15251                                 # unmodified lhs and rhs will cause an error.
15252                                 # This routine should not force another one
15253     # Get the colon or equal
15254     my $separator = generate_separator($lhs);
15255
15256     # Since this is an error only, don't bother to randomly decide whether to
15257     # put the error on the left or right side; and assume that the rhs is
15258     # loosely matched, again for convenience rather than rigor.
15259     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
15260
15261     my $property = $lhs . $separator . $rhs;
15262
15263     return <<"EOC";
15264 Error('\\p{$property}');
15265 Error('\\P{$property}');
15266 EOC
15267 }
15268
15269 # These are used only for making the test script
15270 # XXX Maybe should also have a bad strict seps, which includes underscore.
15271
15272 my @good_loose_seps = (
15273             " ",
15274             "-",
15275             "\t",
15276             "",
15277             "_",
15278            );
15279 my @bad_loose_seps = (
15280            "/a/",
15281            ':=',
15282           );
15283
15284 sub randomize_stricter_name {
15285     # This used only for making the test script.  Take the input name and
15286     # return a randomized, but valid version of it under the stricter matching
15287     # rules.
15288
15289     my $name = shift;
15290     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15291
15292     # If the name looks like a number (integer, floating, or rational), do
15293     # some extra work
15294     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
15295         my $sign = $1;
15296         my $number = $2;
15297         my $separator = $3;
15298
15299         # If there isn't a sign, part of the time add a plus
15300         # Note: Not testing having any denominator having a minus sign
15301         if (! $sign) {
15302             $sign = '+' if rand() <= .3;
15303         }
15304
15305         # And add 0 or more leading zeros.
15306         $name = $sign . ('0' x int rand(10)) . $number;
15307
15308         if (defined $separator) {
15309             my $extra_zeros = '0' x int rand(10);
15310
15311             if ($separator eq '.') {
15312
15313                 # Similarly, add 0 or more trailing zeros after a decimal
15314                 # point
15315                 $name .= $extra_zeros;
15316             }
15317             else {
15318
15319                 # Or, leading zeros before the denominator
15320                 $name =~ s,/,/$extra_zeros,;
15321             }
15322         }
15323     }
15324
15325     # For legibility of the test, only change the case of whole sections at a
15326     # time.  To do this, first split into sections.  The split returns the
15327     # delimiters
15328     my @sections;
15329     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
15330         trace $section if main::DEBUG && $to_trace;
15331
15332         if (length $section > 1 && $section !~ /\D/) {
15333
15334             # If the section is a sequence of digits, about half the time
15335             # randomly add underscores between some of them.
15336             if (rand() > .5) {
15337
15338                 # Figure out how many underscores to add.  max is 1 less than
15339                 # the number of digits.  (But add 1 at the end to make sure
15340                 # result isn't 0, and compensate earlier by subtracting 2
15341                 # instead of 1)
15342                 my $num_underscores = int rand(length($section) - 2) + 1;
15343
15344                 # And add them evenly throughout, for convenience, not rigor
15345                 use integer;
15346                 my $spacing = (length($section) - 1)/ $num_underscores;
15347                 my $temp = $section;
15348                 $section = "";
15349                 for my $i (1 .. $num_underscores) {
15350                     $section .= substr($temp, 0, $spacing, "") . '_';
15351                 }
15352                 $section .= $temp;
15353             }
15354             push @sections, $section;
15355         }
15356         else {
15357
15358             # Here not a sequence of digits.  Change the case of the section
15359             # randomly
15360             my $switch = int rand(4);
15361             if ($switch == 0) {
15362                 push @sections, uc $section;
15363             }
15364             elsif ($switch == 1) {
15365                 push @sections, lc $section;
15366             }
15367             elsif ($switch == 2) {
15368                 push @sections, ucfirst $section;
15369             }
15370             else {
15371                 push @sections, $section;
15372             }
15373         }
15374     }
15375     trace "returning", join "", @sections if main::DEBUG && $to_trace;
15376     return join "", @sections;
15377 }
15378
15379 sub randomize_loose_name($;$) {
15380     # This used only for making the test script
15381
15382     my $name = shift;
15383     my $want_error = shift;  # if true, make an error
15384     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15385
15386     $name = randomize_stricter_name($name);
15387
15388     my @parts;
15389     push @parts, $good_loose_seps[rand(@good_loose_seps)];
15390
15391     # Preserve trailing ones for the sake of not stripping the underscore from
15392     # 'L_'
15393     for my $part (split /[-\s_]+ (?= . )/, $name) {
15394         if (@parts) {
15395             if ($want_error and rand() < 0.3) {
15396                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
15397                 $want_error = 0;
15398             }
15399             else {
15400                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
15401             }
15402         }
15403         push @parts, $part;
15404     }
15405     my $new = join("", @parts);
15406     trace "$name => $new" if main::DEBUG && $to_trace;
15407
15408     if ($want_error) {
15409         if (rand() >= 0.5) {
15410             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
15411         }
15412         else {
15413             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
15414         }
15415     }
15416     return $new;
15417 }
15418
15419 # Used to make sure don't generate duplicate test cases.
15420 my %test_generated;
15421
15422 sub make_property_test_script() {
15423     # This used only for making the test script
15424     # this written directly -- it's huge.
15425
15426     print "Making test script\n" if $verbosity >= $PROGRESS;
15427
15428     # This uses randomness to test different possibilities without testing all
15429     # possibilities.  To ensure repeatability, set the seed to 0.  But if
15430     # tests are added, it will perturb all later ones in the .t file
15431     srand 0;
15432
15433     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
15434
15435     # Keep going down an order of magnitude
15436     # until find that adding this quantity to
15437     # 1 remains 1; but put an upper limit on
15438     # this so in case this algorithm doesn't
15439     # work properly on some platform, that we
15440     # won't loop forever.
15441     my $digits = 0;
15442     my $min_floating_slop = 1;
15443     while (1+ $min_floating_slop != 1
15444             && $digits++ < 50)
15445     {
15446         my $next = $min_floating_slop / 10;
15447         last if $next == 0; # If underflows,
15448                             # use previous one
15449         $min_floating_slop = $next;
15450     }
15451
15452     # It doesn't matter whether the elements of this array contain single lines
15453     # or multiple lines. main::write doesn't count the lines.
15454     my @output;
15455
15456     foreach my $property (property_ref('*')) {
15457         foreach my $table ($property->tables) {
15458
15459             # Find code points that match, and don't match this table.
15460             my $valid = $table->get_valid_code_point;
15461             my $invalid = $table->get_invalid_code_point;
15462             my $warning = ($table->status eq $DEPRECATED)
15463                             ? "'deprecated'"
15464                             : '""';
15465
15466             # Test each possible combination of the property's aliases with
15467             # the table's.  If this gets to be too many, could do what is done
15468             # in the set_final_comment() for Tables
15469             my @table_aliases = $table->aliases;
15470             my @property_aliases = $table->property->aliases;
15471
15472             # Every property can be optionally be prefixed by 'Is_', so test
15473             # that those work, by creating such a new alias for each
15474             # pre-existing one.
15475             push @property_aliases, map { Alias->new("Is_" . $_->name,
15476                                                     $_->loose_match,
15477                                                     $_->make_re_pod_entry,
15478                                                     $_->ok_as_filename,
15479                                                     $_->status,
15480                                                     $_->ucd,
15481                                                     )
15482                                          } @property_aliases;
15483             my $max = max(scalar @table_aliases, scalar @property_aliases);
15484             for my $j (0 .. $max - 1) {
15485
15486                 # The current alias for property is the next one on the list,
15487                 # or if beyond the end, start over.  Similarly for table
15488                 my $property_name
15489                             = $property_aliases[$j % @property_aliases]->name;
15490
15491                 $property_name = "" if $table->property == $perl;
15492                 my $table_alias = $table_aliases[$j % @table_aliases];
15493                 my $table_name = $table_alias->name;
15494                 my $loose_match = $table_alias->loose_match;
15495
15496                 # If the table doesn't have a file, any test for it is
15497                 # already guaranteed to be in error
15498                 my $already_error = ! $table->file_path;
15499
15500                 # Generate error cases for this alias.
15501                 push @output, generate_error($property_name,
15502                                              $table_name,
15503                                              $already_error);
15504
15505                 # If the table is guaranteed to always generate an error,
15506                 # quit now without generating success cases.
15507                 next if $already_error;
15508
15509                 # Now for the success cases.
15510                 my $random;
15511                 if ($loose_match) {
15512
15513                     # For loose matching, create an extra test case for the
15514                     # standard name.
15515                     my $standard = standardize($table_name);
15516
15517                     # $test_name should be a unique combination for each test
15518                     # case; used just to avoid duplicate tests
15519                     my $test_name = "$property_name=$standard";
15520
15521                     # Don't output duplicate test cases.
15522                     if (! exists $test_generated{$test_name}) {
15523                         $test_generated{$test_name} = 1;
15524                         push @output, generate_tests($property_name,
15525                                                      $standard,
15526                                                      $valid,
15527                                                      $invalid,
15528                                                      $warning,
15529                                                  );
15530                     }
15531                     $random = randomize_loose_name($table_name)
15532                 }
15533                 else { # Stricter match
15534                     $random = randomize_stricter_name($table_name);
15535                 }
15536
15537                 # Now for the main test case for this alias.
15538                 my $test_name = "$property_name=$random";
15539                 if (! exists $test_generated{$test_name}) {
15540                     $test_generated{$test_name} = 1;
15541                     push @output, generate_tests($property_name,
15542                                                  $random,
15543                                                  $valid,
15544                                                  $invalid,
15545                                                  $warning,
15546                                              );
15547
15548                     # If the name is a rational number, add tests for the
15549                     # floating point equivalent.
15550                     if ($table_name =~ qr{/}) {
15551
15552                         # Calculate the float, and find just the fraction.
15553                         my $float = eval $table_name;
15554                         my ($whole, $fraction)
15555                                             = $float =~ / (.*) \. (.*) /x;
15556
15557                         # Starting with one digit after the decimal point,
15558                         # create a test for each possible precision (number of
15559                         # digits past the decimal point) until well beyond the
15560                         # native number found on this machine.  (If we started
15561                         # with 0 digits, it would be an integer, which could
15562                         # well match an unrelated table)
15563                         PLACE:
15564                         for my $i (1 .. $min_floating_slop + 3) {
15565                             my $table_name = sprintf("%.*f", $i, $float);
15566                             if ($i < $MIN_FRACTION_LENGTH) {
15567
15568                                 # If the test case has fewer digits than the
15569                                 # minimum acceptable precision, it shouldn't
15570                                 # succeed, so we expect an error for it.
15571                                 # E.g., 2/3 = .7 at one decimal point, and we
15572                                 # shouldn't say it matches .7.  We should make
15573                                 # it be .667 at least before agreeing that the
15574                                 # intent was to match 2/3.  But at the
15575                                 # less-than- acceptable level of precision, it
15576                                 # might actually match an unrelated number.
15577                                 # So don't generate a test case if this
15578                                 # conflating is possible.  In our example, we
15579                                 # don't want 2/3 matching 7/10, if there is
15580                                 # a 7/10 code point.
15581                                 for my $existing
15582                                         (keys %nv_floating_to_rational)
15583                                 {
15584                                     next PLACE
15585                                         if abs($table_name - $existing)
15586                                                 < $MAX_FLOATING_SLOP;
15587                                 }
15588                                 push @output, generate_error($property_name,
15589                                                              $table_name,
15590                                                              1   # 1 => already an error
15591                                               );
15592                             }
15593                             else {
15594
15595                                 # Here the number of digits exceeds the
15596                                 # minimum we think is needed.  So generate a
15597                                 # success test case for it.
15598                                 push @output, generate_tests($property_name,
15599                                                              $table_name,
15600                                                              $valid,
15601                                                              $invalid,
15602                                                              $warning,
15603                                              );
15604                             }
15605                         }
15606                     }
15607                 }
15608             }
15609         }
15610     }
15611
15612     &write($t_path,
15613            0,           # Not utf8;
15614            [<DATA>,
15615             @output,
15616             (map {"Test_X('$_');\n"} @backslash_X_tests),
15617             "Finished();\n"]);
15618     return;
15619 }
15620
15621 # This is a list of the input files and how to handle them.  The files are
15622 # processed in their order in this list.  Some reordering is possible if
15623 # desired, but the v0 files should be first, and the extracted before the
15624 # others except DAge.txt (as data in an extracted file can be over-ridden by
15625 # the non-extracted.  Some other files depend on data derived from an earlier
15626 # file, like UnicodeData requires data from Jamo, and the case changing and
15627 # folding requires data from Unicode.  Mostly, it safest to order by first
15628 # version releases in (except the Jamo).  DAge.txt is read before the
15629 # extracted ones because of the rarely used feature $compare_versions.  In the
15630 # unlikely event that there were ever an extracted file that contained the Age
15631 # property information, it would have to go in front of DAge.
15632 #
15633 # The version strings allow the program to know whether to expect a file or
15634 # not, but if a file exists in the directory, it will be processed, even if it
15635 # is in a version earlier than expected, so you can copy files from a later
15636 # release into an earlier release's directory.
15637 my @input_file_objects = (
15638     Input_file->new('PropertyAliases.txt', v0,
15639                     Handler => \&process_PropertyAliases,
15640                     ),
15641     Input_file->new(undef, v0,  # No file associated with this
15642                     Progress_Message => 'Finishing property setup',
15643                     Handler => \&finish_property_setup,
15644                     ),
15645     Input_file->new('PropValueAliases.txt', v0,
15646                      Handler => \&process_PropValueAliases,
15647                      Has_Missings_Defaults => $NOT_IGNORED,
15648                      ),
15649     Input_file->new('DAge.txt', v3.2.0,
15650                     Has_Missings_Defaults => $NOT_IGNORED,
15651                     Property => 'Age'
15652                     ),
15653     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
15654                     Property => 'General_Category',
15655                     ),
15656     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
15657                     Property => 'Canonical_Combining_Class',
15658                     Has_Missings_Defaults => $NOT_IGNORED,
15659                     ),
15660     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
15661                     Property => 'Numeric_Type',
15662                     Has_Missings_Defaults => $NOT_IGNORED,
15663                     ),
15664     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
15665                     Property => 'East_Asian_Width',
15666                     Has_Missings_Defaults => $NOT_IGNORED,
15667                     ),
15668     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
15669                     Property => 'Line_Break',
15670                     Has_Missings_Defaults => $NOT_IGNORED,
15671                     ),
15672     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
15673                     Property => 'Bidi_Class',
15674                     Has_Missings_Defaults => $NOT_IGNORED,
15675                     ),
15676     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
15677                     Property => 'Decomposition_Type',
15678                     Has_Missings_Defaults => $NOT_IGNORED,
15679                     ),
15680     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
15681     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
15682                     Property => 'Numeric_Value',
15683                     Each_Line_Handler => \&filter_numeric_value_line,
15684                     Has_Missings_Defaults => $NOT_IGNORED,
15685                     ),
15686     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
15687                     Property => 'Joining_Group',
15688                     Has_Missings_Defaults => $NOT_IGNORED,
15689                     ),
15690
15691     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
15692                     Property => 'Joining_Type',
15693                     Has_Missings_Defaults => $NOT_IGNORED,
15694                     ),
15695     Input_file->new('Jamo.txt', v2.0.0,
15696                     Property => 'Jamo_Short_Name',
15697                     Each_Line_Handler => \&filter_jamo_line,
15698                     ),
15699     Input_file->new('UnicodeData.txt', v1.1.5,
15700                     Pre_Handler => \&setup_UnicodeData,
15701
15702                     # We clean up this file for some early versions.
15703                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
15704                                             ? \&filter_v1_ucd
15705                                             : ($v_version eq v2.1.5)
15706                                                 ? \&filter_v2_1_5_ucd
15707
15708                                                 # And for 5.14 Perls with 6.0,
15709                                                 # have to also make changes
15710                                                 : ($v_version ge v6.0.0)
15711                                                     ? \&filter_v6_ucd
15712                                                     : undef),
15713
15714                                             # And the main filter
15715                                             \&filter_UnicodeData_line,
15716                                          ],
15717                     EOF_Handler => \&EOF_UnicodeData,
15718                     ),
15719     Input_file->new('ArabicShaping.txt', v2.0.0,
15720                     Each_Line_Handler =>
15721                         [ ($v_version lt 4.1.0)
15722                                     ? \&filter_old_style_arabic_shaping
15723                                     : undef,
15724                         \&filter_arabic_shaping_line,
15725                         ],
15726                     Has_Missings_Defaults => $NOT_IGNORED,
15727                     ),
15728     Input_file->new('Blocks.txt', v2.0.0,
15729                     Property => 'Block',
15730                     Has_Missings_Defaults => $NOT_IGNORED,
15731                     Each_Line_Handler => \&filter_blocks_lines
15732                     ),
15733     Input_file->new('PropList.txt', v2.0.0,
15734                     Each_Line_Handler => (($v_version lt v3.1.0)
15735                                             ? \&filter_old_style_proplist
15736                                             : undef),
15737                     ),
15738     Input_file->new('Unihan.txt', v2.0.0,
15739                     Pre_Handler => \&setup_unihan,
15740                     Optional => 1,
15741                     Each_Line_Handler => \&filter_unihan_line,
15742                         ),
15743     Input_file->new('SpecialCasing.txt', v2.1.8,
15744                     Each_Line_Handler => \&filter_special_casing_line,
15745                     Pre_Handler => \&setup_special_casing,
15746                     Has_Missings_Defaults => $IGNORED,
15747                     ),
15748     Input_file->new(
15749                     'LineBreak.txt', v3.0.0,
15750                     Has_Missings_Defaults => $NOT_IGNORED,
15751                     Property => 'Line_Break',
15752                     # Early versions had problematic syntax
15753                     Each_Line_Handler => (($v_version lt v3.1.0)
15754                                         ? \&filter_early_ea_lb
15755                                         : undef),
15756                     ),
15757     Input_file->new('EastAsianWidth.txt', v3.0.0,
15758                     Property => 'East_Asian_Width',
15759                     Has_Missings_Defaults => $NOT_IGNORED,
15760                     # Early versions had problematic syntax
15761                     Each_Line_Handler => (($v_version lt v3.1.0)
15762                                         ? \&filter_early_ea_lb
15763                                         : undef),
15764                     ),
15765     Input_file->new('CompositionExclusions.txt', v3.0.0,
15766                     Property => 'Composition_Exclusion',
15767                     ),
15768     Input_file->new('BidiMirroring.txt', v3.0.1,
15769                     Property => 'Bidi_Mirroring_Glyph',
15770                     ),
15771     Input_file->new("NormalizationTest.txt", v3.0.1,
15772                     Skip => 'Validation Tests',
15773                     ),
15774     Input_file->new('CaseFolding.txt', v3.0.1,
15775                     Pre_Handler => \&setup_case_folding,
15776                     Each_Line_Handler =>
15777                         [ ($v_version lt v3.1.0)
15778                                  ? \&filter_old_style_case_folding
15779                                  : undef,
15780                            \&filter_case_folding_line
15781                         ],
15782                     Has_Missings_Defaults => $IGNORED,
15783                     ),
15784     Input_file->new('DCoreProperties.txt', v3.1.0,
15785                     # 5.2 changed this file
15786                     Has_Missings_Defaults => (($v_version ge v5.2.0)
15787                                             ? $NOT_IGNORED
15788                                             : $NO_DEFAULTS),
15789                     ),
15790     Input_file->new('Scripts.txt', v3.1.0,
15791                     Property => 'Script',
15792                     Has_Missings_Defaults => $NOT_IGNORED,
15793                     ),
15794     Input_file->new('DNormalizationProps.txt', v3.1.0,
15795                     Has_Missings_Defaults => $NOT_IGNORED,
15796                     Each_Line_Handler => (($v_version lt v4.0.1)
15797                                       ? \&filter_old_style_normalization_lines
15798                                       : undef),
15799                     ),
15800     Input_file->new('HangulSyllableType.txt', v4.0.0,
15801                     Has_Missings_Defaults => $NOT_IGNORED,
15802                     Property => 'Hangul_Syllable_Type'),
15803     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
15804                     Property => 'Word_Break',
15805                     Has_Missings_Defaults => $NOT_IGNORED,
15806                     ),
15807     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
15808                     Property => 'Grapheme_Cluster_Break',
15809                     Has_Missings_Defaults => $NOT_IGNORED,
15810                     ),
15811     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
15812                     Handler => \&process_GCB_test,
15813                     ),
15814     Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
15815                     Skip => 'Validation Tests',
15816                     ),
15817     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
15818                     Skip => 'Validation Tests',
15819                     ),
15820     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
15821                     Skip => 'Validation Tests',
15822                     ),
15823     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
15824                     Property => 'Sentence_Break',
15825                     Has_Missings_Defaults => $NOT_IGNORED,
15826                     ),
15827     Input_file->new('NamedSequences.txt', v4.1.0,
15828                     Handler => \&process_NamedSequences
15829                     ),
15830     Input_file->new('NameAliases.txt', v5.0.0,
15831                     Property => 'Name_Alias',
15832                     Pre_Handler => ($v_version ge v6.0.0)
15833                                    ? \&setup_v6_name_alias
15834                                    : undef,
15835                     ),
15836     Input_file->new("BidiTest.txt", v5.2.0,
15837                     Skip => 'Validation Tests',
15838                     ),
15839     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
15840                     Optional => 1,
15841                     Each_Line_Handler => \&filter_unihan_line,
15842                     ),
15843     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
15844                     Optional => 1,
15845                     Each_Line_Handler => \&filter_unihan_line,
15846                     ),
15847     Input_file->new('UnihanIRGSources.txt', v5.2.0,
15848                     Optional => 1,
15849                     Pre_Handler => \&setup_unihan,
15850                     Each_Line_Handler => \&filter_unihan_line,
15851                     ),
15852     Input_file->new('UnihanNumericValues.txt', v5.2.0,
15853                     Optional => 1,
15854                     Each_Line_Handler => \&filter_unihan_line,
15855                     ),
15856     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
15857                     Optional => 1,
15858                     Each_Line_Handler => \&filter_unihan_line,
15859                     ),
15860     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
15861                     Optional => 1,
15862                     Each_Line_Handler => \&filter_unihan_line,
15863                     ),
15864     Input_file->new('UnihanReadings.txt', v5.2.0,
15865                     Optional => 1,
15866                     Each_Line_Handler => \&filter_unihan_line,
15867                     ),
15868     Input_file->new('UnihanVariants.txt', v5.2.0,
15869                     Optional => 1,
15870                     Each_Line_Handler => \&filter_unihan_line,
15871                     ),
15872     Input_file->new('ScriptExtensions.txt', v6.0.0,
15873                     Property => 'Script_Extensions',
15874                     Pre_Handler => \&setup_script_extensions,
15875                     Each_Line_Handler => \&filter_script_extensions_line,
15876                     Has_Missings_Defaults => (($v_version le v6.0.0)
15877                                             ? $NO_DEFAULTS
15878                                             : $IGNORED),
15879                     ),
15880     # The two Indic files are actually available starting in v6.0.0, but their
15881     # property values are missing from PropValueAliases.txt in that release,
15882     # so that further work would have to be done to get them to work properly
15883     # for that release.
15884     Input_file->new('IndicMatraCategory.txt', v6.1.0,
15885                     Property => 'Indic_Matra_Category',
15886                     Has_Missings_Defaults => $NOT_IGNORED,
15887                     Skip => "Provisional; for the analysis and processing of Indic scripts",
15888                     ),
15889     Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
15890                     Property => 'Indic_Syllabic_Category',
15891                     Has_Missings_Defaults => $NOT_IGNORED,
15892                     Skip => "Provisional; for the analysis and processing of Indic scripts",
15893                     ),
15894 );
15895
15896 # End of all the preliminaries.
15897 # Do it...
15898
15899 if ($compare_versions) {
15900     Carp::my_carp(<<END
15901 Warning.  \$compare_versions is set.  Output is not suitable for production
15902 END
15903     );
15904 }
15905
15906 # Put into %potential_files a list of all the files in the directory structure
15907 # that could be inputs to this program, excluding those that we should ignore.
15908 # Use absolute file names because it makes it easier across machine types.
15909 my @ignored_files_full_names = map { File::Spec->rel2abs(
15910                                      internal_file_to_platform($_))
15911                                 } keys %ignored_files;
15912 File::Find::find({
15913     wanted=>sub {
15914         return unless /\.txt$/i;  # Some platforms change the name's case
15915         my $full = lc(File::Spec->rel2abs($_));
15916         $potential_files{$full} = 1
15917                     if ! grep { $full eq lc($_) } @ignored_files_full_names;
15918         return;
15919     }
15920 }, File::Spec->curdir());
15921
15922 my @mktables_list_output_files;
15923 my $old_start_time = 0;
15924
15925 if (! -e $file_list) {
15926     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
15927     $write_unchanged_files = 1;
15928 } elsif ($write_unchanged_files) {
15929     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
15930 }
15931 else {
15932     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
15933     my $file_handle;
15934     if (! open $file_handle, "<", $file_list) {
15935         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
15936         $glob_list = 1;
15937     }
15938     else {
15939         my @input;
15940
15941         # Read and parse mktables.lst, placing the results from the first part
15942         # into @input, and the second part into @mktables_list_output_files
15943         for my $list ( \@input, \@mktables_list_output_files ) {
15944             while (<$file_handle>) {
15945                 s/^ \s+ | \s+ $//xg;
15946                 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
15947                     $old_start_time = $1;
15948                 }
15949                 next if /^ \s* (?: \# .* )? $/x;
15950                 last if /^ =+ $/x;
15951                 my ( $file ) = split /\t/;
15952                 push @$list, $file;
15953             }
15954             @$list = uniques(@$list);
15955             next;
15956         }
15957
15958         # Look through all the input files
15959         foreach my $input (@input) {
15960             next if $input eq 'version'; # Already have checked this.
15961
15962             # Ignore if doesn't exist.  The checking about whether we care or
15963             # not is done via the Input_file object.
15964             next if ! file_exists($input);
15965
15966             # The paths are stored with relative names, and with '/' as the
15967             # delimiter; convert to absolute on this machine
15968             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
15969             $potential_files{lc $full} = 1
15970                 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
15971         }
15972     }
15973
15974     close $file_handle;
15975 }
15976
15977 if ($glob_list) {
15978
15979     # Here wants to process all .txt files in the directory structure.
15980     # Convert them to full path names.  They are stored in the platform's
15981     # relative style
15982     my @known_files;
15983     foreach my $object (@input_file_objects) {
15984         my $file = $object->file;
15985         next unless defined $file;
15986         push @known_files, File::Spec->rel2abs($file);
15987     }
15988
15989     my @unknown_input_files;
15990     foreach my $file (keys %potential_files) {  # The keys are stored in lc
15991         next if grep { $file eq lc($_) } @known_files;
15992
15993         # Here, the file is unknown to us.  Get relative path name
15994         $file = File::Spec->abs2rel($file);
15995         push @unknown_input_files, $file;
15996
15997         # What will happen is we create a data structure for it, and add it to
15998         # the list of input files to process.  First get the subdirectories
15999         # into an array
16000         my (undef, $directories, undef) = File::Spec->splitpath($file);
16001         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
16002         my @directories = File::Spec->splitdir($directories);
16003
16004         # If the file isn't extracted (meaning none of the directories is the
16005         # extracted one), just add it to the end of the list of inputs.
16006         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
16007             push @input_file_objects, Input_file->new($file, v0);
16008         }
16009         else {
16010
16011             # Here, the file is extracted.  It needs to go ahead of most other
16012             # processing.  Search for the first input file that isn't a
16013             # special required property (that is, find one whose first_release
16014             # is non-0), and isn't extracted.  Also, the Age property file is
16015             # processed before the extracted ones, just in case
16016             # $compare_versions is set.
16017             for (my $i = 0; $i < @input_file_objects; $i++) {
16018                 if ($input_file_objects[$i]->first_released ne v0
16019                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
16020                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
16021                 {
16022                     splice @input_file_objects, $i, 0,
16023                                                 Input_file->new($file, v0);
16024                     last;
16025                 }
16026             }
16027
16028         }
16029     }
16030     if (@unknown_input_files) {
16031         print STDERR simple_fold(join_lines(<<END
16032
16033 The following files are unknown as to how to handle.  Assuming they are
16034 typical property files.  You'll know by later error messages if it worked or
16035 not:
16036 END
16037         ) . " " . join(", ", @unknown_input_files) . "\n\n");
16038     }
16039 } # End of looking through directory structure for more .txt files.
16040
16041 # Create the list of input files from the objects we have defined, plus
16042 # version
16043 my @input_files = 'version';
16044 foreach my $object (@input_file_objects) {
16045     my $file = $object->file;
16046     next if ! defined $file;    # Not all objects have files
16047     next if $object->optional && ! -e $file;
16048     push @input_files,  $file;
16049 }
16050
16051 if ( $verbosity >= $VERBOSE ) {
16052     print "Expecting ".scalar( @input_files )." input files. ",
16053          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
16054 }
16055
16056 # We set $most_recent to be the most recently changed input file, including
16057 # this program itself (done much earlier in this file)
16058 foreach my $in (@input_files) {
16059     next unless -e $in;        # Keep going even if missing a file
16060     my $mod_time = (stat $in)[9];
16061     $most_recent = $mod_time if $mod_time > $most_recent;
16062
16063     # See that the input files have distinct names, to warn someone if they
16064     # are adding a new one
16065     if ($make_list) {
16066         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
16067         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
16068         my @directories = File::Spec->splitdir($directories);
16069         my $base = $file =~ s/\.txt$//;
16070         construct_filename($file, 'mutable', \@directories);
16071     }
16072 }
16073
16074 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
16075               || ! scalar @mktables_list_output_files  # or if no outputs known
16076               || $old_start_time < $most_recent;       # or out-of-date
16077
16078 # Now we check to see if any output files are older than youngest, if
16079 # they are, we need to continue on, otherwise we can presumably bail.
16080 if (! $rebuild) {
16081     foreach my $out (@mktables_list_output_files) {
16082         if ( ! file_exists($out)) {
16083             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
16084             $rebuild = 1;
16085             last;
16086          }
16087         #local $to_trace = 1 if main::DEBUG;
16088         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
16089         if ( (stat $out)[9] <= $most_recent ) {
16090             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
16091             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
16092             $rebuild = 1;
16093             last;
16094         }
16095     }
16096 }
16097 if (! $rebuild) {
16098     print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
16099     exit(0);
16100 }
16101 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
16102
16103 # Ready to do the major processing.  First create the perl pseudo-property.
16104 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
16105
16106 # Process each input file
16107 foreach my $file (@input_file_objects) {
16108     $file->run;
16109 }
16110
16111 # Finish the table generation.
16112
16113 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
16114 finish_Unicode();
16115
16116 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
16117 compile_perl();
16118
16119 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
16120 add_perl_synonyms();
16121
16122 print "Writing tables\n" if $verbosity >= $PROGRESS;
16123 write_all_tables();
16124
16125 # Write mktables.lst
16126 if ( $file_list and $make_list ) {
16127
16128     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
16129     foreach my $file (@input_files, @files_actually_output) {
16130         my (undef, $directories, $file) = File::Spec->splitpath($file);
16131         my @directories = File::Spec->splitdir($directories);
16132         $file = join '/', @directories, $file;
16133     }
16134
16135     my $ofh;
16136     if (! open $ofh,">",$file_list) {
16137         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
16138         return
16139     }
16140     else {
16141         my $localtime = localtime $start_time;
16142         print $ofh <<"END";
16143 #
16144 # $file_list -- File list for $0.
16145 #
16146 #   Autogenerated starting on $start_time ($localtime)
16147 #
16148 # - First section is input files
16149 #   ($0 itself is not listed but is automatically considered an input)
16150 # - Section separator is /^=+\$/
16151 # - Second section is a list of output files.
16152 # - Lines matching /^\\s*#/ are treated as comments
16153 #   which along with blank lines are ignored.
16154 #
16155
16156 # Input files:
16157
16158 END
16159         print $ofh "$_\n" for sort(@input_files);
16160         print $ofh "\n=================================\n# Output files:\n\n";
16161         print $ofh "$_\n" for sort @files_actually_output;
16162         print $ofh "\n# ",scalar(@input_files)," input files\n",
16163                 "# ",scalar(@files_actually_output)+1," output files\n\n",
16164                 "# End list\n";
16165         close $ofh
16166             or Carp::my_carp("Failed to close $ofh: $!");
16167
16168         print "Filelist has ",scalar(@input_files)," input files and ",
16169             scalar(@files_actually_output)+1," output files\n"
16170             if $verbosity >= $VERBOSE;
16171     }
16172 }
16173
16174 # Output these warnings unless -q explicitly specified.
16175 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
16176     if (@unhandled_properties) {
16177         print "\nProperties and tables that unexpectedly have no code points\n";
16178         foreach my $property (sort @unhandled_properties) {
16179             print $property, "\n";
16180         }
16181     }
16182
16183     if (%potential_files) {
16184         print "\nInput files that are not considered:\n";
16185         foreach my $file (sort keys %potential_files) {
16186             print File::Spec->abs2rel($file), "\n";
16187         }
16188     }
16189     print "\nAll done\n" if $verbosity >= $VERBOSE;
16190 }
16191 exit(0);
16192
16193 # TRAILING CODE IS USED BY make_property_test_script()
16194 __DATA__
16195
16196 use strict;
16197 use warnings;
16198
16199 # If run outside the normal test suite on an ASCII platform, you can
16200 # just create a latin1_to_native() function that just returns its
16201 # inputs, because that's the only function used from test.pl
16202 require "test.pl";
16203
16204 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
16205 # constructed by mktables from the tables it generates, so if mktables is
16206 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
16207 # feasible properties; a few aren't currently feasible; see
16208 # is_code_point_usable() in mktables for details.
16209
16210 # Standard test packages are not used because this manipulates SIG_WARN.  It
16211 # exits 0 if every non-skipped test succeeded; -1 if any failed.
16212
16213 my $Tests = 0;
16214 my $Fails = 0;
16215
16216 sub Expect($$$$) {
16217     my $expected = shift;
16218     my $ord = shift;
16219     my $regex  = shift;
16220     my $warning_type = shift;   # Type of warning message, like 'deprecated'
16221                                 # or empty if none
16222     my $line   = (caller)[2];
16223     $ord = ord(latin1_to_native(chr($ord)));
16224
16225     # Convert the code point to hex form
16226     my $string = sprintf "\"\\x{%04X}\"", $ord;
16227
16228     my @tests = "";
16229
16230     # The first time through, use all warnings.  If the input should generate
16231     # a warning, add another time through with them turned off
16232     push @tests, "no warnings '$warning_type';" if $warning_type;
16233
16234     foreach my $no_warnings (@tests) {
16235
16236         # Store any warning messages instead of outputting them
16237         local $SIG{__WARN__} = $SIG{__WARN__};
16238         my $warning_message;
16239         $SIG{__WARN__} = sub { $warning_message = $_[0] };
16240
16241         $Tests++;
16242
16243         # A string eval is needed because of the 'no warnings'.
16244         # Assumes no parens in the regular expression
16245         my $result = eval "$no_warnings
16246                             my \$RegObj = qr($regex);
16247                             $string =~ \$RegObj ? 1 : 0";
16248         if (not defined $result) {
16249             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
16250             $Fails++;
16251         }
16252         elsif ($result ^ $expected) {
16253             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
16254             $Fails++;
16255         }
16256         elsif ($warning_message) {
16257             if (! $warning_type || ($warning_type && $no_warnings)) {
16258                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
16259                 $Fails++;
16260             }
16261             else {
16262                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
16263             }
16264         }
16265         elsif ($warning_type && ! $no_warnings) {
16266             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
16267             $Fails++;
16268         }
16269         else {
16270             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
16271         }
16272     }
16273     return;
16274 }
16275
16276 sub Error($) {
16277     my $regex  = shift;
16278     $Tests++;
16279     if (eval { 'x' =~ qr/$regex/; 1 }) {
16280         $Fails++;
16281         my $line = (caller)[2];
16282         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
16283     }
16284     else {
16285         my $line = (caller)[2];
16286         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
16287     }
16288     return;
16289 }
16290
16291 # GCBTest.txt character that separates grapheme clusters
16292 my $breakable_utf8 = my $breakable = chr(0xF7);
16293 utf8::upgrade($breakable_utf8);
16294
16295 # GCBTest.txt character that indicates that the adjoining code points are part
16296 # of the same grapheme cluster
16297 my $nobreak_utf8 = my $nobreak = chr(0xD7);
16298 utf8::upgrade($nobreak_utf8);
16299
16300 sub Test_X($) {
16301     # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
16302     # Each such line is a sequence of code points given by their hex numbers,
16303     # separated by the two characters defined just before this subroutine that
16304     # indicate that either there can or cannot be a break between the adjacent
16305     # code points.  If there isn't a break, that means the sequence forms an
16306     # extended grapheme cluster, which means that \X should match the whole
16307     # thing.  If there is a break, \X should stop there.  This is all
16308     # converted by this routine into a match:
16309     #   $string =~ /(\X)/,
16310     # Each \X should match the next cluster; and that is what is checked.
16311
16312     my $template = shift;
16313
16314     my $line   = (caller)[2];
16315
16316     # The line contains characters above the ASCII range, but in Latin1.  It
16317     # may or may not be in utf8, and if it is, it may or may not know it.  So,
16318     # convert these characters to 8 bits.  If knows is in utf8, simply
16319     # downgrade.
16320     if (utf8::is_utf8($template)) {
16321         utf8::downgrade($template);
16322     } else {
16323
16324         # Otherwise, if it is in utf8, but doesn't know it, the next lines
16325         # convert the two problematic characters to their 8-bit equivalents.
16326         # If it isn't in utf8, they don't harm anything.
16327         use bytes;
16328         $template =~ s/$nobreak_utf8/$nobreak/g;
16329         $template =~ s/$breakable_utf8/$breakable/g;
16330     }
16331
16332     # Get rid of the leading and trailing breakables
16333     $template =~ s/^ \s* $breakable \s* //x;
16334     $template =~ s/ \s* $breakable \s* $ //x;
16335
16336     # And no-breaks become just a space.
16337     $template =~ s/ \s* $nobreak \s* / /xg;
16338
16339     # Split the input into segments that are breakable between them.
16340     my @segments = split /\s*$breakable\s*/, $template;
16341
16342     my $string = "";
16343     my $display_string = "";
16344     my @should_match;
16345     my @should_display;
16346
16347     # Convert the code point sequence in each segment into a Perl string of
16348     # characters
16349     foreach my $segment (@segments) {
16350         my @code_points = split /\s+/, $segment;
16351         my $this_string = "";
16352         my $this_display = "";
16353         foreach my $code_point (@code_points) {
16354             $this_string .= latin1_to_native(chr(hex $code_point));
16355             $this_display .= "\\x{$code_point}";
16356         }
16357
16358         # The next cluster should match the string in this segment.
16359         push @should_match, $this_string;
16360         push @should_display, $this_display;
16361         $string .= $this_string;
16362         $display_string .= $this_display;
16363     }
16364
16365     # If a string can be represented in both non-ut8 and utf8, test both cases
16366     UPGRADE:
16367     for my $to_upgrade (0 .. 1) {
16368
16369         if ($to_upgrade) {
16370
16371             # If already in utf8, would just be a repeat
16372             next UPGRADE if utf8::is_utf8($string);
16373
16374             utf8::upgrade($string);
16375         }
16376
16377         # Finally, do the \X match.
16378         my @matches = $string =~ /(\X)/g;
16379
16380         # Look through each matched cluster to verify that it matches what we
16381         # expect.
16382         my $min = (@matches < @should_match) ? @matches : @should_match;
16383         for my $i (0 .. $min - 1) {
16384             $Tests++;
16385             if ($matches[$i] eq $should_match[$i]) {
16386                 print "ok $Tests - ";
16387                 if ($i == 0) {
16388                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
16389                 } else {
16390                     print "And \\X #", $i + 1,
16391                 }
16392                 print " correctly matched $should_display[$i]; line $line\n";
16393             } else {
16394                 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
16395                                                     unpack("U*", $matches[$i]));
16396                 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
16397                     $i + 1,
16398                     " should have matched $should_display[$i]",
16399                     " but instead matched $matches[$i]",
16400                     ".  Abandoning rest of line $line\n";
16401                 next UPGRADE;
16402             }
16403         }
16404
16405         # And the number of matches should equal the number of expected matches.
16406         $Tests++;
16407         if (@matches == @should_match) {
16408             print "ok $Tests - Nothing was left over; line $line\n";
16409         } else {
16410             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
16411         }
16412     }
16413
16414     return;
16415 }
16416
16417 sub Finished() {
16418     print "1..$Tests\n";
16419     exit($Fails ? -1 : 0);
16420 }
16421
16422 Error('\p{Script=InGreek}');    # Bug #69018
16423 Test_X("1100 $nobreak 1161");  # Bug #70940
16424 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
16425 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
16426 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726