lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # There was an attempt when this was first rewritten to make it 5.8
   8 # compatible, but that has now been abandoned, and newer constructs are used
   9 # as convenient.
  10
  11 # NOTE: this script can run quite slowly in older/slower systems.
  12 # It can also consume a lot of memory (128 MB or more), you may need
  13 # to raise your process resource limits (e.g. in bash, "ulimit -a"
  14 # to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
  15
  16 my $start_time;
  17 BEGIN { # Get the time the script started running; do it at compilation to
  18         # get it as close as possible
  19     $start_time= time;
  20 }
  21
  22 require 5.010_001;
  23 use strict;
  24 use warnings;
  25 use Carp;
  26 use Config;
  27 use File::Find;
  28 use File::Path;
  29 use File::Spec;
  30 use Text::Tabs;
  31 use re "/aa";
  32 use feature 'state';
  33
  34 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  35 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  36
  37 sub NON_ASCII_PLATFORM { ord("A") != 65 }
  38
  39 # When a new version of Unicode is published, unfortunately the algorithms for
  40 # dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
  41 # manually.  The changes may or may not be backward compatible with older
  42 # releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
  43 # changes, then come back here and set the variable below to what version the
  44 # code is expecting.  If a newer version of Unicode is being compiled than
  45 # expected, a warning will be generated.  If an older version is being
  46 # compiled, any bounds tests that fail in the generated test file (-maketest
  47 # option) will be marked as TODO.
  48 my $version_of_mk_invlist_bounds = v12.1.0;
  49
  50 ##########################################################################
  51 #
  52 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  53 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  54 # a pod file and .t files, depending on option parameters.
  55 #
  56 # The structure of this file is:
  57 #   First these introductory comments; then
  58 #   code needed for everywhere, such as debugging stuff; then
  59 #   code to handle input parameters; then
  60 #   data structures likely to be of external interest (some of which depend on
  61 #       the input parameters, so follows them; then
  62 #   more data structures and subroutine and package (class) definitions; then
  63 #   the small actual loop to process the input files and finish up; then
  64 #   a __DATA__ section, for the .t tests
  65 #
  66 # This program works on all releases of Unicode so far.  The outputs have been
  67 # scrutinized most intently for release 5.1.  The others have been checked for
  68 # somewhat more than just sanity.  It can handle all non-provisional Unicode
  69 # character properties in those releases.
  70 #
  71 # This program is mostly about Unicode character (or code point) properties.
  72 # A property describes some attribute or quality of a code point, like if it
  73 # is lowercase or not, its name, what version of Unicode it was first defined
  74 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  75 # possibilities by making all properties into mappings from each code point
  76 # into some corresponding value.  In the case of it being lowercase or not,
  77 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  78 # property maps each Unicode code point to a single value, called a "property
  79 # value".  (Some more recently defined properties, map a code point to a set
  80 # of values.)
  81 #
  82 # When using a property in a regular expression, what is desired isn't the
  83 # mapping of the code point to its property's value, but the reverse (or the
  84 # mathematical "inverse relation"): starting with the property value, "Does a
  85 # code point map to it?"  These are written in a "compound" form:
  86 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  87 # files containing the lists of code points that map to each such regular
  88 # expression property value, one file per list
  89 #
  90 # There is also a single form shortcut that Perl adds for many of the commonly
  91 # used properties.  This happens for all binary properties, plus script,
  92 # general_category, and block properties.
  93 #
  94 # Thus the outputs of this program are files.  There are map files, mostly in
  95 # the 'To' directory; and there are list files for use in regular expression
  96 # matching, all in subdirectories of the 'lib' directory, with each
  97 # subdirectory being named for the property that the lists in it are for.
  98 # Bookkeeping, test, and documentation files are also generated.
  99
 100 my $matches_directory = 'lib';   # Where match (\p{}) files go.
 101 my $map_directory = 'To';        # Where map files go.
 102
 103 # DATA STRUCTURES
 104 #
 105 # The major data structures of this program are Property, of course, but also
 106 # Table.  There are two kinds of tables, very similar to each other.
 107 # "Match_Table" is the data structure giving the list of code points that have
 108 # a particular property value, mentioned above.  There is also a "Map_Table"
 109 # data structure which gives the property's mapping from code point to value.
 110 # There are two structures because the match tables need to be combined in
 111 # various ways, such as constructing unions, intersections, complements, etc.,
 112 # and the map ones don't.  And there would be problems, perhaps subtle, if
 113 # a map table were inadvertently operated on in some of those ways.
 114 # The use of separate classes with operations defined on one but not the other
 115 # prevents accidentally confusing the two.
 116 #
 117 # At the heart of each table's data structure is a "Range_List", which is just
 118 # an ordered list of "Ranges", plus ancillary information, and methods to
 119 # operate on them.  A Range is a compact way to store property information.
 120 # Each range has a starting code point, an ending code point, and a value that
 121 # is meant to apply to all the code points between the two end points,
 122 # inclusive.  For a map table, this value is the property value for those
 123 # code points.  Two such ranges could be written like this:
 124 #   0x41 .. 0x5A, 'Upper',
 125 #   0x61 .. 0x7A, 'Lower'
 126 #
 127 # Each range also has a type used as a convenience to classify the values.
 128 # Most ranges in this program will be Type 0, or normal, but there are some
 129 # ranges that have a non-zero type.  These are used only in map tables, and
 130 # are for mappings that don't fit into the normal scheme of things.  Mappings
 131 # that require a hash entry to communicate with utf8.c are one example;
 132 # another example is mappings for charnames.pm to use which indicate a name
 133 # that is algorithmically determinable from its code point (and the reverse).
 134 # These are used to significantly compact these tables, instead of listing
 135 # each one of the tens of thousands individually.
 136 #
 137 # In a match table, the value of a range is irrelevant (and hence the type as
 138 # well, which will always be 0), and arbitrarily set to the empty string.
 139 # Using the example above, there would be two match tables for those two
 140 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 141 # named Lower would contain 0x61..0x7A.
 142 #
 143 # Actually, there are two types of range lists, "Range_Map" is the one
 144 # associated with map tables, and "Range_List" with match tables.
 145 # Again, this is so that methods can be defined on one and not the others so
 146 # as to prevent operating on them in incorrect ways.
 147 #
 148 # Eventually, most tables are written out to files to be read by Unicode::UCD.
 149 # All tables could in theory be written, but some are suppressed because there
 150 # is no current practical use for them.  It is easy to change which get
 151 # written by changing various lists that are near the top of the actual code
 152 # in this file.  The table data structures contain enough ancillary
 153 # information to allow them to be treated as separate entities for writing,
 154 # such as the path to each one's file.  There is a heading in each map table
 155 # that gives the format of its entries, and what the map is for all the code
 156 # points missing from it.  (This allows tables to be more compact.)
 157 #
 158 # The Property data structure contains one or more tables.  All properties
 159 # contain a map table (except the $perl property which is a
 160 # pseudo-property containing only match tables), and any properties that
 161 # are usable in regular expression matches also contain various matching
 162 # tables, one for each value the property can have.  A binary property can
 163 # have two values, True and False (or Y and N, which are preferred by Unicode
 164 # terminology).  Thus each of these properties will have a map table that
 165 # takes every code point and maps it to Y or N (but having ranges cuts the
 166 # number of entries in that table way down), and two match tables, one
 167 # which has a list of all the code points that map to Y, and one for all the
 168 # code points that map to N.  (For each binary property, a third table is also
 169 # generated for the pseudo Perl property.  It contains the identical code
 170 # points as the Y table, but can be written in regular expressions, not in the
 171 # compound form, but in a "single" form like \p{IsUppercase}.)  Many
 172 # properties are binary, but some properties have several possible values,
 173 # some have many, and properties like Name have a different value for every
 174 # named code point.  Those will not, unless the controlling lists are changed,
 175 # have their match tables written out.  But all the ones which can be used in
 176 # regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
 177 # a property would have either its map table or its match tables written but
 178 # not both.  Again, what gets written is controlled by lists which can easily
 179 # be changed.  Starting in 5.14, advantage was taken of this, and all the map
 180 # tables needed to reconstruct the Unicode db are now written out, while
 181 # suppressing the Unicode .txt files that contain the data.  Our tables are
 182 # much more compact than the .txt files, so a significant space savings was
 183 # achieved.  Also, tables are not written out that are trivially derivable
 184 # from tables that do get written.  So, there typically is no file containing
 185 # the code points not matched by a binary property (the table for \P{} versus
 186 # lowercase \p{}), since you just need to invert the True table to get the
 187 # False table.
 188
 189 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
 190 # how many match tables there are and the content of the maps.  This 'Type' is
 191 # different than a range 'Type', so don't get confused by the two concepts
 192 # having the same name.
 193 #
 194 # For information about the Unicode properties, see Unicode's UAX44 document:
 195
 196 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 197
 198 # As stated earlier, this program will work on any release of Unicode so far.
 199 # Most obvious problems in earlier data have NOT been corrected except when
 200 # necessary to make Perl or this program work reasonably, and to keep out
 201 # potential security issues.  For example, no folding information was given in
 202 # early releases, so this program substitutes lower case instead, just so that
 203 # a regular expression with the /i option will do something that actually
 204 # gives the right results in many cases.  There are also a couple other
 205 # corrections for version 1.1.5, commented at the point they are made.  As an
 206 # example of corrections that weren't made (but could be) is this statement
 207 # from DerivedAge.txt: "The supplementary private use code points and the
 208 # non-character code points were assigned in version 2.0, but not specifically
 209 # listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
 210 # it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
 211 # further down in these introductory comments.
 212 #
 213 # This program works on all non-provisional properties as of the current
 214 # Unicode release, though the files for some are suppressed for various
 215 # reasons.  You can change which are output by changing lists in this program.
 216 #
 217 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 218 # loose matchings rules (from Unicode TR18):
 219 #
 220 #    The recommended names for UCD properties and property values are in
 221 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 222 #    [PropValue]. There are both abbreviated names and longer, more
 223 #    descriptive names. It is strongly recommended that both names be
 224 #    recognized, and that loose matching of property names be used,
 225 #    whereby the case distinctions, whitespace, hyphens, and underbar
 226 #    are ignored.
 227 #
 228 # The program still allows Fuzzy to override its determination of if loose
 229 # matching should be used, but it isn't currently used, as it is no longer
 230 # needed; the calculations it makes are good enough.
 231 #
 232 # SUMMARY OF HOW IT WORKS:
 233 #
 234 #   Process arguments
 235 #
 236 #   A list is constructed containing each input file that is to be processed
 237 #
 238 #   Each file on the list is processed in a loop, using the associated handler
 239 #   code for each:
 240 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 241 #            first.  These files name the properties and property values.
 242 #            Objects are created of all the property and property value names
 243 #            that the rest of the input should expect, including all synonyms.
 244 #        The other input files give mappings from properties to property
 245 #           values.  That is, they list code points and say what the mapping
 246 #           is under the given property.  Some files give the mappings for
 247 #           just one property; and some for many.  This program goes through
 248 #           each file and populates the properties and their map tables from
 249 #           them.  Some properties are listed in more than one file, and
 250 #           Unicode has set up a precedence as to which has priority if there
 251 #           is a conflict.  Thus the order of processing matters, and this
 252 #           program handles the conflict possibility by processing the
 253 #           overriding input files last, so that if necessary they replace
 254 #           earlier values.
 255 #        After this is all done, the program creates the property mappings not
 256 #            furnished by Unicode, but derivable from what it does give.
 257 #        The tables of code points that match each property value in each
 258 #            property that is accessible by regular expressions are created.
 259 #        The Perl-defined properties are created and populated.  Many of these
 260 #            require data determined from the earlier steps
 261 #        Any Perl-defined synonyms are created, and name clashes between Perl
 262 #            and Unicode are reconciled and warned about.
 263 #        All the properties are written to files
 264 #        Any other files are written, and final warnings issued.
 265 #
 266 # For clarity, a number of operators have been overloaded to work on tables:
 267 #   ~ means invert (take all characters not in the set).  The more
 268 #       conventional '!' is not used because of the possibility of confusing
 269 #       it with the actual boolean operation.
 270 #   + means union
 271 #   - means subtraction
 272 #   & means intersection
 273 # The precedence of these is the order listed.  Parentheses should be
 274 # copiously used.  These are not a general scheme.  The operations aren't
 275 # defined for a number of things, deliberately, to avoid getting into trouble.
 276 # Operations are done on references and affect the underlying structures, so
 277 # that the copy constructors for them have been overloaded to not return a new
 278 # clone, but the input object itself.
 279 #
 280 # The bool operator is deliberately not overloaded to avoid confusion with
 281 # "should it mean if the object merely exists, or also is non-empty?".
 282 #
 283 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 284 #
 285 # This program needs to be able to run under miniperl.  Therefore, it uses a
 286 # minimum of other modules, and hence implements some things itself that could
 287 # be gotten from CPAN
 288 #
 289 # This program uses inputs published by the Unicode Consortium.  These can
 290 # change incompatibly between releases without the Perl maintainers realizing
 291 # it.  Therefore this program is now designed to try to flag these.  It looks
 292 # at the directories where the inputs are, and flags any unrecognized files.
 293 # It keeps track of all the properties in the files it handles, and flags any
 294 # that it doesn't know how to handle.  It also flags any input lines that
 295 # don't match the expected syntax, among other checks.
 296 #
 297 # It is also designed so if a new input file matches one of the known
 298 # templates, one hopefully just needs to add it to a list to have it
 299 # processed.
 300 #
 301 # As mentioned earlier, some properties are given in more than one file.  In
 302 # particular, the files in the extracted directory are supposedly just
 303 # reformattings of the others.  But they contain information not easily
 304 # derivable from the other files, including results for Unihan (which isn't
 305 # usually available to this program) and for unassigned code points.  They
 306 # also have historically had errors or been incomplete.  In an attempt to
 307 # create the best possible data, this program thus processes them first to
 308 # glean information missing from the other files; then processes those other
 309 # files to override any errors in the extracted ones.  Much of the design was
 310 # driven by this need to store things and then possibly override them.
 311 #
 312 # It tries to keep fatal errors to a minimum, to generate something usable for
 313 # testing purposes.  It always looks for files that could be inputs, and will
 314 # warn about any that it doesn't know how to handle (the -q option suppresses
 315 # the warning).
 316 #
 317 # Why is there more than one type of range?
 318 #   This simplified things.  There are some very specialized code points that
 319 #   have to be handled specially for output, such as Hangul syllable names.
 320 #   By creating a range type (done late in the development process), it
 321 #   allowed this to be stored with the range, and overridden by other input.
 322 #   Originally these were stored in another data structure, and it became a
 323 #   mess trying to decide if a second file that was for the same property was
 324 #   overriding the earlier one or not.
 325 #
 326 # Why are there two kinds of tables, match and map?
 327 #   (And there is a base class shared by the two as well.)  As stated above,
 328 #   they actually are for different things.  Development proceeded much more
 329 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 330 #   give the property value for every code point (actually every code point
 331 #   that doesn't map to a default value).  Match tables are used for regular
 332 #   expression matches, and are essentially the inverse mapping.  Separating
 333 #   the two allows more specialized methods, and error checks so that one
 334 #   can't just take the intersection of two map tables, for example, as that
 335 #   is nonsensical.
 336 #
 337 # What about 'fate' and 'status'.  The concept of a table's fate was created
 338 #   late when it became clear that something more was needed.  The difference
 339 #   between this and 'status' is unclean, and could be improved if someone
 340 #   wanted to spend the effort.
 341 #
 342 # DEBUGGING
 343 #
 344 # This program is written so it will run under miniperl.  Occasionally changes
 345 # will cause an error where the backtrace doesn't work well under miniperl.
 346 # To diagnose the problem, you can instead run it under regular perl, if you
 347 # have one compiled.
 348 #
 349 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 350 # to return true.  Then a line like
 351 #
 352 # local $to_trace = 1 if main::DEBUG;
 353 #
 354 # can be added to enable tracing in its lexical scope (plus dynamic) or until
 355 # you insert another line:
 356 #
 357 # local $to_trace = 0 if main::DEBUG;
 358 #
 359 # To actually trace, use a line like "trace $a, @b, %c, ...;
 360 #
 361 # Some of the more complex subroutines already have trace statements in them.
 362 # Permanent trace statements should be like:
 363 #
 364 # trace ... if main::DEBUG && $to_trace;
 365 #
 366 # main::stack_trace() will display what its name implies
 367 #
 368 # If there is just one or a few files that you're debugging, you can easily
 369 # cause most everything else to be skipped.  Change the line
 370 #
 371 # my $debug_skip = 0;
 372 #
 373 # to 1, and every file whose object is in @input_file_objects and doesn't have
 374 # a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
 375 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
 376 #
 377 # To compare the output tables, it may be useful to specify the -annotate
 378 # flag.  (As of this writing, this can't be done on a clean workspace, due to
 379 # requirements in Text::Tabs used in this option; so first run mktables
 380 # without this option.)  This option adds comment lines to each table, one for
 381 # each non-algorithmically named character giving, currently its code point,
 382 # name, and graphic representation if printable (and you have a font that
 383 # knows about it).  This makes it easier to see what the particular code
 384 # points are in each output table.  Non-named code points are annotated with a
 385 # description of their status, and contiguous ones with the same description
 386 # will be output as a range rather than individually.  Algorithmically named
 387 # characters are also output as ranges, except when there are just a few
 388 # contiguous ones.
 389 #
 390 # FUTURE ISSUES
 391 #
 392 # The program would break if Unicode were to change its names so that
 393 # interior white space, underscores, or dashes differences were significant
 394 # within property and property value names.
 395 #
 396 # It might be easier to use the xml versions of the UCD if this program ever
 397 # would need heavy revision, and the ability to handle old versions was not
 398 # required.
 399 #
 400 # There is the potential for name collisions, in that Perl has chosen names
 401 # that Unicode could decide it also likes.  There have been such collisions in
 402 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 403 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 404 # collisions, which were withdrawn before the final release, because of Perl's
 405 # and other's protests.  These all involved new properties which began with
 406 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 407 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 408 # Unicode document, so they are unlikely to be used by Unicode for another
 409 # purpose.  However, they might try something beginning with 'In', or use any
 410 # of the other Perl-defined properties.  This program will warn you of name
 411 # collisions, and refuse to generate tables with them, but manual intervention
 412 # will be required in this event.  One scheme that could be implemented, if
 413 # necessary, would be to have this program generate another file, or add a
 414 # field to mktables.lst that gives the date of first definition of a property.
 415 # Each new release of Unicode would use that file as a basis for the next
 416 # iteration.  And the Perl synonym addition code could sort based on the age
 417 # of the property, so older properties get priority, and newer ones that clash
 418 # would be refused; hence existing code would not be impacted, and some other
 419 # synonym would have to be used for the new property.  This is ugly, and
 420 # manual intervention would certainly be easier to do in the short run; lets
 421 # hope it never comes to this.
 422 #
 423 # A NOTE ON UNIHAN
 424 #
 425 # This program can generate tables from the Unihan database.  But that DB
 426 # isn't normally available, so it is marked as optional.  Prior to version
 427 # 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
 428 # was split into 8 different files, all beginning with the letters 'Unihan'.
 429 # If you plunk those files down into the directory mktables ($0) is in, this
 430 # program will read them and automatically create tables for the properties
 431 # from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
 432 # plus any you add to the @cjk_properties array and the @cjk_property_values
 433 # array, being sure to add necessary '# @missings' lines to the latter.  For
 434 # Unicode versions earlier than 5.2, most of the Unihan properties are not
 435 # listed at all in PropertyAliases nor PropValueAliases.  This program assumes
 436 # for these early releases that you want the properties that are specified in
 437 # the 5.2 release.
 438 #
 439 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 440 # and filter_unihan_line() are the functions where this is done.  This program
 441 # already does some adjusting to make the lines look more like the rest of the
 442 # Unicode DB;  You can see what that is in filter_unihan_line()
 443 #
 444 # There is a bug in the 3.2 data file in which some values for the
 445 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 446 # could be added to correct these; or for a particular installation, the
 447 # Unihan.txt file could be edited to fix them.
 448 #
 449 # HOW TO ADD A FILE TO BE PROCESSED
 450 #
 451 # A new file from Unicode needs to have an object constructed for it in
 452 # @input_file_objects, probably at the end or at the end of the extracted
 453 # ones.  The program should warn you if its name will clash with others on
 454 # restrictive file systems, like DOS.  If so, figure out a better name, and
 455 # add lines to the README.perl file giving that.  If the file is a character
 456 # property, it should be in the format that Unicode has implicitly
 457 # standardized for such files for the more recently introduced ones.
 458 # If so, the Input_file constructor for @input_file_objects can just be the
 459 # file name and release it first appeared in.  If not, then it should be
 460 # possible to construct an each_line_handler() to massage the line into the
 461 # standardized form.
 462 #
 463 # For non-character properties, more code will be needed.  You can look at
 464 # the existing entries for clues.
 465 #
 466 # UNICODE VERSIONS NOTES
 467 #
 468 # The Unicode UCD has had a number of errors in it over the versions.  And
 469 # these remain, by policy, in the standard for that version.  Therefore it is
 470 # risky to correct them, because code may be expecting the error.  So this
 471 # program doesn't generally make changes, unless the error breaks the Perl
 472 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 473 # for U+1105, which causes real problems for the algorithms for Jamo
 474 # calculations, so it is changed here.
 475 #
 476 # But it isn't so clear cut as to what to do about concepts that are
 477 # introduced in a later release; should they extend back to earlier releases
 478 # where the concept just didn't exist?  It was easier to do this than to not,
 479 # so that's what was done.  For example, the default value for code points not
 480 # in the files for various properties was probably undefined until changed by
 481 # some version.  No_Block for blocks is such an example.  This program will
 482 # assign No_Block even in Unicode versions that didn't have it.  This has the
 483 # benefit that code being written doesn't have to special case earlier
 484 # versions; and the detriment that it doesn't match the Standard precisely for
 485 # the affected versions.
 486 #
 487 # Here are some observations about some of the issues in early versions:
 488 #
 489 # Prior to version 3.0, there were 3 character decompositions.  These are not
 490 # handled by Unicode::Normalize, nor will it compile when presented a version
 491 # that has them.  However, you can trivially get it to compile by simply
 492 # ignoring those decompositions, by changing the croak to a carp.  At the time
 493 # of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
 494 # dist/Unicode-Normalize/mkheader) reads
 495 #
 496 #   croak("Weird Canonical Decomposition of U+$h");
 497 #
 498 # Simply comment it out.  It will compile, but will not know about any three
 499 # character decompositions.
 500
 501 # The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
 502 # that the reason is that the CJK block starting at 4E00 was removed from
 503 # PropList, and was not put back in until 3.1.0.  The Perl extension (the
 504 # single property name \p{alpha}) has the correct values.  But the compound
 505 # form is simply not generated until 3.1, as it can be argued that prior to
 506 # this release, this was not an official property.  The comments for
 507 # filter_old_style_proplist() give more details.
 508 #
 509 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 510 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 511 # reason is that 3.2 introduced U+205F=medium math space, which was not
 512 # classed as white space, but Perl figured out that it should have been. 4.0
 513 # reclassified it correctly.
 514 #
 515 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 516 # this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
 517 # became 202, and ATBL was left with no code points, as all the ones that
 518 # mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
 519 # name for the class, it would not have been affected, but if it used the
 520 # mnemonic, it would have been.
 521 #
 522 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
 523 # points which eventually came to have this script property value, instead
 524 # mapped to "Unknown".  But in the next release all these code points were
 525 # moved to \p{sc=common} instead.
 526
 527 # The tests furnished  by Unicode for testing WordBreak and SentenceBreak
 528 # generate errors in 5.0 and earlier.
 529 #
 530 # The default for missing code points for BidiClass is complicated.  Starting
 531 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 532 # tries to do the best it can for earlier releases.  It is done in
 533 # process_PropertyAliases()
 534 #
 535 # In version 2.1.2, the entry in UnicodeData.txt:
 536 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
 537 # should instead be
 538 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
 539 # Without this change, there are casing problems for this character.
 540 #
 541 # Search for $string_compare_versions to see how to compare changes to
 542 # properties between Unicode versions
 543 #
 544 ##############################################################################
 545
 546 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 547                         # and errors
 548 my $MAX_LINE_WIDTH = 78;
 549
 550 # Debugging aid to skip most files so as to not be distracted by them when
 551 # concentrating on the ones being debugged.  Add
 552 # non_skip => 1,
 553 # to the constructor for those files you want processed when you set this.
 554 # Files with a first version number of 0 are special: they are always
 555 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 556 # UnicodeData.txt must not be skipped if you want this program to not die
 557 # before normal completion.
 558 my $debug_skip = 0;
 559
 560
 561 # Normally these are suppressed.
 562 my $write_Unicode_deprecated_tables = 0;
 563
 564 # Set to 1 to enable tracing.
 565 our $to_trace = 0;
 566
 567 { # Closure for trace: debugging aid
 568     my $print_caller = 1;        # ? Include calling subroutine name
 569     my $main_with_colon = 'main::';
 570     my $main_colon_length = length($main_with_colon);
 571
 572     sub trace {
 573         return unless $to_trace;        # Do nothing if global flag not set
 574
 575         my @input = @_;
 576
 577         local $DB::trace = 0;
 578         $DB::trace = 0;          # Quiet 'used only once' message
 579
 580         my $line_number;
 581
 582         # Loop looking up the stack to get the first non-trace caller
 583         my $caller_line;
 584         my $caller_name;
 585         my $i = 0;
 586         do {
 587             $line_number = $caller_line;
 588             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 589             $caller = $main_with_colon unless defined $caller;
 590
 591             $caller_name = $caller;
 592
 593             # get rid of pkg
 594             $caller_name =~ s/.*:://;
 595             if (substr($caller_name, 0, $main_colon_length)
 596                 eq $main_with_colon)
 597             {
 598                 $caller_name = substr($caller_name, $main_colon_length);
 599             }
 600
 601         } until ($caller_name ne 'trace');
 602
 603         # If the stack was empty, we were called from the top level
 604         $caller_name = 'main' if ($caller_name eq ""
 605                                     || $caller_name eq 'trace');
 606
 607         my $output = "";
 608         #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 609         foreach my $string (@input) {
 610             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 611                 $output .= simple_dumper($string);
 612             }
 613             else {
 614                 $string = "$string" if ref $string;
 615                 $string = $UNDEF unless defined $string;
 616                 chomp $string;
 617                 $string = '""' if $string eq "";
 618                 $output .= " " if $output ne ""
 619                                 && $string ne ""
 620                                 && substr($output, -1, 1) ne " "
 621                                 && substr($string, 0, 1) ne " ";
 622                 $output .= $string;
 623             }
 624         }
 625
 626         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 627         print STDERR "$caller_name: " if $print_caller;
 628         print STDERR $output, "\n";
 629         return;
 630     }
 631 }
 632
 633 sub stack_trace() {
 634     local $to_trace = 1 if main::DEBUG;
 635     my $line = (caller(0))[2];
 636     my $i = 1;
 637
 638     # Accumulate the stack trace
 639     while (1) {
 640         my ($pkg, $file, $caller_line, $caller) = caller $i++;
 641
 642         last unless defined $caller;
 643
 644         trace "called from $caller() at line $line";
 645         $line = $caller_line;
 646     }
 647 }
 648
 649 # This is for a rarely used development feature that allows you to compare two
 650 # versions of the Unicode standard without having to deal with changes caused
 651 # by the code points introduced in the later version.  You probably also want
 652 # to use the -annotate option when using this.  Run this program on a unicore
 653 # containing the starting release you want to compare.  Save that output
 654 # structure.  Then, switching to a unicore with the ending release, change the
 655 # "" in the $string_compare_versions definition just below to a string
 656 # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
 657 # to the starting release.  This program will then compile, but throw away all
 658 # code points introduced after the starting release.  Finally use a diff tool
 659 # to compare the two directory structures.  They include only the code points
 660 # common to both releases, and you can see the changes caused just by the
 661 # underlying release semantic changes.  For versions earlier than 3.2, you
 662 # must copy a version of DAge.txt into the directory.
 663 my $string_compare_versions = DEBUG && "";
 664 my $compare_versions = DEBUG
 665                        && $string_compare_versions
 666                        && pack "C*", split /\./, $string_compare_versions;
 667
 668 sub uniques {
 669     # Returns non-duplicated input values.  From "Perl Best Practices:
 670     # Encapsulated Cleverness".  p. 455 in first edition.
 671
 672     my %seen;
 673     # Arguably this breaks encapsulation, if the goal is to permit multiple
 674     # distinct objects to stringify to the same value, and be interchangeable.
 675     # However, for this program, no two objects stringify identically, and all
 676     # lists passed to this function are either objects or strings. So this
 677     # doesn't affect correctness, but it does give a couple of percent speedup.
 678     no overloading;
 679     return grep { ! $seen{$_}++ } @_;
 680 }
 681
 682 $0 = File::Spec->canonpath($0);
 683
 684 my $make_test_script = 0;      # ? Should we output a test script
 685 my $make_norm_test_script = 0; # ? Should we output a normalization test script
 686 my $write_unchanged_files = 0; # ? Should we update the output files even if
 687                                #    we don't think they have changed
 688 my $use_directory = "";        # ? Should we chdir somewhere.
 689 my $pod_directory;             # input directory to store the pod file.
 690 my $pod_file = 'perluniprops';
 691 my $t_path;                     # Path to the .t test file
 692 my $file_list = 'mktables.lst'; # File to store input and output file names.
 693                                # This is used to speed up the build, by not
 694                                # executing the main body of the program if
 695                                # nothing on the list has changed since the
 696                                # previous build
 697 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 698                                # make a list so that when the pumpking is
 699                                # preparing a release, s/he won't have to do
 700                                # special things
 701 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 702                                # in the input.
 703 my $output_range_counts = $debugging_build;   # ? Should we include the number
 704                                               # of code points in ranges in
 705                                               # the output
 706 my $annotate = 0;              # ? Should character names be in the output
 707
 708 # Verbosity levels; 0 is quiet
 709 my $NORMAL_VERBOSITY = 1;
 710 my $PROGRESS = 2;
 711 my $VERBOSE = 3;
 712
 713 my $verbosity = $NORMAL_VERBOSITY;
 714
 715 # Stored in mktables.lst so that if this program is called with different
 716 # options, will regenerate even if the files otherwise look like they're
 717 # up-to-date.
 718 my $command_line_arguments = join " ", @ARGV;
 719
 720 # Process arguments
 721 while (@ARGV) {
 722     my $arg = shift @ARGV;
 723     if ($arg eq '-v') {
 724         $verbosity = $VERBOSE;
 725     }
 726     elsif ($arg eq '-p') {
 727         $verbosity = $PROGRESS;
 728         $| = 1;     # Flush buffers as we go.
 729     }
 730     elsif ($arg eq '-q') {
 731         $verbosity = 0;
 732     }
 733     elsif ($arg eq '-w') {
 734         # update the files even if they haven't changed
 735         $write_unchanged_files = 1;
 736     }
 737     elsif ($arg eq '-check') {
 738         my $this = shift @ARGV;
 739         my $ok = shift @ARGV;
 740         if ($this ne $ok) {
 741             print "Skipping as check params are not the same.\n";
 742             exit(0);
 743         }
 744     }
 745     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 746         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 747     }
 748     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 749     {
 750         $make_test_script = 1;
 751     }
 752     elsif ($arg eq '-makenormtest')
 753     {
 754         $make_norm_test_script = 1;
 755     }
 756     elsif ($arg eq '-makelist') {
 757         $make_list = 1;
 758     }
 759     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 760         -d $use_directory or croak "Unknown directory '$use_directory'";
 761     }
 762     elsif ($arg eq '-L') {
 763
 764         # Existence not tested until have chdir'd
 765         $file_list = shift;
 766     }
 767     elsif ($arg eq '-globlist') {
 768         $glob_list = 1;
 769     }
 770     elsif ($arg eq '-c') {
 771         $output_range_counts = ! $output_range_counts
 772     }
 773     elsif ($arg eq '-annotate') {
 774         $annotate = 1;
 775         $debugging_build = 1;
 776         $output_range_counts = 1;
 777     }
 778     else {
 779         my $with_c = 'with';
 780         $with_c .= 'out' if $output_range_counts;   # Complements the state
 781         croak <<END;
 782 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 783           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 784           [-check A B ]
 785   -c          : Output comments $with_c number of code points in ranges
 786   -q          : Quiet Mode: Only output serious warnings.
 787   -p          : Set verbosity level to normal plus show progress.
 788   -v          : Set Verbosity level high:  Show progress and non-serious
 789                 warnings
 790   -w          : Write files regardless
 791   -C dir      : Change to this directory before proceeding. All relative paths
 792                 except those specified by the -P and -T options will be done
 793                 with respect to this directory.
 794   -P dir      : Output $pod_file file to directory 'dir'.
 795   -T path     : Create a test script as 'path'; overrides -maketest
 796   -L filelist : Use alternate 'filelist' instead of standard one
 797   -globlist   : Take as input all non-Test *.txt files in current and sub
 798                 directories
 799   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 800                 overrides -T
 801   -makelist   : Rewrite the file list $file_list based on current setup
 802   -annotate   : Output an annotation for each character in the table files;
 803                 useful for debugging mktables, looking at diffs; but is slow
 804                 and memory intensive
 805   -check A B  : Executes $0 only if A and B are the same
 806 END
 807     }
 808 }
 809
 810 # Stores the most-recently changed file.  If none have changed, can skip the
 811 # build
 812 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 813
 814 # Change directories now, because need to read 'version' early.
 815 if ($use_directory) {
 816     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 817         $pod_directory = File::Spec->rel2abs($pod_directory);
 818     }
 819     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 820         $t_path = File::Spec->rel2abs($t_path);
 821     }
 822     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 823     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 824         $pod_directory = File::Spec->abs2rel($pod_directory);
 825     }
 826     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 827         $t_path = File::Spec->abs2rel($t_path);
 828     }
 829 }
 830
 831 # Get Unicode version into regular and v-string.  This is done now because
 832 # various tables below get populated based on it.  These tables are populated
 833 # here to be near the top of the file, and so easily seeable by those needing
 834 # to modify things.
 835 open my $VERSION, "<", "version"
 836                     or croak "$0: can't open required file 'version': $!\n";
 837 my $string_version = <$VERSION>;
 838 close $VERSION;
 839 chomp $string_version;
 840 my $v_version = pack "C*", split /\./, $string_version;        # v string
 841
 842 my $unicode_version = ($compare_versions)
 843                       ? (  "$string_compare_versions (using "
 844                          . "$string_version rules)")
 845                       : $string_version;
 846
 847 # The following are the complete names of properties with property values that
 848 # are known to not match any code points in some versions of Unicode, but that
 849 # may change in the future so they should be matchable, hence an empty file is
 850 # generated for them.
 851 my @tables_that_may_be_empty;
 852 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
 853                                                     if $v_version lt v6.3.0;
 854 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 855 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 856 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 857                                                     if $v_version ge v4.1.0;
 858 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 859                                                     if $v_version ge v6.0.0;
 860 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 861                                                     if $v_version ge v6.1.0;
 862 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
 863                                                     if $v_version ge v6.2.0;
 864
 865 # The lists below are hashes, so the key is the item in the list, and the
 866 # value is the reason why it is in the list.  This makes generation of
 867 # documentation easier.
 868
 869 my %why_suppressed;  # No file generated for these.
 870
 871 # Files aren't generated for empty extraneous properties.  This is arguable.
 872 # Extraneous properties generally come about because a property is no longer
 873 # used in a newer version of Unicode.  If we generated a file without code
 874 # points, programs that used to work on that property will still execute
 875 # without errors.  It just won't ever match (or will always match, with \P{}).
 876 # This means that the logic is now likely wrong.  I (khw) think its better to
 877 # find this out by getting an error message.  Just move them to the table
 878 # above to change this behavior
 879 my %why_suppress_if_empty_warn_if_not = (
 880
 881    # It is the only property that has ever officially been removed from the
 882    # Standard.  The database never contained any code points for it.
 883    'Special_Case_Condition' => 'Obsolete',
 884
 885    # Apparently never official, but there were code points in some versions of
 886    # old-style PropList.txt
 887    'Non_Break' => 'Obsolete',
 888 );
 889
 890 # These would normally go in the warn table just above, but they were changed
 891 # a long time before this program was written, so warnings about them are
 892 # moot.
 893 if ($v_version gt v3.2.0) {
 894     push @tables_that_may_be_empty,
 895                                 'Canonical_Combining_Class=Attached_Below_Left'
 896 }
 897
 898 # Obsoleted
 899 if ($v_version ge v11.0.0) {
 900     push @tables_that_may_be_empty, qw(
 901                                        Grapheme_Cluster_Break=E_Base
 902                                        Grapheme_Cluster_Break=E_Base_GAZ
 903                                        Grapheme_Cluster_Break=E_Modifier
 904                                        Grapheme_Cluster_Break=Glue_After_Zwj
 905                                        Word_Break=E_Base
 906                                        Word_Break=E_Base_GAZ
 907                                        Word_Break=E_Modifier
 908                                        Word_Break=Glue_After_Zwj);
 909 }
 910
 911 # Enum values for to_output_map() method in the Map_Table package. (0 is don't
 912 # output)
 913 my $EXTERNAL_MAP = 1;
 914 my $INTERNAL_MAP = 2;
 915 my $OUTPUT_ADJUSTED = 3;
 916
 917 # To override computed values for writing the map tables for these properties.
 918 # The default for enum map tables is to write them out, so that the Unicode
 919 # .txt files can be removed, but all the data to compute any property value
 920 # for any code point is available in a more compact form.
 921 my %global_to_output_map = (
 922     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 923     # get stuck supporting it if things change.  Since it is a STRING
 924     # property, it normally would be listed in the pod, but INTERNAL_MAP
 925     # suppresses that.
 926     Unicode_1_Name => $INTERNAL_MAP,
 927
 928     Present_In => 0,                # Suppress, as easily computed from Age
 929     Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
 930                                             # retained, but needed for
 931                                             # non-ASCII
 932
 933     # Suppress, as mapping can be found instead from the
 934     # Perl_Decomposition_Mapping file
 935     Decomposition_Type => 0,
 936 );
 937
 938 # There are several types of obsolete properties defined by Unicode.  These
 939 # must be hand-edited for every new Unicode release.
 940 my %why_deprecated;  # Generates a deprecated warning message if used.
 941 my %why_stabilized;  # Documentation only
 942 my %why_obsolete;    # Documentation only
 943
 944 {   # Closure
 945     my $simple = 'Perl uses the more complete version';
 946     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 947
 948     my $other_properties = 'other properties';
 949     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 950     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 951
 952     %why_deprecated = (
 953         'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 954         'Jamo_Short_Name' => $contributory,
 955         'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 956         'Other_Alphabetic' => $contributory,
 957         'Other_Default_Ignorable_Code_Point' => $contributory,
 958         'Other_Grapheme_Extend' => $contributory,
 959         'Other_ID_Continue' => $contributory,
 960         'Other_ID_Start' => $contributory,
 961         'Other_Lowercase' => $contributory,
 962         'Other_Math' => $contributory,
 963         'Other_Uppercase' => $contributory,
 964         'Expands_On_NFC' => $why_no_expand,
 965         'Expands_On_NFD' => $why_no_expand,
 966         'Expands_On_NFKC' => $why_no_expand,
 967         'Expands_On_NFKD' => $why_no_expand,
 968     );
 969
 970     %why_suppressed = (
 971         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 972         # contains the same information, but without the algorithmically
 973         # determinable Hangul syllables'.  This file is not published, so it's
 974         # existence is not noted in the comment.
 975         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
 976
 977         # Don't suppress ISO_Comment, as otherwise special handling is needed
 978         # to differentiate between it and gc=c, which can be written as 'isc',
 979         # which is the same characters as ISO_Comment's short name.
 980
 981         'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
 982
 983         'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
 984         'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
 985         'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
 986         'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
 987
 988         FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
 989     );
 990
 991     foreach my $property (
 992
 993             # The following are suppressed because they were made contributory
 994             # or deprecated by Unicode before Perl ever thought about
 995             # supporting them.
 996             'Jamo_Short_Name',
 997             'Grapheme_Link',
 998             'Expands_On_NFC',
 999             'Expands_On_NFD',
1000             'Expands_On_NFKC',
1001             'Expands_On_NFKD',
1002
1003             # The following are suppressed because they have been marked
1004             # as deprecated for a sufficient amount of time
1005             'Other_Alphabetic',
1006             'Other_Default_Ignorable_Code_Point',
1007             'Other_Grapheme_Extend',
1008             'Other_ID_Continue',
1009             'Other_ID_Start',
1010             'Other_Lowercase',
1011             'Other_Math',
1012             'Other_Uppercase',
1013     ) {
1014         $why_suppressed{$property} = $why_deprecated{$property};
1015     }
1016
1017     # Customize the message for all the 'Other_' properties
1018     foreach my $property (keys %why_deprecated) {
1019         next if (my $main_property = $property) !~ s/^Other_//;
1020         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1021     }
1022 }
1023
1024 if ($write_Unicode_deprecated_tables) {
1025     foreach my $property (keys %why_suppressed) {
1026         delete $why_suppressed{$property} if $property =~
1027                                                     / ^ Other | Grapheme /x;
1028     }
1029 }
1030
1031 if ($v_version ge 4.0.0) {
1032     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1033     if ($v_version ge 6.0.0) {
1034         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1035     }
1036 }
1037 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1038     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1039     if ($v_version ge 6.0.0) {
1040         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1041     }
1042 }
1043
1044 # Probably obsolete forever
1045 if ($v_version ge v4.1.0) {
1046     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1047 }
1048 if ($v_version ge v6.0.0) {
1049     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1050     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1051 }
1052
1053 # This program can create files for enumerated-like properties, such as
1054 # 'Numeric_Type'.  This file would be the same format as for a string
1055 # property, with a mapping from code point to its value, so you could look up,
1056 # for example, the script a code point is in.  But no one so far wants this
1057 # mapping, or they have found another way to get it since this is a new
1058 # feature.  So no file is generated except if it is in this list.
1059 my @output_mapped_properties = split "\n", <<END;
1060 END
1061
1062 # If you want more Unihan properties than the default, you need to add them to
1063 # these arrays.  Depending on the property type, @missing lines might have to
1064 # be added to the second array.  A sample entry would be (including the '#'):
1065 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1066 my @cjk_properties = split "\n", <<'END';
1067 END
1068 my @cjk_property_values = split "\n", <<'END';
1069 END
1070
1071 # The input files don't list every code point.  Those not listed are to be
1072 # defaulted to some value.  Below are hard-coded what those values are for
1073 # non-binary properties as of 5.1.  Starting in 5.0, there are
1074 # machine-parsable comment lines in the files that give the defaults; so this
1075 # list shouldn't have to be extended.  The claim is that all missing entries
1076 # for binary properties will default to 'N'.  Unicode tried to change that in
1077 # 5.2, but the beta period produced enough protest that they backed off.
1078 #
1079 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1080 # be in the form that it expects.  The others may be synonyms.
1081 my $CODE_POINT = '<code point>';
1082 my %default_mapping = (
1083     Age => "Unassigned",
1084     # Bidi_Class => Complicated; set in code
1085     Bidi_Mirroring_Glyph => "",
1086     Block => 'No_Block',
1087     Canonical_Combining_Class => 0,
1088     Case_Folding => $CODE_POINT,
1089     Decomposition_Mapping => $CODE_POINT,
1090     Decomposition_Type => 'None',
1091     East_Asian_Width => "Neutral",
1092     FC_NFKC_Closure => $CODE_POINT,
1093     General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1094     Grapheme_Cluster_Break => 'Other',
1095     Hangul_Syllable_Type => 'NA',
1096     ISO_Comment => "",
1097     Jamo_Short_Name => "",
1098     Joining_Group => "No_Joining_Group",
1099     # Joining_Type => Complicated; set in code
1100     kIICore => 'N',   #                       Is converted to binary
1101     #Line_Break => Complicated; set in code
1102     Lowercase_Mapping => $CODE_POINT,
1103     Name => "",
1104     Name_Alias => "",
1105     NFC_QC => 'Yes',
1106     NFD_QC => 'Yes',
1107     NFKC_QC => 'Yes',
1108     NFKD_QC => 'Yes',
1109     Numeric_Type => 'None',
1110     Numeric_Value => 'NaN',
1111     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1112     Sentence_Break => 'Other',
1113     Simple_Case_Folding => $CODE_POINT,
1114     Simple_Lowercase_Mapping => $CODE_POINT,
1115     Simple_Titlecase_Mapping => $CODE_POINT,
1116     Simple_Uppercase_Mapping => $CODE_POINT,
1117     Titlecase_Mapping => $CODE_POINT,
1118     Unicode_1_Name => "",
1119     Unicode_Radical_Stroke => "",
1120     Uppercase_Mapping => $CODE_POINT,
1121     Word_Break => 'Other',
1122 );
1123
1124 ### End of externally interesting definitions, except for @input_file_objects
1125
1126 my $HEADER=<<"EOF";
1127 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1128 # This file is machine-generated by $0 from the Unicode
1129 # database, Version $unicode_version.  Any changes made here will be lost!
1130 EOF
1131
1132 my $INTERNAL_ONLY_HEADER = <<"EOF";
1133
1134 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1135 # This file is for internal use by core Perl only.  The format and even the
1136 # name or existence of this file are subject to change without notice.  Don't
1137 # use it directly.  Use Unicode::UCD to access the Unicode character data
1138 # base.
1139 EOF
1140
1141 my $DEVELOPMENT_ONLY=<<"EOF";
1142 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1143 # This file contains information artificially constrained to code points
1144 # present in Unicode release $string_compare_versions.
1145 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1146 # not be used for production.
1147
1148 EOF
1149
1150 my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1151                                    ? "10FFFF"
1152                                    : "FFFF";
1153 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1154 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1155
1156 # We work with above-Unicode code points, up to IV_MAX, but we may want to use
1157 # sentinels above that number.  Therefore for internal use, we use a much
1158 # smaller number, translating it to IV_MAX only for output.  The exact number
1159 # is immaterial (all above-Unicode code points are treated exactly the same),
1160 # but the algorithm requires it to be at least
1161 # 2 * $MAX_UNICODE_CODEPOINTS + 1
1162 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1163 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1164 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1165
1166 my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1167
1168 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1169 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1170 # to decrease backtracking.  The first regex allows the code point to be at
1171 # the end of a word, but to work properly, the word shouldn't end with a valid
1172 # hex character.  The second one won't match a code point at the end of a
1173 # word, and doesn't have the run-on issue
1174 my $run_on_code_point_re =
1175             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1176 my $code_point_re = qr/\b$run_on_code_point_re/;
1177
1178 # This matches the beginning of the line in the Unicode DB files that give the
1179 # defaults for code points not listed (i.e., missing) in the file.  The code
1180 # depends on this ending with a semi-colon, so it can assume it is a valid
1181 # field when the line is split() by semi-colons
1182 my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1183
1184 # Property types.  Unicode has more types, but these are sufficient for our
1185 # purposes.
1186 my $UNKNOWN = -1;   # initialized to illegal value
1187 my $NON_STRING = 1; # Either binary or enum
1188 my $BINARY = 2;
1189 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1190                        # tables, additional true and false tables are
1191                        # generated so that false is anything matching the
1192                        # default value, and true is everything else.
1193 my $ENUM = 4;       # Include catalog
1194 my $STRING = 5;     # Anything else: string or misc
1195
1196 # Some input files have lines that give default values for code points not
1197 # contained in the file.  Sometimes these should be ignored.
1198 my $NO_DEFAULTS = 0;        # Must evaluate to false
1199 my $NOT_IGNORED = 1;
1200 my $IGNORED = 2;
1201
1202 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1203 # and will appear in the main body of the tables in the output files, but
1204 # there are other types of ranges as well, listed below, that are specially
1205 # handled.   There are pseudo-types as well that will never be stored as a
1206 # type, but will affect the calculation of the type.
1207
1208 # 0 is for normal, non-specials
1209 my $MULTI_CP = 1;           # Sequence of more than code point
1210 my $HANGUL_SYLLABLE = 2;
1211 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1212 my $NULL = 4;               # The map is to the null string; utf8.c can't
1213                             # handle these, nor is there an accepted syntax
1214                             # for them in \p{} constructs
1215 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1216                              # otherwise be $MULTI_CP type are instead type 0
1217
1218 # process_generic_property_file() can accept certain overrides in its input.
1219 # Each of these must begin AND end with $CMD_DELIM.
1220 my $CMD_DELIM = "\a";
1221 my $REPLACE_CMD = 'replace';    # Override the Replace
1222 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1223
1224 my $NO = 0;
1225 my $YES = 1;
1226
1227 # Values for the Replace argument to add_range.
1228 # $NO                      # Don't replace; add only the code points not
1229                            # already present.
1230 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1231                            # the comments at the subroutine definition.
1232 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1233 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1234                            # already there
1235 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1236                            # already there
1237 my $CROAK = 6;             # Die with an error if is already there
1238
1239 # Flags to give property statuses.  The phrases are to remind maintainers that
1240 # if the flag is changed, the indefinite article referring to it in the
1241 # documentation may need to be as well.
1242 my $NORMAL = "";
1243 my $DEPRECATED = 'D';
1244 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1245 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1246 my $DISCOURAGED = 'X';
1247 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1248 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1249 my $STRICTER = 'T';
1250 my $a_bold_stricter = "a 'B<$STRICTER>'";
1251 my $A_bold_stricter = "A 'B<$STRICTER>'";
1252 my $STABILIZED = 'S';
1253 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1254 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1255 my $OBSOLETE = 'O';
1256 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1257 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1258
1259 # Aliases can also have an extra status:
1260 my $INTERNAL_ALIAS = 'P';
1261
1262 my %status_past_participles = (
1263     $DISCOURAGED => 'discouraged',
1264     $STABILIZED => 'stabilized',
1265     $OBSOLETE => 'obsolete',
1266     $DEPRECATED => 'deprecated',
1267     $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1268 );
1269
1270 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1271 # externally documented.
1272 my $ORDINARY = 0;       # The normal fate.
1273 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1274                         # but there is a file written that can be used to
1275                         # reconstruct this table
1276 my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1277                         # for Perl's internal use only
1278 my $LEGACY_ONLY = 3;    # Like $INTERNAL_ONLY, but not actually used by Perl.
1279                         # Is for backwards compatibility for applications that
1280                         # read the file directly, so it's format is
1281                         # unchangeable.
1282 my $SUPPRESSED = 4;     # The file for this table is not written out, and as a
1283                         # result, we don't bother to do many computations on
1284                         # it.
1285 my $PLACEHOLDER = 5;    # Like $SUPPRESSED, but we go through all the
1286                         # computations anyway, as the values are needed for
1287                         # things to work.  This happens when we have Perl
1288                         # extensions that depend on Unicode tables that
1289                         # wouldn't normally be in a given Unicode version.
1290
1291 # The format of the values of the tables:
1292 my $EMPTY_FORMAT = "";
1293 my $BINARY_FORMAT = 'b';
1294 my $DECIMAL_FORMAT = 'd';
1295 my $FLOAT_FORMAT = 'f';
1296 my $INTEGER_FORMAT = 'i';
1297 my $HEX_FORMAT = 'x';
1298 my $RATIONAL_FORMAT = 'r';
1299 my $STRING_FORMAT = 's';
1300 my $ADJUST_FORMAT = 'a';
1301 my $HEX_ADJUST_FORMAT = 'ax';
1302 my $DECOMP_STRING_FORMAT = 'c';
1303 my $STRING_WHITE_SPACE_LIST = 'sw';
1304
1305 my %map_table_formats = (
1306     $BINARY_FORMAT => 'binary',
1307     $DECIMAL_FORMAT => 'single decimal digit',
1308     $FLOAT_FORMAT => 'floating point number',
1309     $INTEGER_FORMAT => 'integer',
1310     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1311     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1312     $STRING_FORMAT => 'string',
1313     $ADJUST_FORMAT => 'some entries need adjustment',
1314     $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1315     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1316     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1317 );
1318
1319 # Unicode didn't put such derived files in a separate directory at first.
1320 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1321 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1322 my $AUXILIARY = 'auxiliary';
1323
1324 # Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm
1325 my %loose_to_file_of;       # loosely maps table names to their respective
1326                             # files
1327 my %stricter_to_file_of;    # same; but for stricter mapping.
1328 my %loose_property_to_file_of; # Maps a loose property name to its map file
1329 my %strict_property_to_file_of; # Same, but strict
1330 my @inline_definitions = "V0"; # Each element gives a definition of a unique
1331                             # inversion list.  When a definition is inlined,
1332                             # its value in the hash it's in (one of the two
1333                             # defined just above) will include an index into
1334                             # this array.  The 0th element is initialized to
1335                             # the definition for a zero length inversion list
1336 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1337                             # in the hash %Unicode::UCD::SwashInfo
1338 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1339                              # their rational equivalent
1340 my %loose_property_name_of; # Loosely maps (non_string) property names to
1341                             # standard form
1342 my %strict_property_name_of; # Strictly maps (non_string) property names to
1343                             # standard form
1344 my %string_property_loose_to_name; # Same, for string properties.
1345 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1346                             # the property name in standard loose form, and
1347                             # 'value' is the default value for that property,
1348                             # also in standard loose form.
1349 my %loose_to_standard_value; # loosely maps table names to the canonical
1350                             # alias for them
1351 my %ambiguous_names;        # keys are alias names (in standard form) that
1352                             # have more than one possible meaning.
1353 my %combination_property;   # keys are alias names (in standard form) that
1354                             # have both a map table, and a binary one that
1355                             # yields true for all non-null maps.
1356 my %prop_aliases;           # Keys are standard property name; values are each
1357                             # one's aliases
1358 my %prop_value_aliases;     # Keys of top level are standard property name;
1359                             # values are keys to another hash,  Each one is
1360                             # one of the property's values, in standard form.
1361                             # The values are that prop-val's aliases.
1362 my %skipped_files;          # List of files that we skip
1363 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1364
1365 # Most properties are immune to caseless matching, otherwise you would get
1366 # nonsensical results, as properties are a function of a code point, not
1367 # everything that is caselessly equivalent to that code point.  For example,
1368 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1369 # be true because 's' and 'S' are equivalent caselessly.  However,
1370 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1371 # extend that concept to those very few properties that are like this.  Each
1372 # such property will match the full range caselessly.  They are hard-coded in
1373 # the program; it's not worth trying to make it general as it's extremely
1374 # unlikely that they will ever change.
1375 my %caseless_equivalent_to;
1376
1377 # This is the range of characters that were in Release 1 of Unicode, and
1378 # removed in Release 2 (replaced with the current Hangul syllables starting at
1379 # U+AC00).  The range was reused starting in Release 3 for other purposes.
1380 my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1381 my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1382
1383 # These constants names and values were taken from the Unicode standard,
1384 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1385 # syllables.  The '_string' versions are so generated tables can retain the
1386 # hex format, which is the more familiar value
1387 my $SBase_string = "0xAC00";
1388 my $SBase = CORE::hex $SBase_string;
1389 my $LBase_string = "0x1100";
1390 my $LBase = CORE::hex $LBase_string;
1391 my $VBase_string = "0x1161";
1392 my $VBase = CORE::hex $VBase_string;
1393 my $TBase_string = "0x11A7";
1394 my $TBase = CORE::hex $TBase_string;
1395 my $SCount = 11172;
1396 my $LCount = 19;
1397 my $VCount = 21;
1398 my $TCount = 28;
1399 my $NCount = $VCount * $TCount;
1400
1401 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1402 # with the above published constants.
1403 my %Jamo;
1404 my %Jamo_L;     # Leading consonants
1405 my %Jamo_V;     # Vowels
1406 my %Jamo_T;     # Trailing consonants
1407
1408 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1409 # The key is the base name of the code point, and the value is an
1410 # array giving all the ranges that use this base name.  Each range
1411 # is actually a hash giving the 'low' and 'high' values of it.
1412 my %names_ending_in_code_point;
1413 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1414                                         # removed from the names
1415 # Inverse mapping.  The list of ranges that have these kinds of
1416 # names.  Each element contains the low, high, and base names in an
1417 # anonymous hash.
1418 my @code_points_ending_in_code_point;
1419
1420 # To hold Unicode's normalization test suite
1421 my @normalization_tests;
1422
1423 # Boolean: does this Unicode version have the hangul syllables, and are we
1424 # writing out a table for them?
1425 my $has_hangul_syllables = 0;
1426
1427 # Does this Unicode version have code points whose names end in their
1428 # respective code points, and are we writing out a table for them?  0 for no;
1429 # otherwise points to first property that a table is needed for them, so that
1430 # if multiple tables are needed, we don't create duplicates
1431 my $needing_code_points_ending_in_code_point = 0;
1432
1433 my @backslash_X_tests;     # List of tests read in for testing \X
1434 my @LB_tests;              # List of tests read in for testing \b{lb}
1435 my @SB_tests;              # List of tests read in for testing \b{sb}
1436 my @WB_tests;              # List of tests read in for testing \b{wb}
1437 my @unhandled_properties;  # Will contain a list of properties found in
1438                            # the input that we didn't process.
1439 my @match_properties;      # Properties that have match tables, to be
1440                            # listed in the pod
1441 my @map_properties;        # Properties that get map files written
1442 my @named_sequences;       # NamedSequences.txt contents.
1443 my %potential_files;       # Generated list of all .txt files in the directory
1444                            # structure so we can warn if something is being
1445                            # ignored.
1446 my @missing_early_files;   # Generated list of absent files that we need to
1447                            # proceed in compiling this early Unicode version
1448 my @files_actually_output; # List of files we generated.
1449 my @more_Names;            # Some code point names are compound; this is used
1450                            # to store the extra components of them.
1451 my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
1452                            # point of a normalized floating point number
1453                            # needed to match before we consider it equivalent
1454                            # to a candidate rational
1455
1456 # These store references to certain commonly used property objects
1457 my $age;
1458 my $ccc;
1459 my $gc;
1460 my $perl;
1461 my $block;
1462 my $perl_charname;
1463 my $print;
1464 my $All;
1465 my $Assigned;   # All assigned characters in this Unicode release
1466 my $DI;         # Default_Ignorable_Code_Point property
1467 my $NChar;      # Noncharacter_Code_Point property
1468 my $script;
1469 my $scx;        # Script_Extensions property
1470
1471 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1472 my $has_In_conflicts = 0;
1473 my $has_Is_conflicts = 0;
1474
1475 sub internal_file_to_platform ($) {
1476     # Convert our file paths which have '/' separators to those of the
1477     # platform.
1478
1479     my $file = shift;
1480     return undef unless defined $file;
1481
1482     return File::Spec->join(split '/', $file);
1483 }
1484
1485 sub file_exists ($) {   # platform independent '-e'.  This program internally
1486                         # uses slash as a path separator.
1487     my $file = shift;
1488     return 0 if ! defined $file;
1489     return -e internal_file_to_platform($file);
1490 }
1491
1492 sub objaddr($) {
1493     # Returns the address of the blessed input object.
1494     # It doesn't check for blessedness because that would do a string eval
1495     # every call, and the program is structured so that this is never called
1496     # for a non-blessed object.
1497
1498     no overloading; # If overloaded, numifying below won't work.
1499
1500     # Numifying a ref gives its address.
1501     return pack 'J', $_[0];
1502 }
1503
1504 # These are used only if $annotate is true.
1505 # The entire range of Unicode characters is examined to populate these
1506 # after all the input has been processed.  But most can be skipped, as they
1507 # have the same descriptive phrases, such as being unassigned
1508 my @viacode;            # Contains the 1 million character names
1509 my @age;                # And their ages ("" if none)
1510 my @printable;          # boolean: And are those characters printable?
1511 my @annotate_char_type; # Contains a type of those characters, specifically
1512                         # for the purposes of annotation.
1513 my $annotate_ranges;    # A map of ranges of code points that have the same
1514                         # name for the purposes of annotation.  They map to the
1515                         # upper edge of the range, so that the end point can
1516                         # be immediately found.  This is used to skip ahead to
1517                         # the end of a range, and avoid processing each
1518                         # individual code point in it.
1519 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1520                                    # characters, but excluding those which are
1521                                    # also noncharacter code points
1522
1523 # The annotation types are an extension of the regular range types, though
1524 # some of the latter are folded into one.  Make the new types negative to
1525 # avoid conflicting with the regular types
1526 my $SURROGATE_TYPE = -1;
1527 my $UNASSIGNED_TYPE = -2;
1528 my $PRIVATE_USE_TYPE = -3;
1529 my $NONCHARACTER_TYPE = -4;
1530 my $CONTROL_TYPE = -5;
1531 my $ABOVE_UNICODE_TYPE = -6;
1532 my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1533
1534 sub populate_char_info ($) {
1535     # Used only with the $annotate option.  Populates the arrays with the
1536     # input code point's info that are needed for outputting more detailed
1537     # comments.  If calling context wants a return, it is the end point of
1538     # any contiguous range of characters that share essentially the same info
1539
1540     my $i = shift;
1541     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1542
1543     $viacode[$i] = $perl_charname->value_of($i) || "";
1544     $age[$i] = (defined $age)
1545                ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1546                   ? $age->value_of($i)
1547                   : "")
1548                : "";
1549
1550     # A character is generally printable if Unicode says it is,
1551     # but below we make sure that most Unicode general category 'C' types
1552     # aren't.
1553     $printable[$i] = $print->contains($i);
1554
1555     # But the characters in this range were removed in v2.0 and replaced by
1556     # different ones later.  Modern fonts will be for the replacement
1557     # characters, so suppress printing them.
1558     if (($v_version lt v2.0
1559          || ($compare_versions && $compare_versions lt v2.0))
1560         && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1561             && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1562     {
1563         $printable[$i] = 0;
1564     }
1565
1566     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1567
1568     # Only these two regular types are treated specially for annotations
1569     # purposes
1570     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1571                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1572
1573     # Give a generic name to all code points that don't have a real name.
1574     # We output ranges, if applicable, for these.  Also calculate the end
1575     # point of the range.
1576     my $end;
1577     if (! $viacode[$i]) {
1578         if ($i > $MAX_UNICODE_CODEPOINT) {
1579             $viacode[$i] = 'Above-Unicode';
1580             $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1581             $printable[$i] = 0;
1582             $end = $MAX_WORKING_CODEPOINT;
1583         }
1584         elsif ($gc-> table('Private_use')->contains($i)) {
1585             $viacode[$i] = 'Private Use';
1586             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1587             $printable[$i] = 0;
1588             $end = $gc->table('Private_Use')->containing_range($i)->end;
1589         }
1590         elsif ($NChar->contains($i)) {
1591             $viacode[$i] = 'Noncharacter';
1592             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1593             $printable[$i] = 0;
1594             $end = $NChar->containing_range($i)->end;
1595         }
1596         elsif ($gc-> table('Control')->contains($i)) {
1597             my $name_ref = property_ref('Name_Alias');
1598             $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1599             $viacode[$i] = (defined $name_ref)
1600                            ? $name_ref->value_of($i)
1601                            : 'Control';
1602             $annotate_char_type[$i] = $CONTROL_TYPE;
1603             $printable[$i] = 0;
1604         }
1605         elsif ($gc-> table('Unassigned')->contains($i)) {
1606             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1607             $printable[$i] = 0;
1608             $viacode[$i] = 'Unassigned';
1609
1610             if (defined $block) { # No blocks in earliest releases
1611                 $viacode[$i] .= ', block=' . $block-> value_of($i);
1612                 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1613
1614                 # Because we name the unassigned by the blocks they are in, it
1615                 # can't go past the end of that block, and it also can't go
1616                 # past the unassigned range it is in.  The special table makes
1617                 # sure that the non-characters, which are unassigned, are
1618                 # separated out.
1619                 $end = min($block->containing_range($i)->end,
1620                            $unassigned_sans_noncharacters->
1621                                                     containing_range($i)->end);
1622             }
1623             else {
1624                 $end = $i + 1;
1625                 while ($unassigned_sans_noncharacters->contains($end)) {
1626                     $end++;
1627                 }
1628                 $end--;
1629             }
1630         }
1631         elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1632             $viacode[$i] = 'Surrogate';
1633             $annotate_char_type[$i] = $SURROGATE_TYPE;
1634             $printable[$i] = 0;
1635             $end = $gc->table('Surrogate')->containing_range($i)->end;
1636         }
1637         else {
1638             Carp::my_carp_bug("Can't figure out how to annotate "
1639                               . sprintf("U+%04X", $i)
1640                               . ".  Proceeding anyway.");
1641             $viacode[$i] = 'UNKNOWN';
1642             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1643             $printable[$i] = 0;
1644         }
1645     }
1646
1647     # Here, has a name, but if it's one in which the code point number is
1648     # appended to the name, do that.
1649     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1650         $viacode[$i] .= sprintf("-%04X", $i);
1651
1652         my $limit = $perl_charname->containing_range($i)->end;
1653         if (defined $age) {
1654             # Do all these as groups of the same age, instead of individually,
1655             # because their names are so meaningless, and there are typically
1656             # large quantities of them.
1657             $end = $i + 1;
1658             while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1659                 $end++;
1660             }
1661             $end--;
1662         }
1663         else {
1664             $end = $limit;
1665         }
1666     }
1667
1668     # And here, has a name, but if it's a hangul syllable one, replace it with
1669     # the correct name from the Unicode algorithm
1670     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1671         use integer;
1672         my $SIndex = $i - $SBase;
1673         my $L = $LBase + $SIndex / $NCount;
1674         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1675         my $T = $TBase + $SIndex % $TCount;
1676         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1677         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1678         $end = $perl_charname->containing_range($i)->end;
1679     }
1680
1681     return if ! defined wantarray;
1682     return $i if ! defined $end;    # If not a range, return the input
1683
1684     # Save this whole range so can find the end point quickly
1685     $annotate_ranges->add_map($i, $end, $end);
1686
1687     return $end;
1688 }
1689
1690 # Commented code below should work on Perl 5.8.
1691 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1692 ## the native perl version of it (which is what would operate under miniperl)
1693 ## is extremely slow, as it does a string eval every call.
1694 #my $has_fast_scalar_util = $^X !~ /miniperl/
1695 #                            && defined eval "require Scalar::Util";
1696 #
1697 #sub objaddr($) {
1698 #    # Returns the address of the blessed input object.  Uses the XS version if
1699 #    # available.  It doesn't check for blessedness because that would do a
1700 #    # string eval every call, and the program is structured so that this is
1701 #    # never called for a non-blessed object.
1702 #
1703 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1704 #
1705 #    # Check at least that is a ref.
1706 #    my $pkg = ref($_[0]) or return undef;
1707 #
1708 #    # Change to a fake package to defeat any overloaded stringify
1709 #    bless $_[0], 'main::Fake';
1710 #
1711 #    # Numifying a ref gives its address.
1712 #    my $addr = pack 'J', $_[0];
1713 #
1714 #    # Return to original class
1715 #    bless $_[0], $pkg;
1716 #    return $addr;
1717 #}
1718
1719 sub max ($$) {
1720     my $a = shift;
1721     my $b = shift;
1722     return $a if $a >= $b;
1723     return $b;
1724 }
1725
1726 sub min ($$) {
1727     my $a = shift;
1728     my $b = shift;
1729     return $a if $a <= $b;
1730     return $b;
1731 }
1732
1733 sub clarify_number ($) {
1734     # This returns the input number with underscores inserted every 3 digits
1735     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1736     # checked.
1737
1738     my $number = shift;
1739     my $pos = length($number) - 3;
1740     return $number if $pos <= 1;
1741     while ($pos > 0) {
1742         substr($number, $pos, 0) = '_';
1743         $pos -= 3;
1744     }
1745     return $number;
1746 }
1747
1748 sub clarify_code_point_count ($) {
1749     # This is like clarify_number(), but the input is assumed to be a count of
1750     # code points, rather than a generic number.
1751
1752     my $append = "";
1753
1754     my $number = shift;
1755     if ($number > $MAX_UNICODE_CODEPOINTS) {
1756         $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1757         return "All above-Unicode code points" if $number == 0;
1758         $append = " + all above-Unicode code points";
1759     }
1760     return clarify_number($number) . $append;
1761 }
1762
1763 package Carp;
1764
1765 # These routines give a uniform treatment of messages in this program.  They
1766 # are placed in the Carp package to cause the stack trace to not include them,
1767 # although an alternative would be to use another package and set @CARP_NOT
1768 # for it.
1769
1770 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1771
1772 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1773 # and overload trying to load Scalar:Util under miniperl.  See
1774 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1775 undef $overload::VERSION;
1776
1777 sub my_carp {
1778     my $message = shift || "";
1779     my $nofold = shift || 0;
1780
1781     if ($message) {
1782         $message = main::join_lines($message);
1783         $message =~ s/^$0: *//;     # Remove initial program name
1784         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1785         $message = "\n$0: $message;";
1786
1787         # Fold the message with program name, semi-colon end punctuation
1788         # (which looks good with the message that carp appends to it), and a
1789         # hanging indent for continuation lines.
1790         $message = main::simple_fold($message, "", 4) unless $nofold;
1791         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1792                                     # appends is to the same line
1793     }
1794
1795     return $message if defined wantarray;   # If a caller just wants the msg
1796
1797     carp $message;
1798     return;
1799 }
1800
1801 sub my_carp_bug {
1802     # This is called when it is clear that the problem is caused by a bug in
1803     # this program.
1804
1805     my $message = shift;
1806     $message =~ s/^$0: *//;
1807     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1808     carp $message;
1809     return;
1810 }
1811
1812 sub carp_too_few_args {
1813     if (@_ != 2) {
1814         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1815         return;
1816     }
1817
1818     my $args_ref = shift;
1819     my $count = shift;
1820
1821     my_carp_bug("Need at least $count arguments to "
1822         . (caller 1)[3]
1823         . ".  Instead got: '"
1824         . join ', ', @$args_ref
1825         . "'.  No action taken.");
1826     return;
1827 }
1828
1829 sub carp_extra_args {
1830     my $args_ref = shift;
1831     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1832
1833     unless (ref $args_ref) {
1834         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1835         return;
1836     }
1837     my ($package, $file, $line) = caller;
1838     my $subroutine = (caller 1)[3];
1839
1840     my $list;
1841     if (ref $args_ref eq 'HASH') {
1842         foreach my $key (keys %$args_ref) {
1843             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1844         }
1845         $list = join ', ', each %{$args_ref};
1846     }
1847     elsif (ref $args_ref eq 'ARRAY') {
1848         foreach my $arg (@$args_ref) {
1849             $arg = $UNDEF unless defined $arg;
1850         }
1851         $list = join ', ', @$args_ref;
1852     }
1853     else {
1854         my_carp_bug("Can't cope with ref "
1855                 . ref($args_ref)
1856                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1857         return;
1858     }
1859
1860     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1861     return;
1862 }
1863
1864 package main;
1865
1866 { # Closure
1867
1868     # This program uses the inside-out method for objects, as recommended in
1869     # "Perl Best Practices".  (This is the best solution still, since this has
1870     # to run under miniperl.)  This closure aids in generating those.  There
1871     # are two routines.  setup_package() is called once per package to set
1872     # things up, and then set_access() is called for each hash representing a
1873     # field in the object.  These routines arrange for the object to be
1874     # properly destroyed when no longer used, and for standard accessor
1875     # functions to be generated.  If you need more complex accessors, just
1876     # write your own and leave those accesses out of the call to set_access().
1877     # More details below.
1878
1879     my %constructor_fields; # fields that are to be used in constructors; see
1880                             # below
1881
1882     # The values of this hash will be the package names as keys to other
1883     # hashes containing the name of each field in the package as keys, and
1884     # references to their respective hashes as values.
1885     my %package_fields;
1886
1887     sub setup_package {
1888         # Sets up the package, creating standard DESTROY and dump methods
1889         # (unless already defined).  The dump method is used in debugging by
1890         # simple_dumper().
1891         # The optional parameters are:
1892         #   a)  a reference to a hash, that gets populated by later
1893         #       set_access() calls with one of the accesses being
1894         #       'constructor'.  The caller can then refer to this, but it is
1895         #       not otherwise used by these two routines.
1896         #   b)  a reference to a callback routine to call during destruction
1897         #       of the object, before any fields are actually destroyed
1898
1899         my %args = @_;
1900         my $constructor_ref = delete $args{'Constructor_Fields'};
1901         my $destroy_callback = delete $args{'Destroy_Callback'};
1902         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1903
1904         my %fields;
1905         my $package = (caller)[0];
1906
1907         $package_fields{$package} = \%fields;
1908         $constructor_fields{$package} = $constructor_ref;
1909
1910         unless ($package->can('DESTROY')) {
1911             my $destroy_name = "${package}::DESTROY";
1912             no strict "refs";
1913
1914             # Use typeglob to give the anonymous subroutine the name we want
1915             *$destroy_name = sub {
1916                 my $self = shift;
1917                 my $addr = do { no overloading; pack 'J', $self; };
1918
1919                 $self->$destroy_callback if $destroy_callback;
1920                 foreach my $field (keys %{$package_fields{$package}}) {
1921                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1922                     delete $package_fields{$package}{$field}{$addr};
1923                 }
1924                 return;
1925             }
1926         }
1927
1928         unless ($package->can('dump')) {
1929             my $dump_name = "${package}::dump";
1930             no strict "refs";
1931             *$dump_name = sub {
1932                 my $self = shift;
1933                 return dump_inside_out($self, $package_fields{$package}, @_);
1934             }
1935         }
1936         return;
1937     }
1938
1939     sub set_access {
1940         # Arrange for the input field to be garbage collected when no longer
1941         # needed.  Also, creates standard accessor functions for the field
1942         # based on the optional parameters-- none if none of these parameters:
1943         #   'addable'    creates an 'add_NAME()' accessor function.
1944         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1945         #                function.
1946         #   'settable'   creates a 'set_NAME()' accessor function.
1947         #   'constructor' doesn't create an accessor function, but adds the
1948         #                field to the hash that was previously passed to
1949         #                setup_package();
1950         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1951         # 'add' etc. all mean 'addable'.
1952         # The read accessor function will work on both array and scalar
1953         # values.  If another accessor in the parameter list is 'a', the read
1954         # access assumes an array.  You can also force it to be array access
1955         # by specifying 'readable_array' instead of 'readable'
1956         #
1957         # A sort-of 'protected' access can be set-up by preceding the addable,
1958         # readable or settable with some initial portion of 'protected_' (but,
1959         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1960         # "protection" is only by convention.  All that happens is that the
1961         # accessor functions' names begin with an underscore.  So instead of
1962         # calling set_foo, the call is _set_foo.  (Real protection could be
1963         # accomplished by having a new subroutine, end_package, called at the
1964         # end of each package, and then storing the __LINE__ ranges and
1965         # checking them on every accessor.  But that is way overkill.)
1966
1967         # We create anonymous subroutines as the accessors and then use
1968         # typeglobs to assign them to the proper package and name
1969
1970         my $name = shift;   # Name of the field
1971         my $field = shift;  # Reference to the inside-out hash containing the
1972                             # field
1973
1974         my $package = (caller)[0];
1975
1976         if (! exists $package_fields{$package}) {
1977             croak "$0: Must call 'setup_package' before 'set_access'";
1978         }
1979
1980         # Stash the field so DESTROY can get it.
1981         $package_fields{$package}{$name} = $field;
1982
1983         # Remaining arguments are the accessors.  For each...
1984         foreach my $access (@_) {
1985             my $access = lc $access;
1986
1987             my $protected = "";
1988
1989             # Match the input as far as it goes.
1990             if ($access =~ /^(p[^_]*)_/) {
1991                 $protected = $1;
1992                 if (substr('protected_', 0, length $protected)
1993                     eq $protected)
1994                 {
1995
1996                     # Add 1 for the underscore not included in $protected
1997                     $access = substr($access, length($protected) + 1);
1998                     $protected = '_';
1999                 }
2000                 else {
2001                     $protected = "";
2002                 }
2003             }
2004
2005             if (substr('addable', 0, length $access) eq $access) {
2006                 my $subname = "${package}::${protected}add_$name";
2007                 no strict "refs";
2008
2009                 # add_ accessor.  Don't add if already there, which we
2010                 # determine using 'eq' for scalars and '==' otherwise.
2011                 *$subname = sub {
2012                     use strict "refs";
2013                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2014                     my $self = shift;
2015                     my $value = shift;
2016                     my $addr = do { no overloading; pack 'J', $self; };
2017                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2018                     if (ref $value) {
2019                         return if grep { $value == $_ } @{$field->{$addr}};
2020                     }
2021                     else {
2022                         return if grep { $value eq $_ } @{$field->{$addr}};
2023                     }
2024                     push @{$field->{$addr}}, $value;
2025                     return;
2026                 }
2027             }
2028             elsif (substr('constructor', 0, length $access) eq $access) {
2029                 if ($protected) {
2030                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
2031                 }
2032                 else {
2033                     $constructor_fields{$package}{$name} = $field;
2034                 }
2035             }
2036             elsif (substr('readable_array', 0, length $access) eq $access) {
2037
2038                 # Here has read access.  If one of the other parameters for
2039                 # access is array, or this one specifies array (by being more
2040                 # than just 'readable_'), then create a subroutine that
2041                 # assumes the data is an array.  Otherwise just a scalar
2042                 my $subname = "${package}::${protected}$name";
2043                 if (grep { /^a/i } @_
2044                     or length($access) > length('readable_'))
2045                 {
2046                     no strict "refs";
2047                     *$subname = sub {
2048                         use strict "refs";
2049                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2050                         my $addr = do { no overloading; pack 'J', $_[0]; };
2051                         if (ref $field->{$addr} ne 'ARRAY') {
2052                             my $type = ref $field->{$addr};
2053                             $type = 'scalar' unless $type;
2054                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
2055                             return;
2056                         }
2057                         return scalar @{$field->{$addr}} unless wantarray;
2058
2059                         # Make a copy; had problems with caller modifying the
2060                         # original otherwise
2061                         my @return = @{$field->{$addr}};
2062                         return @return;
2063                     }
2064                 }
2065                 else {
2066
2067                     # Here not an array value, a simpler function.
2068                     no strict "refs";
2069                     *$subname = sub {
2070                         use strict "refs";
2071                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2072                         no overloading;
2073                         return $field->{pack 'J', $_[0]};
2074                     }
2075                 }
2076             }
2077             elsif (substr('settable', 0, length $access) eq $access) {
2078                 my $subname = "${package}::${protected}set_$name";
2079                 no strict "refs";
2080                 *$subname = sub {
2081                     use strict "refs";
2082                     if (main::DEBUG) {
2083                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2084                         Carp::carp_extra_args(\@_) if @_ > 2;
2085                     }
2086                     # $self is $_[0]; $value is $_[1]
2087                     no overloading;
2088                     $field->{pack 'J', $_[0]} = $_[1];
2089                     return;
2090                 }
2091             }
2092             else {
2093                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2094             }
2095         }
2096         return;
2097     }
2098 }
2099
2100 package Input_file;
2101
2102 # All input files use this object, which stores various attributes about them,
2103 # and provides for convenient, uniform handling.  The run method wraps the
2104 # processing.  It handles all the bookkeeping of opening, reading, and closing
2105 # the file, returning only significant input lines.
2106 #
2107 # Each object gets a handler which processes the body of the file, and is
2108 # called by run().  All character property files must use the generic,
2109 # default handler, which has code scrubbed to handle things you might not
2110 # expect, including automatic EBCDIC handling.  For files that don't deal with
2111 # mapping code points to a property value, such as test files,
2112 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2113 # handler to be a custom one.  Such a handler should basically be a
2114 # while(next_line()) {...} loop.
2115 #
2116 # You can also set up handlers to
2117 #   0) call during object construction time, after everything else is done
2118 #   1) call before the first line is read, for pre processing
2119 #   2) call to adjust each line of the input before the main handler gets
2120 #      them.  This can be automatically generated, if appropriately simple
2121 #      enough, by specifying a Properties parameter in the constructor.
2122 #   3) call upon EOF before the main handler exits its loop
2123 #   4) call at the end, for post processing
2124 #
2125 # $_ is used to store the input line, and is to be filtered by the
2126 # each_line_handler()s.  So, if the format of the line is not in the desired
2127 # format for the main handler, these are used to do that adjusting.  They can
2128 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2129 # so the $_ output of one is used as the input to the next.  The EOF handler
2130 # is also stackable, but none of the others are, but could easily be changed
2131 # to be so.
2132 #
2133 # Some properties are used by the Perl core but aren't defined until later
2134 # Unicode releases.  The perl interpreter would have problems working when
2135 # compiled with an earlier Unicode version that doesn't have them, so we need
2136 # to define them somehow for those releases.  The 'Early' constructor
2137 # parameter can be used to automatically handle this.  It is essentially
2138 # ignored if the Unicode version being compiled has a data file for this
2139 # property.  Either code to execute or a file to read can be specified.
2140 # Details are at the %early definition.
2141 #
2142 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2143 # which insert the parameters as lines to be processed before the next input
2144 # file line is read.  This allows the EOF handler(s) to flush buffers, for
2145 # example.  The difference between the two routines is that the lines inserted
2146 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
2147 # called it from such a handler, you would get infinite recursion without some
2148 # mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2149 # directly to the main handler without any adjustments.  If the
2150 # post-processing handler calls any of these, there will be no effect.  Some
2151 # error checking for these conditions could be added, but it hasn't been done.
2152 #
2153 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2154 # to prevent further processing of the line.  This routine will output the
2155 # message as a warning once, and then keep a count of the lines that have the
2156 # same message, and output that count at the end of the file's processing.
2157 # This keeps the number of messages down to a manageable amount.
2158 #
2159 # get_missings() should be called to retrieve any @missing input lines.
2160 # Messages will be raised if this isn't done if the options aren't to ignore
2161 # missings.
2162
2163 sub trace { return main::trace(@_); }
2164
2165 { # Closure
2166     # Keep track of fields that are to be put into the constructor.
2167     my %constructor_fields;
2168
2169     main::setup_package(Constructor_Fields => \%constructor_fields);
2170
2171     my %file; # Input file name, required
2172     main::set_access('file', \%file, qw{ c r });
2173
2174     my %first_released; # Unicode version file was first released in, required
2175     main::set_access('first_released', \%first_released, qw{ c r });
2176
2177     my %handler;    # Subroutine to process the input file, defaults to
2178                     # 'process_generic_property_file'
2179     main::set_access('handler', \%handler, qw{ c });
2180
2181     my %property;
2182     # name of property this file is for.  defaults to none, meaning not
2183     # applicable, or is otherwise determinable, for example, from each line.
2184     main::set_access('property', \%property, qw{ c r });
2185
2186     my %optional;
2187     # This is either an unsigned number, or a list of property names.  In the
2188     # former case, if it is non-zero, it means the file is optional, so if the
2189     # file is absent, no warning about that is output.  In the latter case, it
2190     # is a list of properties that the file (exclusively) defines.  If the
2191     # file is present, tables for those properties will be produced; if
2192     # absent, none will, even if they are listed elsewhere (namely
2193     # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2194     # and no warnings will be raised about them not being available.  (And no
2195     # warning about the file itself will be raised.)
2196     main::set_access('optional', \%optional, qw{ c readable_array } );
2197
2198     my %non_skip;
2199     # This is used for debugging, to skip processing of all but a few input
2200     # files.  Add 'non_skip => 1' to the constructor for those files you want
2201     # processed when you set the $debug_skip global.
2202     main::set_access('non_skip', \%non_skip, 'c');
2203
2204     my %skip;
2205     # This is used to skip processing of this input file (semi-) permanently.
2206     # The value should be the reason the file is being skipped.  It is used
2207     # for files that we aren't planning to process anytime soon, but want to
2208     # allow to be in the directory and be checked for their names not
2209     # conflicting with any other files on a DOS 8.3 name filesystem, but to
2210     # not otherwise be processed, and to not raise a warning about not being
2211     # handled.  In the constructor call, any value that evaluates to a numeric
2212     # 0 or undef means don't skip.  Any other value is a string giving the
2213     # reason it is being skipped, and this will appear in generated pod.
2214     # However, an empty string reason will suppress the pod entry.
2215     # Internally, calls that evaluate to numeric 0 are changed into undef to
2216     # distinguish them from an empty string call.
2217     main::set_access('skip', \%skip, 'c', 'r');
2218
2219     my %each_line_handler;
2220     # list of subroutines to look at and filter each non-comment line in the
2221     # file.  defaults to none.  The subroutines are called in order, each is
2222     # to adjust $_ for the next one, and the final one adjusts it for
2223     # 'handler'
2224     main::set_access('each_line_handler', \%each_line_handler, 'c');
2225
2226     my %retain_trailing_comments;
2227     # This is used to not discard the comments that end data lines.  This
2228     # would be used only for files with non-typical syntax, and most code here
2229     # assumes that comments have been stripped, so special handlers would have
2230     # to be written.  It is assumed that the code will use these in
2231     # single-quoted contexts, and so any "'" marks in the comment will be
2232     # prefixed by a backslash.
2233     main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2234
2235     my %properties; # Optional ordered list of the properties that occur in each
2236     # meaningful line of the input file.  If present, an appropriate
2237     # each_line_handler() is automatically generated and pushed onto the stack
2238     # of such handlers.  This is useful when a file contains multiple
2239     # properties per line, but no other special considerations are necessary.
2240     # The special value "<ignored>" means to discard the corresponding input
2241     # field.
2242     # Any @missing lines in the file should also match this syntax; no such
2243     # files exist as of 6.3.  But if it happens in a future release, the code
2244     # could be expanded to properly parse them.
2245     main::set_access('properties', \%properties, qw{ c r });
2246
2247     my %has_missings_defaults;
2248     # ? Are there lines in the file giving default values for code points
2249     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2250     # the norm, but IGNORED means it has such lines, but the handler doesn't
2251     # use them.  Having these three states allows us to catch changes to the
2252     # UCD that this program should track.  XXX This could be expanded to
2253     # specify the syntax for such lines, like %properties above.
2254     main::set_access('has_missings_defaults',
2255                                         \%has_missings_defaults, qw{ c r });
2256
2257     my %construction_time_handler;
2258     # Subroutine to call at the end of the new method.  If undef, no such
2259     # handler is called.
2260     main::set_access('construction_time_handler',
2261                                         \%construction_time_handler, qw{ c });
2262
2263     my %pre_handler;
2264     # Subroutine to call before doing anything else in the file.  If undef, no
2265     # such handler is called.
2266     main::set_access('pre_handler', \%pre_handler, qw{ c });
2267
2268     my %eof_handler;
2269     # Subroutines to call upon getting an EOF on the input file, but before
2270     # that is returned to the main handler.  This is to allow buffers to be
2271     # flushed.  The handler is expected to call insert_lines() or
2272     # insert_adjusted() with the buffered material
2273     main::set_access('eof_handler', \%eof_handler, qw{ c });
2274
2275     my %post_handler;
2276     # Subroutine to call after all the lines of the file are read in and
2277     # processed.  If undef, no such handler is called.  Note that this cannot
2278     # add lines to be processed; instead use eof_handler
2279     main::set_access('post_handler', \%post_handler, qw{ c });
2280
2281     my %progress_message;
2282     # Message to print to display progress in lieu of the standard one
2283     main::set_access('progress_message', \%progress_message, qw{ c });
2284
2285     my %handle;
2286     # cache open file handle, internal.  Is undef if file hasn't been
2287     # processed at all, empty if has;
2288     main::set_access('handle', \%handle);
2289
2290     my %added_lines;
2291     # cache of lines added virtually to the file, internal
2292     main::set_access('added_lines', \%added_lines);
2293
2294     my %remapped_lines;
2295     # cache of lines added virtually to the file, internal
2296     main::set_access('remapped_lines', \%remapped_lines);
2297
2298     my %errors;
2299     # cache of errors found, internal
2300     main::set_access('errors', \%errors);
2301
2302     my %missings;
2303     # storage of '@missing' defaults lines
2304     main::set_access('missings', \%missings);
2305
2306     my %early;
2307     # Used for properties that must be defined (for Perl's purposes) on
2308     # versions of Unicode earlier than Unicode itself defines them.  The
2309     # parameter is an array (it would be better to be a hash, but not worth
2310     # bothering about due to its rare use).
2311     #
2312     # The first element is either a code reference to call when in a release
2313     # earlier than the Unicode file is available in, or it is an alternate
2314     # file to use instead of the non-existent one.  This file must have been
2315     # plunked down in the same directory as mktables.  Should you be compiling
2316     # on a release that needs such a file, mktables will abort the
2317     # compilation, and tell you where to get the necessary file(s), and what
2318     # name(s) to use to store them as.
2319     # In the case of specifying an alternate file, the array must contain two
2320     # further elements:
2321     #
2322     # [1] is the name of the property that will be generated by this file.
2323     # The class automatically takes the input file and excludes any code
2324     # points in it that were not assigned in the Unicode version being
2325     # compiled.  It then uses this result to define the property in the given
2326     # version.  Since the property doesn't actually exist in the Unicode
2327     # version being compiled, this should be a name accessible only by core
2328     # perl.  If it is the same name as the regular property, the constructor
2329     # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2330     # get output, and so will be unusable by non-core code.  Otherwise it gets
2331     # marked as $INTERNAL_ONLY.
2332     #
2333     # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2334     # the Hangul syllables in that release (which were ripped out in version
2335     # 2) for the given property .  (Hence it is ignored except when compiling
2336     # version 1.  You only get one value that applies to all of them, which
2337     # may not be the actual reality, but probably nobody cares anyway for
2338     # these obsolete characters.)
2339     #
2340     # [3] if present is the default value for the property to assign for code
2341     # points not given in the input.  If not present, the default from the
2342     # normal property is used
2343     #
2344     # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2345     # it means to not add the name in [1] as an alias to the property name
2346     # used for these.  Normally, when compiling Unicode versions that don't
2347     # invoke the early handling, the name is added as a synonym.
2348     #
2349     # Not all files can be handled in the above way, and so the code ref
2350     # alternative is available.  It can do whatever it needs to.  The other
2351     # array elements are optional in this case, and the code is free to use or
2352     # ignore them if they are present.
2353     #
2354     # Internally, the constructor unshifts a 0 or 1 onto this array to
2355     # indicate if an early alternative is actually being used or not.  This
2356     # makes for easier testing later on.
2357     main::set_access('early', \%early, 'c');
2358
2359     my %only_early;
2360     main::set_access('only_early', \%only_early, 'c');
2361
2362     my %required_even_in_debug_skip;
2363     # debug_skip is used to speed up compilation during debugging by skipping
2364     # processing files that are not needed for the task at hand.  However,
2365     # some files pretty much can never be skipped, and this is used to specify
2366     # that this is one of them.  In order to skip this file, the call to the
2367     # constructor must be edited to comment out this parameter.
2368     main::set_access('required_even_in_debug_skip',
2369                      \%required_even_in_debug_skip, 'c');
2370
2371     my %withdrawn;
2372     # Some files get removed from the Unicode DB.  This is a version object
2373     # giving the first release without this file.
2374     main::set_access('withdrawn', \%withdrawn, 'c');
2375
2376     my %in_this_release;
2377     # Calculated value from %first_released and %withdrawn.  Are we compiling
2378     # a Unicode release which includes this file?
2379     main::set_access('in_this_release', \%in_this_release);
2380
2381     sub _next_line;
2382     sub _next_line_with_remapped_range;
2383
2384     sub new {
2385         my $class = shift;
2386
2387         my $self = bless \do{ my $anonymous_scalar }, $class;
2388         my $addr = do { no overloading; pack 'J', $self; };
2389
2390         # Set defaults
2391         $handler{$addr} = \&main::process_generic_property_file;
2392         $retain_trailing_comments{$addr} = 0;
2393         $non_skip{$addr} = 0;
2394         $skip{$addr} = undef;
2395         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2396         $handle{$addr} = undef;
2397         $added_lines{$addr} = [ ];
2398         $remapped_lines{$addr} = [ ];
2399         $each_line_handler{$addr} = [ ];
2400         $eof_handler{$addr} = [ ];
2401         $errors{$addr} = { };
2402         $missings{$addr} = [ ];
2403         $early{$addr} = [ ];
2404         $optional{$addr} = [ ];
2405
2406         # Two positional parameters.
2407         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2408         $file{$addr} = main::internal_file_to_platform(shift);
2409         $first_released{$addr} = shift;
2410
2411         # The rest of the arguments are key => value pairs
2412         # %constructor_fields has been set up earlier to list all possible
2413         # ones.  Either set or push, depending on how the default has been set
2414         # up just above.
2415         my %args = @_;
2416         foreach my $key (keys %args) {
2417             my $argument = $args{$key};
2418
2419             # Note that the fields are the lower case of the constructor keys
2420             my $hash = $constructor_fields{lc $key};
2421             if (! defined $hash) {
2422                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2423                 next;
2424             }
2425             if (ref $hash->{$addr} eq 'ARRAY') {
2426                 if (ref $argument eq 'ARRAY') {
2427                     foreach my $argument (@{$argument}) {
2428                         next if ! defined $argument;
2429                         push @{$hash->{$addr}}, $argument;
2430                     }
2431                 }
2432                 else {
2433                     push @{$hash->{$addr}}, $argument if defined $argument;
2434                 }
2435             }
2436             else {
2437                 $hash->{$addr} = $argument;
2438             }
2439             delete $args{$key};
2440         };
2441
2442         $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2443
2444         # Convert 0 (meaning don't skip) to undef
2445         undef $skip{$addr} unless $skip{$addr};
2446
2447         # Handle the case where this file is optional
2448         my $pod_message_for_non_existent_optional = "";
2449         if ($optional{$addr}->@*) {
2450
2451             # First element is the pod message
2452             $pod_message_for_non_existent_optional
2453                                                 = shift $optional{$addr}->@*;
2454             # Convert a 0 'Optional' argument to an empty list to make later
2455             # code more concise.
2456             if (   $optional{$addr}->@*
2457                 && $optional{$addr}->@* == 1
2458                 && $optional{$addr}[0] ne ""
2459                 && $optional{$addr}[0] !~ /\D/
2460                 && $optional{$addr}[0] == 0)
2461             {
2462                 $optional{$addr} = [ ];
2463             }
2464             else {  # But if the only element doesn't evaluate to 0, make sure
2465                     # that this file is indeed considered optional below.
2466                 unshift $optional{$addr}->@*, 1;
2467             }
2468         }
2469
2470         my $progress;
2471         my $function_instead_of_file = 0;
2472
2473         if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2474             $only_early{$addr} = 1;
2475             pop $early{$addr}->@*;
2476         }
2477
2478         # If we are compiling a Unicode release earlier than the file became
2479         # available, the constructor may have supplied a substitute
2480         if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2481
2482             # Yes, we have a substitute, that we will use; mark it so
2483             unshift $early{$addr}->@*, 1;
2484
2485             # See the definition of %early for what the array elements mean.
2486             # Note that we have just unshifted onto the array, so the numbers
2487             # below are +1 of those in the %early description.
2488             # If we have a property this defines, create a table and default
2489             # map for it now (at essentially compile time), so that it will be
2490             # available for the whole of run time.  (We will want to add this
2491             # name as an alias when we are using the official property name;
2492             # but this must be deferred until run(), because at construction
2493             # time the official names have yet to be defined.)
2494             if ($early{$addr}[2]) {
2495                 my $fate = ($property{$addr}
2496                             && $property{$addr} eq $early{$addr}[2])
2497                           ? $PLACEHOLDER
2498                           : $INTERNAL_ONLY;
2499                 my $prop_object = Property->new($early{$addr}[2],
2500                                                 Fate => $fate,
2501                                                 Perl_Extension => 1,
2502                                                 );
2503
2504                 # If not specified by the constructor, use the default mapping
2505                 # for the regular property for this substitute one.
2506                 if ($early{$addr}[4]) {
2507                     $prop_object->set_default_map($early{$addr}[4]);
2508                 }
2509                 elsif (    defined $property{$addr}
2510                        &&  defined $default_mapping{$property{$addr}})
2511                 {
2512                     $prop_object
2513                         ->set_default_map($default_mapping{$property{$addr}});
2514                 }
2515             }
2516
2517             if (ref $early{$addr}[1] eq 'CODE') {
2518                 $function_instead_of_file = 1;
2519
2520                 # If the first element of the array is a code ref, the others
2521                 # are optional.
2522                 $handler{$addr} = $early{$addr}[1];
2523                 $property{$addr} = $early{$addr}[2]
2524                                                 if defined $early{$addr}[2];
2525                 $progress = "substitute $file{$addr}";
2526
2527                 undef $file{$addr};
2528             }
2529             else {  # Specifying a substitute file
2530
2531                 if (! main::file_exists($early{$addr}[1])) {
2532
2533                     # If we don't see the substitute file, generate an error
2534                     # message giving the needed things, and add it to the list
2535                     # of such to output before actual processing happens
2536                     # (hence the user finds out all of them in one run).
2537                     # Instead of creating a general method for NameAliases,
2538                     # hard-code it here, as there is unlikely to ever be a
2539                     # second one which needs special handling.
2540                     my $string_version = ($file{$addr} eq "NameAliases.txt")
2541                                     ? 'at least 6.1 (the later, the better)'
2542                                     : sprintf "%vd", $first_released{$addr};
2543                     push @missing_early_files, <<END;
2544 '$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2545 END
2546                     ;
2547                     return;
2548                 }
2549                 $progress = $early{$addr}[1];
2550                 $progress .= ", substituting for $file{$addr}" if $file{$addr};
2551                 $file{$addr} = $early{$addr}[1];
2552                 $property{$addr} = $early{$addr}[2];
2553
2554                 # Ignore code points not in the version being compiled
2555                 push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2556
2557                 if (   $v_version lt v2.0        # Hanguls in this release ...
2558                     && defined $early{$addr}[3]) # ... need special treatment
2559                 {
2560                     push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2561                 }
2562             }
2563
2564             # And this substitute is valid for all releases.
2565             $first_released{$addr} = v0;
2566         }
2567         else {  # Normal behavior
2568             $progress = $file{$addr};
2569             unshift $early{$addr}->@*, 0; # No substitute
2570         }
2571
2572         my $file = $file{$addr};
2573         $progress_message{$addr} = "Processing $progress"
2574                                             unless $progress_message{$addr};
2575
2576         # A file should be there if it is within the window of versions for
2577         # which Unicode supplies it
2578         if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2579             $in_this_release{$addr} = 0;
2580             $skip{$addr} = "";
2581         }
2582         else {
2583             $in_this_release{$addr} = $first_released{$addr} le $v_version;
2584
2585             # Check that the file for this object (possibly using a substitute
2586             # for early releases) exists or we have a function alternative
2587             if (   ! $function_instead_of_file
2588                 && ! main::file_exists($file))
2589             {
2590                 # Here there is nothing available for this release.  This is
2591                 # fine if we aren't expecting anything in this release.
2592                 if (! $in_this_release{$addr}) {
2593                     $skip{$addr} = "";  # Don't remark since we expected
2594                                         # nothing and got nothing
2595                 }
2596                 elsif ($optional{$addr}->@*) {
2597
2598                     # Here the file is optional in this release; Use the
2599                     # passed in text to document this case in the pod.
2600                     $skip{$addr} = $pod_message_for_non_existent_optional;
2601                 }
2602                 elsif (   $in_this_release{$addr}
2603                        && ! defined $skip{$addr}
2604                        && defined $file)
2605                 { # Doesn't exist but should.
2606                     $skip{$addr} = "'$file' not found.  Possibly Big problems";
2607                     Carp::my_carp($skip{$addr});
2608                 }
2609             }
2610             elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2611             {
2612
2613                 # The file exists; if not skipped for another reason, and we are
2614                 # skipping most everything during debugging builds, use that as
2615                 # the skip reason.
2616                 $skip{$addr} = '$debug_skip is on'
2617             }
2618         }
2619
2620         if (   ! $debug_skip
2621             && $non_skip{$addr}
2622             && ! $required_even_in_debug_skip{$addr}
2623             && $verbosity)
2624         {
2625             print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2626         }
2627
2628         # Here, we have figured out if we will be skipping this file or not.
2629         # If so, we add any single property it defines to any passed in
2630         # optional property list.  These will be dealt with at run time.
2631         if (defined $skip{$addr}) {
2632             if ($property{$addr}) {
2633                 push $optional{$addr}->@*, $property{$addr};
2634             }
2635         } # Otherwise, are going to process the file.
2636         elsif ($property{$addr}) {
2637
2638             # If the file has a property defined in the constructor for it, it
2639             # means that the property is not listed in the file's entries.  So
2640             # add a handler (to the list of line handlers) to insert the
2641             # property name into the lines, to provide a uniform interface to
2642             # the final processing subroutine.
2643             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2644         }
2645         elsif ($properties{$addr}) {
2646
2647             # Similarly, there may be more than one property represented on
2648             # each line, with no clue but the constructor input what those
2649             # might be.  Add a handler for each line in the input so that it
2650             # creates a separate input line for each property in those input
2651             # lines, thus making them suitable to handle generically.
2652
2653             push @{$each_line_handler{$addr}},
2654                  sub {
2655                     my $file = shift;
2656                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2657
2658                     my @fields = split /\s*;\s*/, $_, -1;
2659
2660                     if (@fields - 1 > @{$properties{$addr}}) {
2661                         $file->carp_bad_line('Extra fields');
2662                         $_ = "";
2663                         return;
2664                     }
2665                     my $range = shift @fields;  # 0th element is always the
2666                                                 # range
2667
2668                     # The next fields in the input line correspond
2669                     # respectively to the stored properties.
2670                     for my $i (0 ..  @{$properties{$addr}} - 1) {
2671                         my $property_name = $properties{$addr}[$i];
2672                         next if $property_name eq '<ignored>';
2673                         $file->insert_adjusted_lines(
2674                               "$range; $property_name; $fields[$i]");
2675                     }
2676                     $_ = "";
2677
2678                     return;
2679                 };
2680         }
2681
2682         {   # On non-ascii platforms, we use a special pre-handler
2683             no strict;
2684             no warnings 'once';
2685             *next_line = (main::NON_ASCII_PLATFORM)
2686                          ? *_next_line_with_remapped_range
2687                          : *_next_line;
2688         }
2689
2690         &{$construction_time_handler{$addr}}($self)
2691                                         if $construction_time_handler{$addr};
2692
2693         return $self;
2694     }
2695
2696
2697     use overload
2698         fallback => 0,
2699         qw("") => "_operator_stringify",
2700         "." => \&main::_operator_dot,
2701         ".=" => \&main::_operator_dot_equal,
2702     ;
2703
2704     sub _operator_stringify {
2705         my $self = shift;
2706
2707         return __PACKAGE__ . " object for " . $self->file;
2708     }
2709
2710     sub run {
2711         # Process the input object $self.  This opens and closes the file and
2712         # calls all the handlers for it.  Currently,  this can only be called
2713         # once per file, as it destroy's the EOF handlers
2714
2715         # flag to make sure extracted files are processed early
2716         state $seen_non_extracted = 0;
2717
2718         my $self = shift;
2719         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2720
2721         my $addr = do { no overloading; pack 'J', $self; };
2722
2723         my $file = $file{$addr};
2724
2725         if (! $file) {
2726             $handle{$addr} = 'pretend_is_open';
2727         }
2728         else {
2729             if ($seen_non_extracted) {
2730                 if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2731                                             # case of the file's name
2732                 {
2733                     Carp::my_carp_bug(main::join_lines(<<END
2734 $file should be processed just after the 'Prop...Alias' files, and before
2735 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2736 have subtle problems
2737 END
2738                     ));
2739                 }
2740             }
2741             elsif ($EXTRACTED_DIR
2742
2743                     # We only do this check for generic property files
2744                     && $handler{$addr} == \&main::process_generic_property_file
2745
2746                     && $file !~ /$EXTRACTED/i)
2747             {
2748                 # We don't set this (by the 'if' above) if we have no
2749                 # extracted directory, so if running on an early version,
2750                 # this test won't work.  Not worth worrying about.
2751                 $seen_non_extracted = 1;
2752             }
2753
2754             # Mark the file as having being processed, and warn if it
2755             # isn't a file we are expecting.  As we process the files,
2756             # they are deleted from the hash, so any that remain at the
2757             # end of the program are files that we didn't process.
2758             my $fkey = File::Spec->rel2abs($file);
2759             my $exists = delete $potential_files{lc($fkey)};
2760
2761             Carp::my_carp("Was not expecting '$file'.")
2762                                     if $exists && ! $in_this_release{$addr};
2763
2764             # If there is special handling for compiling Unicode releases
2765             # earlier than the first one in which Unicode defines this
2766             # property ...
2767             if ($early{$addr}->@* > 1) {
2768
2769                 # Mark as processed any substitute file that would be used in
2770                 # such a release
2771                 $fkey = File::Spec->rel2abs($early{$addr}[1]);
2772                 delete $potential_files{lc($fkey)};
2773
2774                 # As commented in the constructor code, when using the
2775                 # official property, we still have to allow the publicly
2776                 # inaccessible early name so that the core code which uses it
2777                 # will work regardless.
2778                 if (   ! $only_early{$addr}
2779                     && ! $early{$addr}[0]
2780                     && $early{$addr}->@* > 2)
2781                 {
2782                     my $early_property_name = $early{$addr}[2];
2783                     if ($property{$addr} ne $early_property_name) {
2784                         main::property_ref($property{$addr})
2785                                             ->add_alias($early_property_name);
2786                     }
2787                 }
2788             }
2789
2790             # We may be skipping this file ...
2791             if (defined $skip{$addr}) {
2792
2793                 # If the file isn't supposed to be in this release, there is
2794                 # nothing to do
2795                 if ($in_this_release{$addr}) {
2796
2797                     # But otherwise, we may print a message
2798                     if ($debug_skip) {
2799                         print STDERR "Skipping input file '$file'",
2800                                      " because '$skip{$addr}'\n";
2801                     }
2802
2803                     # And add it to the list of skipped files, which is later
2804                     # used to make the pod
2805                     $skipped_files{$file} = $skip{$addr};
2806
2807                     # The 'optional' list contains properties that are also to
2808                     # be skipped along with the file.  (There may also be
2809                     # digits which are just placeholders to make sure it isn't
2810                     # an empty list
2811                     foreach my $property ($optional{$addr}->@*) {
2812                         next unless $property =~ /\D/;
2813                         my $prop_object = main::property_ref($property);
2814                         next unless defined $prop_object;
2815                         $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2816                     }
2817                 }
2818
2819                 return;
2820             }
2821
2822             # Here, we are going to process the file.  Open it, converting the
2823             # slashes used in this program into the proper form for the OS
2824             my $file_handle;
2825             if (not open $file_handle, "<", $file) {
2826                 Carp::my_carp("Can't open $file.  Skipping: $!");
2827                 return;
2828             }
2829             $handle{$addr} = $file_handle; # Cache the open file handle
2830
2831             # If possible, make sure that the file is the correct version.
2832             # (This data isn't available on early Unicode releases or in
2833             # UnicodeData.txt.)  We don't do this check if we are using a
2834             # substitute file instead of the official one (though the code
2835             # could be extended to do so).
2836             if ($in_this_release{$addr}
2837                 && ! $early{$addr}[0]
2838                 && lc($file) ne 'unicodedata.txt')
2839             {
2840                 if ($file !~ /^Unihan/i) {
2841
2842                     # The non-Unihan files started getting version numbers in
2843                     # 3.2, but some files in 4.0 are unchanged from 3.2, and
2844                     # marked as 3.2.  4.0.1 is the first version where there
2845                     # are no files marked as being from less than 4.0, though
2846                     # some are marked as 4.0.  In versions after that, the
2847                     # numbers are correct.
2848                     if ($v_version ge v4.0.1) {
2849                         $_ = <$file_handle>;    # The version number is in the
2850                                                 # very first line
2851                         if ($_ !~ / - $string_version \. /x) {
2852                             chomp;
2853                             $_ =~ s/^#\s*//;
2854
2855                             # 4.0.1 had some valid files that weren't updated.
2856                             if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
2857                                 die Carp::my_carp("File '$file' is version "
2858                                                 . "'$_'.  It should be "
2859                                                 . "version $string_version");
2860                             }
2861                         }
2862                     }
2863                 }
2864                 elsif ($v_version ge v6.0.0) { # Unihan
2865
2866                     # Unihan files didn't get accurate version numbers until
2867                     # 6.0.  The version is somewhere in the first comment
2868                     # block
2869                     while (<$file_handle>) {
2870                         if ($_ !~ /^#/) {
2871                             Carp::my_carp_bug("Could not find the expected "
2872                                             . "version info in file '$file'");
2873                             last;
2874                         }
2875                         chomp;
2876                         $_ =~ s/^#\s*//;
2877                         next if $_ !~ / version: /x;
2878                         last if $_ =~ /$string_version/;
2879                         die Carp::my_carp("File '$file' is version "
2880                                         . "'$_'.  It should be "
2881                                         . "version $string_version");
2882                     }
2883                 }
2884             }
2885         }
2886
2887         print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2888
2889         # Call any special handler for before the file.
2890         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2891
2892         # Then the main handler
2893         &{$handler{$addr}}($self);
2894
2895         # Then any special post-file handler.
2896         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2897
2898         # If any errors have been accumulated, output the counts (as the first
2899         # error message in each class was output when it was encountered).
2900         if ($errors{$addr}) {
2901             my $total = 0;
2902             my $types = 0;
2903             foreach my $error (keys %{$errors{$addr}}) {
2904                 $total += $errors{$addr}->{$error};
2905                 delete $errors{$addr}->{$error};
2906                 $types++;
2907             }
2908             if ($total > 1) {
2909                 my $message
2910                         = "A total of $total lines had errors in $file.  ";
2911
2912                 $message .= ($types == 1)
2913                             ? '(Only the first one was displayed.)'
2914                             : '(Only the first of each type was displayed.)';
2915                 Carp::my_carp($message);
2916             }
2917         }
2918
2919         if (@{$missings{$addr}}) {
2920             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2921         }
2922
2923         # If a real file handle, close it.
2924         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2925                                                         ref $handle{$addr};
2926         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2927                                # the file, as opposed to undef
2928         return;
2929     }
2930
2931     sub _next_line {
2932         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2933         # if such a line exists.  'logical' means that any lines that have
2934         # been added via insert_lines() will be returned in $_ before the file
2935         # is read again.
2936
2937         my $self = shift;
2938         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2939
2940         my $addr = do { no overloading; pack 'J', $self; };
2941
2942         # Here the file is open (or if the handle is not a ref, is an open
2943         # 'virtual' file).  Get the next line; any inserted lines get priority
2944         # over the file itself.
2945         my $adjusted;
2946
2947         LINE:
2948         while (1) { # Loop until find non-comment, non-empty line
2949             #local $to_trace = 1 if main::DEBUG;
2950             my $inserted_ref = shift @{$added_lines{$addr}};
2951             if (defined $inserted_ref) {
2952                 ($adjusted, $_) = @{$inserted_ref};
2953                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2954                 return 1 if $adjusted;
2955             }
2956             else {
2957                 last if ! ref $handle{$addr}; # Don't read unless is real file
2958                 last if ! defined ($_ = readline $handle{$addr});
2959             }
2960             chomp;
2961             trace $_ if main::DEBUG && $to_trace;
2962
2963             # See if this line is the comment line that defines what property
2964             # value that code points that are not listed in the file should
2965             # have.  The format or existence of these lines is not guaranteed
2966             # by Unicode since they are comments, but the documentation says
2967             # that this was added for machine-readability, so probably won't
2968             # change.  This works starting in Unicode Version 5.0.  They look
2969             # like:
2970             #
2971             # @missing: 0000..10FFFF; Not_Reordered
2972             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2973             # @missing: 0000..10FFFF; ; NaN
2974             #
2975             # Save the line for a later get_missings() call.
2976             if (/$missing_defaults_prefix/) {
2977                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2978                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2979                 }
2980                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2981                     my @defaults = split /\s* ; \s*/x, $_;
2982
2983                     # The first field is the @missing, which ends in a
2984                     # semi-colon, so can safely shift.
2985                     shift @defaults;
2986
2987                     # Some of these lines may have empty field placeholders
2988                     # which get in the way.  An example is:
2989                     # @missing: 0000..10FFFF; ; NaN
2990                     # Remove them.  Process starting from the top so the
2991                     # splice doesn't affect things still to be looked at.
2992                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2993                         next if $defaults[$i] ne "";
2994                         splice @defaults, $i, 1;
2995                     }
2996
2997                     # What's left should be just the property (maybe) and the
2998                     # default.  Having only one element means it doesn't have
2999                     # the property.
3000                     my $default;
3001                     my $property;
3002                     if (@defaults >= 1) {
3003                         if (@defaults == 1) {
3004                             $default = $defaults[0];
3005                         }
3006                         else {
3007                             $property = $defaults[0];
3008                             $default = $defaults[1];
3009                         }
3010                     }
3011
3012                     if (@defaults < 1
3013                         || @defaults > 2
3014                         || ($default =~ /^</
3015                             && $default !~ /^<code *point>$/i
3016                             && $default !~ /^<none>$/i
3017                             && $default !~ /^<script>$/i))
3018                     {
3019                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
3020                     }
3021                     else {
3022
3023                         # If the property is missing from the line, it should
3024                         # be the one for the whole file
3025                         $property = $property{$addr} if ! defined $property;
3026
3027                         # Change <none> to the null string, which is what it
3028                         # really means.  If the default is the code point
3029                         # itself, set it to <code point>, which is what
3030                         # Unicode uses (but sometimes they've forgotten the
3031                         # space)
3032                         if ($default =~ /^<none>$/i) {
3033                             $default = "";
3034                         }
3035                         elsif ($default =~ /^<code *point>$/i) {
3036                             $default = $CODE_POINT;
3037                         }
3038                         elsif ($default =~ /^<script>$/i) {
3039
3040                             # Special case this one.  Currently is from
3041                             # ScriptExtensions.txt, and means for all unlisted
3042                             # code points, use their Script property values.
3043                             # For the code points not listed in that file, the
3044                             # default value is 'Unknown'.
3045                             $default = "Unknown";
3046                         }
3047
3048                         # Store them as a sub-arrays with both components.
3049                         push @{$missings{$addr}}, [ $default, $property ];
3050                     }
3051                 }
3052
3053                 # There is nothing for the caller to process on this comment
3054                 # line.
3055                 next;
3056             }
3057
3058             # Unless to keep, remove comments.  If to keep, ignore
3059             # comment-only lines
3060             if ($retain_trailing_comments{$addr}) {
3061                 next if / ^ \s* \# /x;
3062
3063                 # But escape any single quotes (done in both the comment and
3064                 # non-comment portion; this could be a bug someday, but not
3065                 # likely)
3066                 s/'/\\'/g;
3067             }
3068             else {
3069                 s/#.*//;
3070             }
3071
3072             # Remove trailing space, and skip this line if the result is empty
3073             s/\s+$//;
3074             next if /^$/;
3075
3076             # Call any handlers for this line, and skip further processing of
3077             # the line if the handler sets the line to null.
3078             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3079                 &{$sub_ref}($self);
3080                 next LINE if /^$/;
3081             }
3082
3083             # Here the line is ok.  return success.
3084             return 1;
3085         } # End of looping through lines.
3086
3087         # If there are EOF handlers, call each (only once) and if it generates
3088         # more lines to process go back in the loop to handle them.
3089         while ($eof_handler{$addr}->@*) {
3090             &{$eof_handler{$addr}[0]}($self);
3091             shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3092             goto LINE if $added_lines{$addr};
3093         }
3094
3095         # Return failure -- no more lines.
3096         return 0;
3097
3098     }
3099
3100     sub _next_line_with_remapped_range {
3101         my $self = shift;
3102         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3103
3104         # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3105         # to be the next logical input line, if any.  Returns non-zero if such
3106         # a line exists.  'logical' means that any lines that have been added
3107         # via insert_lines() will be returned in $_ before the file is read
3108         # again.
3109         #
3110         # The difference from _next_line() is that this remaps the Unicode
3111         # code points in the input to those of the native platform.  Each
3112         # input line contains a single code point, or a single contiguous
3113         # range of them  This routine splits each range into its individual
3114         # code points and caches them.  It returns the cached values,
3115         # translated into their native equivalents, one at a time, for each
3116         # call, before reading the next line.  Since native values can only be
3117         # a single byte wide, no translation is needed for code points above
3118         # 0xFF, and ranges that are entirely above that number are not split.
3119         # If an input line contains the range 254-1000, it would be split into
3120         # three elements: 254, 255, and 256-1000.  (The downstream table
3121         # insertion code will sort and coalesce the individual code points
3122         # into appropriate ranges.)
3123
3124         my $addr = do { no overloading; pack 'J', $self; };
3125
3126         while (1) {
3127
3128             # Look in cache before reading the next line.  Return any cached
3129             # value, translated
3130             my $inserted = shift @{$remapped_lines{$addr}};
3131             if (defined $inserted) {
3132                 trace $inserted if main::DEBUG && $to_trace;
3133                 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3134                 trace $_ if main::DEBUG && $to_trace;
3135                 return 1;
3136             }
3137
3138             # Get the next line.
3139             return 0 unless _next_line($self);
3140
3141             # If there is a special handler for it, return the line,
3142             # untranslated.  This should happen only for files that are
3143             # special, not being code-point related, such as property names.
3144             return 1 if $handler{$addr}
3145                                     != \&main::process_generic_property_file;
3146
3147             my ($range, $property_name, $map, @remainder)
3148                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3149
3150             if (@remainder
3151                 || ! defined $property_name
3152                 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3153             {
3154                 Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3155             }
3156
3157             my $low = hex $1;
3158             my $high = (defined $2) ? hex $2 : $low;
3159
3160             # If the input maps the range to another code point, remap the
3161             # target if it is between 0 and 255.
3162             my $tail;
3163             if (defined $map) {
3164                 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3165                 $tail = "$property_name; $map";
3166                 $_ = "$range; $tail";
3167             }
3168             else {
3169                 $tail = $property_name;
3170             }
3171
3172             # If entire range is above 255, just return it, unchanged (except
3173             # any mapped-to code point, already changed above)
3174             return 1 if $low > 255;
3175
3176             # Cache an entry for every code point < 255.  For those in the
3177             # range above 255, return a dummy entry for just that portion of
3178             # the range.  Note that this will be out-of-order, but that is not
3179             # a problem.
3180             foreach my $code_point ($low .. $high) {
3181                 if ($code_point > 255) {
3182                     $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3183                     return 1;
3184                 }
3185                 push @{$remapped_lines{$addr}}, "$code_point; $tail";
3186             }
3187         } # End of looping through lines.
3188
3189         # NOTREACHED
3190     }
3191
3192 #   Not currently used, not fully tested.
3193 #    sub peek {
3194 #        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3195 #        # record.  Not callable from an each_line_handler(), nor does it call
3196 #        # an each_line_handler() on the line.
3197 #
3198 #        my $self = shift;
3199 #        my $addr = do { no overloading; pack 'J', $self; };
3200 #
3201 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3202 #            my ($adjusted, $line) = @{$inserted_ref};
3203 #            next if $adjusted;
3204 #
3205 #            # Remove comments and trailing space, and return a non-empty
3206 #            # resulting line
3207 #            $line =~ s/#.*//;
3208 #            $line =~ s/\s+$//;
3209 #            return $line if $line ne "";
3210 #        }
3211 #
3212 #        return if ! ref $handle{$addr}; # Don't read unless is real file
3213 #        while (1) { # Loop until find non-comment, non-empty line
3214 #            local $to_trace = 1 if main::DEBUG;
3215 #            trace $_ if main::DEBUG && $to_trace;
3216 #            return if ! defined (my $line = readline $handle{$addr});
3217 #            chomp $line;
3218 #            push @{$added_lines{$addr}}, [ 0, $line ];
3219 #
3220 #            $line =~ s/#.*//;
3221 #            $line =~ s/\s+$//;
3222 #            return $line if $line ne "";
3223 #        }
3224 #
3225 #        return;
3226 #    }
3227
3228
3229     sub insert_lines {
3230         # Lines can be inserted so that it looks like they were in the input
3231         # file at the place it was when this routine is called.  See also
3232         # insert_adjusted_lines().  Lines inserted via this routine go through
3233         # any each_line_handler()
3234
3235         my $self = shift;
3236
3237         # Each inserted line is an array, with the first element being 0 to
3238         # indicate that this line hasn't been adjusted, and needs to be
3239         # processed.
3240         no overloading;
3241         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
3242         return;
3243     }
3244
3245     sub insert_adjusted_lines {
3246         # Lines can be inserted so that it looks like they were in the input
3247         # file at the place it was when this routine is called.  See also
3248         # insert_lines().  Lines inserted via this routine are already fully
3249         # adjusted, ready to be processed; each_line_handler()s handlers will
3250         # not be called.  This means this is not a completely general
3251         # facility, as only the last each_line_handler on the stack should
3252         # call this.  It could be made more general, by passing to each of the
3253         # line_handlers their position on the stack, which they would pass on
3254         # to this routine, and that would replace the boolean first element in
3255         # the anonymous array pushed here, so that the next_line routine could
3256         # use that to call only those handlers whose index is after it on the
3257         # stack.  But this is overkill for what is needed now.
3258
3259         my $self = shift;
3260         trace $_[0] if main::DEBUG && $to_trace;
3261
3262         # Each inserted line is an array, with the first element being 1 to
3263         # indicate that this line has been adjusted
3264         no overloading;
3265         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
3266         return;
3267     }
3268
3269     sub get_missings {
3270         # Returns the stored up @missings lines' values, and clears the list.
3271         # The values are in an array, consisting of the default in the first
3272         # element, and the property in the 2nd.  However, since these lines
3273         # can be stacked up, the return is an array of all these arrays.
3274
3275         my $self = shift;
3276         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3277
3278         my $addr = do { no overloading; pack 'J', $self; };
3279
3280         # If not accepting a list return, just return the first one.
3281         return shift @{$missings{$addr}} unless wantarray;
3282
3283         my @return = @{$missings{$addr}};
3284         undef @{$missings{$addr}};
3285         return @return;
3286     }
3287
3288     sub _exclude_unassigned {
3289
3290         # Takes the range in $_ and excludes code points that aren't assigned
3291         # in this release
3292
3293         state $skip_inserted_count = 0;
3294
3295         # Ignore recursive calls.
3296         if ($skip_inserted_count) {
3297             $skip_inserted_count--;
3298             return;
3299         }
3300
3301         # Find what code points are assigned in this release
3302         main::calculate_Assigned() if ! defined $Assigned;
3303
3304         my $self = shift;
3305         my $addr = do { no overloading; pack 'J', $self; };
3306         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3307
3308         my ($range, @remainder)
3309             = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3310
3311         # Examine the range.
3312         if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3313         {
3314             my $low = hex $1;
3315             my $high = (defined $2) ? hex $2 : $low;
3316
3317             # Split the range into subranges of just those code points in it
3318             # that are assigned.
3319             my @ranges = (Range_List->new(Initialize
3320                               => Range->new($low, $high)) & $Assigned)->ranges;
3321
3322             # Do nothing if nothing in the original range is assigned in this
3323             # release; handle normally if everything is in this release.
3324             if (! @ranges) {
3325                 $_ = "";
3326             }
3327             elsif (@ranges != 1) {
3328
3329                 # Here, some code points in the original range aren't in this
3330                 # release; @ranges gives the ones that are.  Create fake input
3331                 # lines for each of the ranges, and set things up so that when
3332                 # this routine is called on that fake input, it will do
3333                 # nothing.
3334                 $skip_inserted_count = @ranges;
3335                 my $remainder = join ";", @remainder;
3336                 for my $range (@ranges) {
3337                     $self->insert_lines(sprintf("%04X..%04X;%s",
3338                                     $range->start, $range->end, $remainder));
3339                 }
3340                 $_ = "";    # The original range is now defunct.
3341             }
3342         }
3343
3344         return;
3345     }
3346
3347     sub _fixup_obsolete_hanguls {
3348
3349         # This is called only when compiling Unicode version 1.  All Unicode
3350         # data for subsequent releases assumes that the code points that were
3351         # Hangul syllables in this release only are something else, so if
3352         # using such data, we have to override it
3353
3354         my $self = shift;
3355         my $addr = do { no overloading; pack 'J', $self; };
3356         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3357
3358         my $object = main::property_ref($property{$addr});
3359         $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3360                          $FINAL_REMOVED_HANGUL_SYLLABLE,
3361                          $early{$addr}[3],  # Passed-in value for these
3362                          Replace => $UNCONDITIONALLY);
3363     }
3364
3365     sub _insert_property_into_line {
3366         # Add a property field to $_, if this file requires it.
3367
3368         my $self = shift;
3369         my $addr = do { no overloading; pack 'J', $self; };
3370         my $property = $property{$addr};
3371         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3372
3373         $_ =~ s/(;|$)/; $property$1/;
3374         return;
3375     }
3376
3377     sub carp_bad_line {
3378         # Output consistent error messages, using either a generic one, or the
3379         # one given by the optional parameter.  To avoid gazillions of the
3380         # same message in case the syntax of a  file is way off, this routine
3381         # only outputs the first instance of each message, incrementing a
3382         # count so the totals can be output at the end of the file.
3383
3384         my $self = shift;
3385         my $message = shift;
3386         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3387
3388         my $addr = do { no overloading; pack 'J', $self; };
3389
3390         $message = 'Unexpected line' unless $message;
3391
3392         # No trailing punctuation so as to fit with our addenda.
3393         $message =~ s/[.:;,]$//;
3394
3395         # If haven't seen this exact message before, output it now.  Otherwise
3396         # increment the count of how many times it has occurred
3397         unless ($errors{$addr}->{$message}) {
3398             Carp::my_carp("$message in '$_' in "
3399                             . $file{$addr}
3400                             . " at line $..  Skipping this line;");
3401             $errors{$addr}->{$message} = 1;
3402         }
3403         else {
3404             $errors{$addr}->{$message}++;
3405         }
3406
3407         # Clear the line to prevent any further (meaningful) processing of it.
3408         $_ = "";
3409
3410         return;
3411     }
3412 } # End closure
3413
3414 package Multi_Default;
3415
3416 # Certain properties in early versions of Unicode had more than one possible
3417 # default for code points missing from the files.  In these cases, one
3418 # default applies to everything left over after all the others are applied,
3419 # and for each of the others, there is a description of which class of code
3420 # points applies to it.  This object helps implement this by storing the
3421 # defaults, and for all but that final default, an eval string that generates
3422 # the class that it applies to.
3423
3424
3425 {   # Closure
3426
3427     main::setup_package();
3428
3429     my %class_defaults;
3430     # The defaults structure for the classes
3431     main::set_access('class_defaults', \%class_defaults);
3432
3433     my %other_default;
3434     # The default that applies to everything left over.
3435     main::set_access('other_default', \%other_default, 'r');
3436
3437
3438     sub new {
3439         # The constructor is called with default => eval pairs, terminated by
3440         # the left-over default. e.g.
3441         # Multi_Default->new(
3442         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3443         #               -  0x200D',
3444         #        'R' => 'some other expression that evaluates to code points',
3445         #        .
3446         #        .
3447         #        .
3448         #        'U'));
3449         # It is best to leave the final value be the one that matches the
3450         # above-Unicode code points.
3451
3452         my $class = shift;
3453
3454         my $self = bless \do{my $anonymous_scalar}, $class;
3455         my $addr = do { no overloading; pack 'J', $self; };
3456
3457         while (@_ > 1) {
3458             my $default = shift;
3459             my $eval = shift;
3460             $class_defaults{$addr}->{$default} = $eval;
3461         }
3462
3463         $other_default{$addr} = shift;
3464
3465         return $self;
3466     }
3467
3468     sub get_next_defaults {
3469         # Iterates and returns the next class of defaults.
3470         my $self = shift;
3471         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3472
3473         my $addr = do { no overloading; pack 'J', $self; };
3474
3475         return each %{$class_defaults{$addr}};
3476     }
3477 }
3478
3479 package Alias;
3480
3481 # An alias is one of the names that a table goes by.  This class defines them
3482 # including some attributes.  Everything is currently setup in the
3483 # constructor.
3484
3485
3486 {   # Closure
3487
3488     main::setup_package();
3489
3490     my %name;
3491     main::set_access('name', \%name, 'r');
3492
3493     my %loose_match;
3494     # Should this name match loosely or not.
3495     main::set_access('loose_match', \%loose_match, 'r');
3496
3497     my %make_re_pod_entry;
3498     # Some aliases should not get their own entries in the re section of the
3499     # pod, because they are covered by a wild-card, and some we want to
3500     # discourage use of.  Binary
3501     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3502
3503     my %ucd;
3504     # Is this documented to be accessible via Unicode::UCD
3505     main::set_access('ucd', \%ucd, 'r', 's');
3506
3507     my %status;
3508     # Aliases have a status, like deprecated, or even suppressed (which means
3509     # they don't appear in documentation).  Enum
3510     main::set_access('status', \%status, 'r');
3511
3512     my %ok_as_filename;
3513     # Similarly, some aliases should not be considered as usable ones for
3514     # external use, such as file names, or we don't want documentation to
3515     # recommend them.  Boolean
3516     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3517
3518     sub new {
3519         my $class = shift;
3520
3521         my $self = bless \do { my $anonymous_scalar }, $class;
3522         my $addr = do { no overloading; pack 'J', $self; };
3523
3524         $name{$addr} = shift;
3525         $loose_match{$addr} = shift;
3526         $make_re_pod_entry{$addr} = shift;
3527         $ok_as_filename{$addr} = shift;
3528         $status{$addr} = shift;
3529         $ucd{$addr} = shift;
3530
3531         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3532
3533         # Null names are never ok externally
3534         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3535
3536         return $self;
3537     }
3538 }
3539
3540 package Range;
3541
3542 # A range is the basic unit for storing code points, and is described in the
3543 # comments at the beginning of the program.  Each range has a starting code
3544 # point; an ending code point (not less than the starting one); a value
3545 # that applies to every code point in between the two end-points, inclusive;
3546 # and an enum type that applies to the value.  The type is for the user's
3547 # convenience, and has no meaning here, except that a non-zero type is
3548 # considered to not obey the normal Unicode rules for having standard forms.
3549 #
3550 # The same structure is used for both map and match tables, even though in the
3551 # latter, the value (and hence type) is irrelevant and could be used as a
3552 # comment.  In map tables, the value is what all the code points in the range
3553 # map to.  Type 0 values have the standardized version of the value stored as
3554 # well, so as to not have to recalculate it a lot.
3555
3556 sub trace { return main::trace(@_); }
3557
3558 {   # Closure
3559
3560     main::setup_package();
3561
3562     my %start;
3563     main::set_access('start', \%start, 'r', 's');
3564
3565     my %end;
3566     main::set_access('end', \%end, 'r', 's');
3567
3568     my %value;
3569     main::set_access('value', \%value, 'r', 's');
3570
3571     my %type;
3572     main::set_access('type', \%type, 'r');
3573
3574     my %standard_form;
3575     # The value in internal standard form.  Defined only if the type is 0.
3576     main::set_access('standard_form', \%standard_form);
3577
3578     # Note that if these fields change, the dump() method should as well
3579
3580     sub new {
3581         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3582         my $class = shift;
3583
3584         my $self = bless \do { my $anonymous_scalar }, $class;
3585         my $addr = do { no overloading; pack 'J', $self; };
3586
3587         $start{$addr} = shift;
3588         $end{$addr} = shift;
3589
3590         my %args = @_;
3591
3592         my $value = delete $args{'Value'};  # Can be 0
3593         $value = "" unless defined $value;
3594         $value{$addr} = $value;
3595
3596         $type{$addr} = delete $args{'Type'} || 0;
3597
3598         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3599
3600         return $self;
3601     }
3602
3603     use overload
3604         fallback => 0,
3605         qw("") => "_operator_stringify",
3606         "." => \&main::_operator_dot,
3607         ".=" => \&main::_operator_dot_equal,
3608     ;
3609
3610     sub _operator_stringify {
3611         my $self = shift;
3612         my $addr = do { no overloading; pack 'J', $self; };
3613
3614         # Output it like '0041..0065 (value)'
3615         my $return = sprintf("%04X", $start{$addr})
3616                         .  '..'
3617                         . sprintf("%04X", $end{$addr});
3618         my $value = $value{$addr};
3619         my $type = $type{$addr};
3620         $return .= ' (';
3621         $return .= "$value";
3622         $return .= ", Type=$type" if $type != 0;
3623         $return .= ')';
3624
3625         return $return;
3626     }
3627
3628     sub standard_form {
3629         # Calculate the standard form only if needed, and cache the result.
3630         # The standard form is the value itself if the type is special.
3631         # This represents a considerable CPU and memory saving - at the time
3632         # of writing there are 368676 non-special objects, but the standard
3633         # form is only requested for 22047 of them - ie about 6%.
3634
3635         my $self = shift;
3636         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3637
3638         my $addr = do { no overloading; pack 'J', $self; };
3639
3640         return $standard_form{$addr} if defined $standard_form{$addr};
3641
3642         my $value = $value{$addr};
3643         return $value if $type{$addr};
3644         return $standard_form{$addr} = main::standardize($value);
3645     }
3646
3647     sub dump {
3648         # Human, not machine readable.  For machine readable, comment out this
3649         # entire routine and let the standard one take effect.
3650         my $self = shift;
3651         my $indent = shift;
3652         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3653
3654         my $addr = do { no overloading; pack 'J', $self; };
3655
3656         my $return = $indent
3657                     . sprintf("%04X", $start{$addr})
3658                     . '..'
3659                     . sprintf("%04X", $end{$addr})
3660                     . " '$value{$addr}';";
3661         if (! defined $standard_form{$addr}) {
3662             $return .= "(type=$type{$addr})";
3663         }
3664         elsif ($standard_form{$addr} ne $value{$addr}) {
3665             $return .= "(standard '$standard_form{$addr}')";
3666         }
3667         return $return;
3668     }
3669 } # End closure
3670
3671 package _Range_List_Base;
3672
3673 # Base class for range lists.  A range list is simply an ordered list of
3674 # ranges, so that the ranges with the lowest starting numbers are first in it.
3675 #
3676 # When a new range is added that is adjacent to an existing range that has the
3677 # same value and type, it merges with it to form a larger range.
3678 #
3679 # Ranges generally do not overlap, except that there can be multiple entries
3680 # of single code point ranges.  This is because of NameAliases.txt.
3681 #
3682 # In this program, there is a standard value such that if two different
3683 # values, have the same standard value, they are considered equivalent.  This
3684 # value was chosen so that it gives correct results on Unicode data
3685
3686 # There are a number of methods to manipulate range lists, and some operators
3687 # are overloaded to handle them.
3688
3689 sub trace { return main::trace(@_); }
3690
3691 { # Closure
3692
3693     our $addr;
3694
3695     # Max is initialized to a negative value that isn't adjacent to 0, for
3696     # simpler tests
3697     my $max_init = -2;
3698
3699     main::setup_package();
3700
3701     my %ranges;
3702     # The list of ranges
3703     main::set_access('ranges', \%ranges, 'readable_array');
3704
3705     my %max;
3706     # The highest code point in the list.  This was originally a method, but
3707     # actual measurements said it was used a lot.
3708     main::set_access('max', \%max, 'r');
3709
3710     my %each_range_iterator;
3711     # Iterator position for each_range()
3712     main::set_access('each_range_iterator', \%each_range_iterator);
3713
3714     my %owner_name_of;
3715     # Name of parent this is attached to, if any.  Solely for better error
3716     # messages.
3717     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3718
3719     my %_search_ranges_cache;
3720     # A cache of the previous result from _search_ranges(), for better
3721     # performance
3722     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3723
3724     sub new {
3725         my $class = shift;
3726         my %args = @_;
3727
3728         # Optional initialization data for the range list.
3729         my $initialize = delete $args{'Initialize'};
3730
3731         my $self;
3732
3733         # Use _union() to initialize.  _union() returns an object of this
3734         # class, which means that it will call this constructor recursively.
3735         # But it won't have this $initialize parameter so that it won't
3736         # infinitely loop on this.
3737         return _union($class, $initialize, %args) if defined $initialize;
3738
3739         $self = bless \do { my $anonymous_scalar }, $class;
3740         my $addr = do { no overloading; pack 'J', $self; };
3741
3742         # Optional parent object, only for debug info.
3743         $owner_name_of{$addr} = delete $args{'Owner'};
3744         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3745
3746         # Stringify, in case it is an object.
3747         $owner_name_of{$addr} = "$owner_name_of{$addr}";
3748
3749         # This is used only for error messages, and so a colon is added
3750         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3751
3752         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3753
3754         $max{$addr} = $max_init;
3755
3756         $_search_ranges_cache{$addr} = 0;
3757         $ranges{$addr} = [];
3758
3759         return $self;
3760     }
3761
3762     use overload
3763         fallback => 0,
3764         qw("") => "_operator_stringify",
3765         "." => \&main::_operator_dot,
3766         ".=" => \&main::_operator_dot_equal,
3767     ;
3768
3769     sub _operator_stringify {
3770         my $self = shift;
3771         my $addr = do { no overloading; pack 'J', $self; };
3772
3773         return "Range_List attached to '$owner_name_of{$addr}'"
3774                                                 if $owner_name_of{$addr};
3775         return "anonymous Range_List " . \$self;
3776     }
3777
3778     sub _union {
3779         # Returns the union of the input code points.  It can be called as
3780         # either a constructor or a method.  If called as a method, the result
3781         # will be a new() instance of the calling object, containing the union
3782         # of that object with the other parameter's code points;  if called as
3783         # a constructor, the first parameter gives the class that the new object
3784         # should be, and the second parameter gives the code points to go into
3785         # it.
3786         # In either case, there are two parameters looked at by this routine;
3787         # any additional parameters are passed to the new() constructor.
3788         #
3789         # The code points can come in the form of some object that contains
3790         # ranges, and has a conventionally named method to access them; or
3791         # they can be an array of individual code points (as integers); or
3792         # just a single code point.
3793         #
3794         # If they are ranges, this routine doesn't make any effort to preserve
3795         # the range values and types of one input over the other.  Therefore
3796         # this base class should not allow _union to be called from other than
3797         # initialization code, so as to prevent two tables from being added
3798         # together where the range values matter.  The general form of this
3799         # routine therefore belongs in a derived class, but it was moved here
3800         # to avoid duplication of code.  The failure to overload this in this
3801         # class keeps it safe.
3802         #
3803         # It does make the effort during initialization to accept tables with
3804         # multiple values for the same code point, and to preserve the order
3805         # of these.  If there is only one input range or range set, it doesn't
3806         # sort (as it should already be sorted to the desired order), and will
3807         # accept multiple values per code point.  Otherwise it will merge
3808         # multiple values into a single one.
3809
3810         my $self;
3811         my @args;   # Arguments to pass to the constructor
3812
3813         my $class = shift;
3814
3815         # If a method call, will start the union with the object itself, and
3816         # the class of the new object will be the same as self.
3817         if (ref $class) {
3818             $self = $class;
3819             $class = ref $self;
3820             push @args, $self;
3821         }
3822
3823         # Add the other required parameter.
3824         push @args, shift;
3825         # Rest of parameters are passed on to the constructor
3826
3827         # Accumulate all records from both lists.
3828         my @records;
3829         my $input_count = 0;
3830         for my $arg (@args) {
3831             #local $to_trace = 0 if main::DEBUG;
3832             trace "argument = $arg" if main::DEBUG && $to_trace;
3833             if (! defined $arg) {
3834                 my $message = "";
3835                 if (defined $self) {
3836                     no overloading;
3837                     $message .= $owner_name_of{pack 'J', $self};
3838                 }
3839                 Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3840                 return;
3841             }
3842
3843             $arg = [ $arg ] if ! ref $arg;
3844             my $type = ref $arg;
3845             if ($type eq 'ARRAY') {
3846                 foreach my $element (@$arg) {
3847                     push @records, Range->new($element, $element);
3848                     $input_count++;
3849                 }
3850             }
3851             elsif ($arg->isa('Range')) {
3852                 push @records, $arg;
3853                 $input_count++;
3854             }
3855             elsif ($arg->can('ranges')) {
3856                 push @records, $arg->ranges;
3857                 $input_count++;
3858             }
3859             else {
3860                 my $message = "";
3861                 if (defined $self) {
3862                     no overloading;
3863                     $message .= $owner_name_of{pack 'J', $self};
3864                 }
3865                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3866                 return;
3867             }
3868         }
3869
3870         # Sort with the range containing the lowest ordinal first, but if
3871         # two ranges start at the same code point, sort with the bigger range
3872         # of the two first, because it takes fewer cycles.
3873         if ($input_count > 1) {
3874             @records = sort { ($a->start <=> $b->start)
3875                                       or
3876                                     # if b is shorter than a, b->end will be
3877                                     # less than a->end, and we want to select
3878                                     # a, so want to return -1
3879                                     ($b->end <=> $a->end)
3880                                    } @records;
3881         }
3882
3883         my $new = $class->new(@_);
3884
3885         # Fold in records so long as they add new information.
3886         for my $set (@records) {
3887             my $start = $set->start;
3888             my $end   = $set->end;
3889             my $value = $set->value;
3890             my $type  = $set->type;
3891             if ($start > $new->max) {
3892                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3893             }
3894             elsif ($end > $new->max) {
3895                 $new->_add_delete('+', $new->max +1, $end, $value,
3896                                                                 Type => $type);
3897             }
3898             elsif ($input_count == 1) {
3899                 # Here, overlaps existing range, but is from a single input,
3900                 # so preserve the multiple values from that input.
3901                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3902                                                 Replace => $MULTIPLE_AFTER);
3903             }
3904         }
3905
3906         return $new;
3907     }
3908
3909     sub range_count {        # Return the number of ranges in the range list
3910         my $self = shift;
3911         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3912
3913         no overloading;
3914         return scalar @{$ranges{pack 'J', $self}};
3915     }
3916
3917     sub min {
3918         # Returns the minimum code point currently in the range list, or if
3919         # the range list is empty, 2 beyond the max possible.  This is a
3920         # method because used so rarely, that not worth saving between calls,
3921         # and having to worry about changing it as ranges are added and
3922         # deleted.
3923
3924         my $self = shift;
3925         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3926
3927         my $addr = do { no overloading; pack 'J', $self; };
3928
3929         # If the range list is empty, return a large value that isn't adjacent
3930         # to any that could be in the range list, for simpler tests
3931         return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3932         return $ranges{$addr}->[0]->start;
3933     }
3934
3935     sub contains {
3936         # Boolean: Is argument in the range list?  If so returns $i such that:
3937         #   range[$i]->end < $codepoint <= range[$i+1]->end
3938         # which is one beyond what you want; this is so that the 0th range
3939         # doesn't return false
3940         my $self = shift;
3941         my $codepoint = shift;
3942         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3943
3944         my $i = $self->_search_ranges($codepoint);
3945         return 0 unless defined $i;
3946
3947         # The search returns $i, such that
3948         #   range[$i-1]->end < $codepoint <= range[$i]->end
3949         # So is in the table if and only iff it is at least the start position
3950         # of range $i.
3951         no overloading;
3952         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3953         return $i + 1;
3954     }
3955
3956     sub containing_range {
3957         # Returns the range object that contains the code point, undef if none
3958
3959         my $self = shift;
3960         my $codepoint = shift;
3961         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3962
3963         my $i = $self->contains($codepoint);
3964         return unless $i;
3965
3966         # contains() returns 1 beyond where we should look
3967         no overloading;
3968         return $ranges{pack 'J', $self}->[$i-1];
3969     }
3970
3971     sub value_of {
3972         # Returns the value associated with the code point, undef if none
3973
3974         my $self = shift;
3975         my $codepoint = shift;
3976         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3977
3978         my $range = $self->containing_range($codepoint);
3979         return unless defined $range;
3980
3981         return $range->value;
3982     }
3983
3984     sub type_of {
3985         # Returns the type of the range containing the code point, undef if
3986         # the code point is not in the table
3987
3988         my $self = shift;
3989         my $codepoint = shift;
3990         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3991
3992         my $range = $self->containing_range($codepoint);
3993         return unless defined $range;
3994
3995         return $range->type;
3996     }
3997
3998     sub _search_ranges {
3999         # Find the range in the list which contains a code point, or where it
4000         # should go if were to add it.  That is, it returns $i, such that:
4001         #   range[$i-1]->end < $codepoint <= range[$i]->end
4002         # Returns undef if no such $i is possible (e.g. at end of table), or
4003         # if there is an error.
4004
4005         my $self = shift;
4006         my $code_point = shift;
4007         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4008
4009         my $addr = do { no overloading; pack 'J', $self; };
4010
4011         return if $code_point > $max{$addr};
4012         my $r = $ranges{$addr};                # The current list of ranges
4013         my $range_list_size = scalar @$r;
4014         my $i;
4015
4016         use integer;        # want integer division
4017
4018         # Use the cached result as the starting guess for this one, because,
4019         # an experiment on 5.1 showed that 90% of the time the cache was the
4020         # same as the result on the next call (and 7% it was one less).
4021         $i = $_search_ranges_cache{$addr};
4022         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
4023                                             # from an intervening deletion
4024         #local $to_trace = 1 if main::DEBUG;
4025         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
4026         return $i if $code_point <= $r->[$i]->end
4027                      && ($i == 0 || $r->[$i-1]->end < $code_point);
4028
4029         # Here the cache doesn't yield the correct $i.  Try adding 1.
4030         if ($i < $range_list_size - 1
4031             && $r->[$i]->end < $code_point &&
4032             $code_point <= $r->[$i+1]->end)
4033         {
4034             $i++;
4035             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
4036             $_search_ranges_cache{$addr} = $i;
4037             return $i;
4038         }
4039
4040         # Here, adding 1 also didn't work.  We do a binary search to
4041         # find the correct position, starting with current $i
4042         my $lower = 0;
4043         my $upper = $range_list_size - 1;
4044         while (1) {
4045             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
4046
4047             if ($code_point <= $r->[$i]->end) {
4048
4049                 # Here we have met the upper constraint.  We can quit if we
4050                 # also meet the lower one.
4051                 last if $i == 0 || $r->[$i-1]->end < $code_point;
4052
4053                 $upper = $i;        # Still too high.
4054
4055             }
4056             else {
4057
4058                 # Here, $r[$i]->end < $code_point, so look higher up.
4059                 $lower = $i;
4060             }
4061
4062             # Split search domain in half to try again.
4063             my $temp = ($upper + $lower) / 2;
4064
4065             # No point in continuing unless $i changes for next time
4066             # in the loop.
4067             if ($temp == $i) {
4068
4069                 # We can't reach the highest element because of the averaging.
4070                 # So if one below the upper edge, force it there and try one
4071                 # more time.
4072                 if ($i == $range_list_size - 2) {
4073
4074                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4075                     $i = $range_list_size - 1;
4076
4077                     # Change $lower as well so if fails next time through,
4078                     # taking the average will yield the same $i, and we will
4079                     # quit with the error message just below.
4080                     $lower = $i;
4081                     next;
4082                 }
4083                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4084                 return;
4085             }
4086             $i = $temp;
4087         } # End of while loop
4088
4089         if (main::DEBUG && $to_trace) {
4090             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4091             trace "i=  [ $i ]", $r->[$i];
4092             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4093         }
4094
4095         # Here we have found the offset.  Cache it as a starting point for the
4096         # next call.
4097         $_search_ranges_cache{$addr} = $i;
4098         return $i;
4099     }
4100
4101     sub _add_delete {
4102         # Add, replace or delete ranges to or from a list.  The $type
4103         # parameter gives which:
4104         #   '+' => insert or replace a range, returning a list of any changed
4105         #          ranges.
4106         #   '-' => delete a range, returning a list of any deleted ranges.
4107         #
4108         # The next three parameters give respectively the start, end, and
4109         # value associated with the range.  'value' should be null unless the
4110         # operation is '+';
4111         #
4112         # The range list is kept sorted so that the range with the lowest
4113         # starting position is first in the list, and generally, adjacent
4114         # ranges with the same values are merged into a single larger one (see
4115         # exceptions below).
4116         #
4117         # There are more parameters; all are key => value pairs:
4118         #   Type    gives the type of the value.  It is only valid for '+'.
4119         #           All ranges have types; if this parameter is omitted, 0 is
4120         #           assumed.  Ranges with type 0 are assumed to obey the
4121         #           Unicode rules for casing, etc; ranges with other types are
4122         #           not.  Otherwise, the type is arbitrary, for the caller's
4123         #           convenience, and looked at only by this routine to keep
4124         #           adjacent ranges of different types from being merged into
4125         #           a single larger range, and when Replace =>
4126         #           $IF_NOT_EQUIVALENT is specified (see just below).
4127         #   Replace  determines what to do if the range list already contains
4128         #            ranges which coincide with all or portions of the input
4129         #            range.  It is only valid for '+':
4130         #       => $NO            means that the new value is not to replace
4131         #                         any existing ones, but any empty gaps of the
4132         #                         range list coinciding with the input range
4133         #                         will be filled in with the new value.
4134         #       => $UNCONDITIONALLY  means to replace the existing values with
4135         #                         this one unconditionally.  However, if the
4136         #                         new and old values are identical, the
4137         #                         replacement is skipped to save cycles
4138         #       => $IF_NOT_EQUIVALENT means to replace the existing values
4139         #          (the default)  with this one if they are not equivalent.
4140         #                         Ranges are equivalent if their types are the
4141         #                         same, and they are the same string; or if
4142         #                         both are type 0 ranges, if their Unicode
4143         #                         standard forms are identical.  In this last
4144         #                         case, the routine chooses the more "modern"
4145         #                         one to use.  This is because some of the
4146         #                         older files are formatted with values that
4147         #                         are, for example, ALL CAPs, whereas the
4148         #                         derived files have a more modern style,
4149         #                         which looks better.  By looking for this
4150         #                         style when the pre-existing and replacement
4151         #                         standard forms are the same, we can move to
4152         #                         the modern style
4153         #       => $MULTIPLE_BEFORE means that if this range duplicates an
4154         #                         existing one, but has a different value,
4155         #                         don't replace the existing one, but insert
4156         #                         this one so that the same range can occur
4157         #                         multiple times.  They are stored LIFO, so
4158         #                         that the final one inserted is the first one
4159         #                         returned in an ordered search of the table.
4160         #                         If this is an exact duplicate, including the
4161         #                         value, the original will be moved to be
4162         #                         first, before any other duplicate ranges
4163         #                         with different values.
4164         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4165         #                         FIFO, so that this one is inserted after all
4166         #                         others that currently exist.  If this is an
4167         #                         exact duplicate, including value, of an
4168         #                         existing range, this one is discarded
4169         #                         (leaving the existing one in its original,
4170         #                         higher priority position
4171         #       => $CROAK         Die with an error if is already there
4172         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4173         #
4174         # "same value" means identical for non-type-0 ranges, and it means
4175         # having the same standard forms for type-0 ranges.
4176
4177         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4178
4179         my $self = shift;
4180         my $operation = shift;   # '+' for add/replace; '-' for delete;
4181         my $start = shift;
4182         my $end   = shift;
4183         my $value = shift;
4184
4185         my %args = @_;
4186
4187         $value = "" if not defined $value;        # warning: $value can be "0"
4188
4189         my $replace = delete $args{'Replace'};
4190         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4191
4192         my $type = delete $args{'Type'};
4193         $type = 0 unless defined $type;
4194
4195         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4196
4197         my $addr = do { no overloading; pack 'J', $self; };
4198
4199         if ($operation ne '+' && $operation ne '-') {
4200             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4201             return;
4202         }
4203         unless (defined $start && defined $end) {
4204             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4205             return;
4206         }
4207         unless ($end >= $start) {
4208             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4209             return;
4210         }
4211         #local $to_trace = 1 if main::DEBUG;
4212
4213         if ($operation eq '-') {
4214             if ($replace != $IF_NOT_EQUIVALENT) {
4215                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4216                 $replace = $IF_NOT_EQUIVALENT;
4217             }
4218             if ($type) {
4219                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4220                 $type = 0;
4221             }
4222             if ($value ne "") {
4223                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4224                 $value = "";
4225             }
4226         }
4227
4228         my $r = $ranges{$addr};               # The current list of ranges
4229         my $range_list_size = scalar @$r;     # And its size
4230         my $max = $max{$addr};                # The current high code point in
4231                                               # the list of ranges
4232
4233         # Do a special case requiring fewer machine cycles when the new range
4234         # starts after the current highest point.  The Unicode input data is
4235         # structured so this is common.
4236         if ($start > $max) {
4237
4238             trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4239             return if $operation eq '-'; # Deleting a non-existing range is a
4240                                          # no-op
4241
4242             # If the new range doesn't logically extend the current final one
4243             # in the range list, create a new range at the end of the range
4244             # list.  (max cleverly is initialized to a negative number not
4245             # adjacent to 0 if the range list is empty, so even adding a range
4246             # to an empty range list starting at 0 will have this 'if'
4247             # succeed.)
4248             if ($start > $max + 1        # non-adjacent means can't extend.
4249                 || @{$r}[-1]->value ne $value # values differ, can't extend.
4250                 || @{$r}[-1]->type != $type # types differ, can't extend.
4251             ) {
4252                 push @$r, Range->new($start, $end,
4253                                      Value => $value,
4254                                      Type => $type);
4255             }
4256             else {
4257
4258                 # Here, the new range starts just after the current highest in
4259                 # the range list, and they have the same type and value.
4260                 # Extend the existing range to incorporate the new one.
4261                 @{$r}[-1]->set_end($end);
4262             }
4263
4264             # This becomes the new maximum.
4265             $max{$addr} = $end;
4266
4267             return;
4268         }
4269         #local $to_trace = 0 if main::DEBUG;
4270
4271         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4272
4273         # Here, the input range isn't after the whole rest of the range list.
4274         # Most likely 'splice' will be needed.  The rest of the routine finds
4275         # the needed splice parameters, and if necessary, does the splice.
4276         # First, find the offset parameter needed by the splice function for
4277         # the input range.  Note that the input range may span multiple
4278         # existing ones, but we'll worry about that later.  For now, just find
4279         # the beginning.  If the input range is to be inserted starting in a
4280         # position not currently in the range list, it must (obviously) come
4281         # just after the range below it, and just before the range above it.
4282         # Slightly less obviously, it will occupy the position currently
4283         # occupied by the range that is to come after it.  More formally, we
4284         # are looking for the position, $i, in the array of ranges, such that:
4285         #
4286         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4287         #
4288         # (The ordered relationships within existing ranges are also shown in
4289         # the equation above).  However, if the start of the input range is
4290         # within an existing range, the splice offset should point to that
4291         # existing range's position in the list; that is $i satisfies a
4292         # somewhat different equation, namely:
4293         #
4294         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4295         #
4296         # More briefly, $start can come before or after r[$i]->start, and at
4297         # this point, we don't know which it will be.  However, these
4298         # two equations share these constraints:
4299         #
4300         #   r[$i-1]->end < $start <= r[$i]->end
4301         #
4302         # And that is good enough to find $i.
4303
4304         my $i = $self->_search_ranges($start);
4305         if (! defined $i) {
4306             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4307             return;
4308         }
4309
4310         # The search function returns $i such that:
4311         #
4312         # r[$i-1]->end < $start <= r[$i]->end
4313         #
4314         # That means that $i points to the first range in the range list
4315         # that could possibly be affected by this operation.  We still don't
4316         # know if the start of the input range is within r[$i], or if it
4317         # points to empty space between r[$i-1] and r[$i].
4318         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4319
4320         # Special case the insertion of data that is not to replace any
4321         # existing data.
4322         if ($replace == $NO) {  # If $NO, has to be operation '+'
4323             #local $to_trace = 1 if main::DEBUG;
4324             trace "Doesn't replace" if main::DEBUG && $to_trace;
4325
4326             # Here, the new range is to take effect only on those code points
4327             # that aren't already in an existing range.  This can be done by
4328             # looking through the existing range list and finding the gaps in
4329             # the ranges that this new range affects, and then calling this
4330             # function recursively on each of those gaps, leaving untouched
4331             # anything already in the list.  Gather up a list of the changed
4332             # gaps first so that changes to the internal state as new ranges
4333             # are added won't be a problem.
4334             my @gap_list;
4335
4336             # First, if the starting point of the input range is outside an
4337             # existing one, there is a gap from there to the beginning of the
4338             # existing range -- add a span to fill the part that this new
4339             # range occupies
4340             if ($start < $r->[$i]->start) {
4341                 push @gap_list, Range->new($start,
4342                                            main::min($end,
4343                                                      $r->[$i]->start - 1),
4344                                            Type => $type);
4345                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4346             }
4347
4348             # Then look through the range list for other gaps until we reach
4349             # the highest range affected by the input one.
4350             my $j;
4351             for ($j = $i+1; $j < $range_list_size; $j++) {
4352                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4353                 last if $end < $r->[$j]->start;
4354
4355                 # If there is a gap between when this range starts and the
4356                 # previous one ends, add a span to fill it.  Note that just
4357                 # because there are two ranges doesn't mean there is a
4358                 # non-zero gap between them.  It could be that they have
4359                 # different values or types
4360                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4361                     push @gap_list,
4362                         Range->new($r->[$j-1]->end + 1,
4363                                    $r->[$j]->start - 1,
4364                                    Type => $type);
4365                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4366                 }
4367             }
4368
4369             # Here, we have either found an existing range in the range list,
4370             # beyond the area affected by the input one, or we fell off the
4371             # end of the loop because the input range affects the whole rest
4372             # of the range list.  In either case, $j is 1 higher than the
4373             # highest affected range.  If $j == $i, it means that there are no
4374             # affected ranges, that the entire insertion is in the gap between
4375             # r[$i-1], and r[$i], which we already have taken care of before
4376             # the loop.
4377             # On the other hand, if there are affected ranges, it might be
4378             # that there is a gap that needs filling after the final such
4379             # range to the end of the input range
4380             if ($r->[$j-1]->end < $end) {
4381                     push @gap_list, Range->new(main::max($start,
4382                                                          $r->[$j-1]->end + 1),
4383                                                $end,
4384                                                Type => $type);
4385                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4386             }
4387
4388             # Call recursively to fill in all the gaps.
4389             foreach my $gap (@gap_list) {
4390                 $self->_add_delete($operation,
4391                                    $gap->start,
4392                                    $gap->end,
4393                                    $value,
4394                                    Type => $type);
4395             }
4396
4397             return;
4398         }
4399
4400         # Here, we have taken care of the case where $replace is $NO.
4401         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4402         # If inserting a multiple record, this is where it goes, before the
4403         # first (if any) existing one if inserting LIFO.  (If this is to go
4404         # afterwards, FIFO, we below move the pointer to there.)  These imply
4405         # an insertion, and no change to any existing ranges.  Note that $i
4406         # can be -1 if this new range doesn't actually duplicate any existing,
4407         # and comes at the beginning of the list.
4408         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4409
4410             if ($start != $end) {
4411                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4412                 return;
4413             }
4414
4415             # If the new code point is within a current range ...
4416             if ($end >= $r->[$i]->start) {
4417
4418                 # Don't add an exact duplicate, as it isn't really a multiple
4419                 my $existing_value = $r->[$i]->value;
4420                 my $existing_type = $r->[$i]->type;
4421                 return if $value eq $existing_value && $type eq $existing_type;
4422
4423                 # If the multiple value is part of an existing range, we want
4424                 # to split up that range, so that only the single code point
4425                 # is affected.  To do this, we first call ourselves
4426                 # recursively to delete that code point from the table, having
4427                 # preserved its current data above.  Then we call ourselves
4428                 # recursively again to add the new multiple, which we know by
4429                 # the test just above is different than the current code
4430                 # point's value, so it will become a range containing a single
4431                 # code point: just itself.  Finally, we add back in the
4432                 # pre-existing code point, which will again be a single code
4433                 # point range.  Because 'i' likely will have changed as a
4434                 # result of these operations, we can't just continue on, but
4435                 # do this operation recursively as well.  If we are inserting
4436                 # LIFO, the pre-existing code point needs to go after the new
4437                 # one, so use MULTIPLE_AFTER; and vice versa.
4438                 if ($r->[$i]->start != $r->[$i]->end) {
4439                     $self->_add_delete('-', $start, $end, "");
4440                     $self->_add_delete('+', $start, $end, $value, Type => $type);
4441                     return $self->_add_delete('+',
4442                             $start, $end,
4443                             $existing_value,
4444                             Type => $existing_type,
4445                             Replace => ($replace == $MULTIPLE_BEFORE)
4446                                        ? $MULTIPLE_AFTER
4447                                        : $MULTIPLE_BEFORE);
4448                 }
4449             }
4450
4451             # If to place this new record after, move to beyond all existing
4452             # ones; but don't add this one if identical to any of them, as it
4453             # isn't really a multiple.  This leaves the original order, so
4454             # that the current request is ignored.  The reasoning is that the
4455             # previous request that wanted this record to have high priority
4456             # should have precedence.
4457             if ($replace == $MULTIPLE_AFTER) {
4458                 while ($i < @$r && $r->[$i]->start == $start) {
4459                     return if $value eq $r->[$i]->value
4460                               && $type eq $r->[$i]->type;
4461                     $i++;
4462                 }
4463             }
4464             else {
4465                 # If instead we are to place this new record before any
4466                 # existing ones, remove any identical ones that come after it.
4467                 # This changes the existing order so that the new one is
4468                 # first, as is being requested.
4469                 for (my $j = $i + 1;
4470                      $j < @$r && $r->[$j]->start == $start;
4471                      $j++)
4472                 {
4473                     if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4474                         splice @$r, $j, 1;
4475                         last;   # There should only be one instance, so no
4476                                 # need to keep looking
4477                     }
4478                 }
4479             }
4480
4481             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4482             my @return = splice @$r,
4483                                 $i,
4484                                 0,
4485                                 Range->new($start,
4486                                            $end,
4487                                            Value => $value,
4488                                            Type => $type);
4489             if (main::DEBUG && $to_trace) {
4490                 trace "After splice:";
4491                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4492                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4493                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4494                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4495                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4496                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4497             }
4498             return @return;
4499         }
4500
4501         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4502         # leaves delete, insert, and replace either unconditionally or if not
4503         # equivalent.  $i still points to the first potential affected range.
4504         # Now find the highest range affected, which will determine the length
4505         # parameter to splice.  (The input range can span multiple existing
4506         # ones.)  If this isn't a deletion, while we are looking through the
4507         # range list, see also if this is a replacement rather than a clean
4508         # insertion; that is if it will change the values of at least one
4509         # existing range.  Start off assuming it is an insert, until find it
4510         # isn't.
4511         my $clean_insert = $operation eq '+';
4512         my $j;        # This will point to the highest affected range
4513
4514         # For non-zero types, the standard form is the value itself;
4515         my $standard_form = ($type) ? $value : main::standardize($value);
4516
4517         for ($j = $i; $j < $range_list_size; $j++) {
4518             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4519
4520             # If find a range that it doesn't overlap into, we can stop
4521             # searching
4522             last if $end < $r->[$j]->start;
4523
4524             # Here, overlaps the range at $j.  If the values don't match,
4525             # and so far we think this is a clean insertion, it becomes a
4526             # non-clean insertion, i.e., a 'change' or 'replace' instead.
4527             if ($clean_insert) {
4528                 if ($r->[$j]->standard_form ne $standard_form) {
4529                     $clean_insert = 0;
4530                     if ($replace == $CROAK) {
4531                         main::croak("The range to add "
4532                         . sprintf("%04X", $start)
4533                         . '-'
4534                         . sprintf("%04X", $end)
4535                         . " with value '$value' overlaps an existing range $r->[$j]");
4536                     }
4537                 }
4538                 else {
4539
4540                     # Here, the two values are essentially the same.  If the
4541                     # two are actually identical, replacing wouldn't change
4542                     # anything so skip it.
4543                     my $pre_existing = $r->[$j]->value;
4544                     if ($pre_existing ne $value) {
4545
4546                         # Here the new and old standardized values are the
4547                         # same, but the non-standardized values aren't.  If
4548                         # replacing unconditionally, then replace
4549                         if( $replace == $UNCONDITIONALLY) {
4550                             $clean_insert = 0;
4551                         }
4552                         else {
4553
4554                             # Here, are replacing conditionally.  Decide to
4555                             # replace or not based on which appears to look
4556                             # the "nicest".  If one is mixed case and the
4557                             # other isn't, choose the mixed case one.
4558                             my $new_mixed = $value =~ /[A-Z]/
4559                                             && $value =~ /[a-z]/;
4560                             my $old_mixed = $pre_existing =~ /[A-Z]/
4561                                             && $pre_existing =~ /[a-z]/;
4562
4563                             if ($old_mixed != $new_mixed) {
4564                                 $clean_insert = 0 if $new_mixed;
4565                                 if (main::DEBUG && $to_trace) {
4566                                     if ($clean_insert) {
4567                                         trace "Retaining $pre_existing over $value";
4568                                     }
4569                                     else {
4570                                         trace "Replacing $pre_existing with $value";
4571                                     }
4572                                 }
4573                             }
4574                             else {
4575
4576                                 # Here casing wasn't different between the two.
4577                                 # If one has hyphens or underscores and the
4578                                 # other doesn't, choose the one with the
4579                                 # punctuation.
4580                                 my $new_punct = $value =~ /[-_]/;
4581                                 my $old_punct = $pre_existing =~ /[-_]/;
4582
4583                                 if ($old_punct != $new_punct) {
4584                                     $clean_insert = 0 if $new_punct;
4585                                     if (main::DEBUG && $to_trace) {
4586                                         if ($clean_insert) {
4587                                             trace "Retaining $pre_existing over $value";
4588                                         }
4589                                         else {
4590                                             trace "Replacing $pre_existing with $value";
4591                                         }
4592                                     }
4593                                 }   # else existing one is just as "good";
4594                                     # retain it to save cycles.
4595                             }
4596                         }
4597                     }
4598                 }
4599             }
4600         } # End of loop looking for highest affected range.
4601
4602         # Here, $j points to one beyond the highest range that this insertion
4603         # affects (hence to beyond the range list if that range is the final
4604         # one in the range list).
4605
4606         # The splice length is all the affected ranges.  Get it before
4607         # subtracting, for efficiency, so we don't have to later add 1.
4608         my $length = $j - $i;
4609
4610         $j--;        # $j now points to the highest affected range.
4611         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4612
4613         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4614         # $j points to the highest affected range.  But it can be < $i or even
4615         # -1.  These happen only if the insertion is entirely in the gap
4616         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4617         # above exited first time through with $end < $r->[$i]->start.  (And
4618         # then we subtracted one from j)  This implies also that $start <
4619         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4620         # $start, so the entire input range is in the gap.
4621         if ($j < $i) {
4622
4623             # Here the entire input range is in the gap before $i.
4624
4625             if (main::DEBUG && $to_trace) {
4626                 if ($i) {
4627                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
4628                 }
4629                 else {
4630                     trace "Entire range is before $r->[$i]";
4631                 }
4632             }
4633             return if $operation ne '+'; # Deletion of a non-existent range is
4634                                          # a no-op
4635         }
4636         else {
4637
4638             # Here part of the input range is not in the gap before $i.  Thus,
4639             # there is at least one affected one, and $j points to the highest
4640             # such one.
4641
4642             # At this point, here is the situation:
4643             # This is not an insertion of a multiple, nor of tentative ($NO)
4644             # data.
4645             #   $i  points to the first element in the current range list that
4646             #            may be affected by this operation.  In fact, we know
4647             #            that the range at $i is affected because we are in
4648             #            the else branch of this 'if'
4649             #   $j  points to the highest affected range.
4650             # In other words,
4651             #   r[$i-1]->end < $start <= r[$i]->end
4652             # And:
4653             #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4654             #
4655             # Also:
4656             #   $clean_insert is a boolean which is set true if and only if
4657             #        this is a "clean insertion", i.e., not a change nor a
4658             #        deletion (multiple was handled above).
4659
4660             # We now have enough information to decide if this call is a no-op
4661             # or not.  It is a no-op if this is an insertion of already
4662             # existing data.  To be so, it must be contained entirely in one
4663             # range.
4664
4665             if (main::DEBUG && $to_trace && $clean_insert
4666                                          && $start >= $r->[$i]->start
4667                                          && $end   <= $r->[$i]->end)
4668             {
4669                     trace "no-op";
4670             }
4671             return if $clean_insert
4672                       && $start >= $r->[$i]->start
4673                       && $end   <= $r->[$i]->end;
4674         }
4675
4676         # Here, we know that some action will have to be taken.  We have
4677         # calculated the offset and length (though adjustments may be needed)
4678         # for the splice.  Now start constructing the replacement list.
4679         my @replacement;
4680         my $splice_start = $i;
4681
4682         my $extends_below;
4683         my $extends_above;
4684
4685         # See if should extend any adjacent ranges.
4686         if ($operation eq '-') { # Don't extend deletions
4687             $extends_below = $extends_above = 0;
4688         }
4689         else {  # Here, should extend any adjacent ranges.  See if there are
4690                 # any.
4691             $extends_below = ($i > 0
4692                             # can't extend unless adjacent
4693                             && $r->[$i-1]->end == $start -1
4694                             # can't extend unless are same standard value
4695                             && $r->[$i-1]->standard_form eq $standard_form
4696                             # can't extend unless share type
4697                             && $r->[$i-1]->type == $type);
4698             $extends_above = ($j+1 < $range_list_size
4699                             && $r->[$j+1]->start == $end +1
4700                             && $r->[$j+1]->standard_form eq $standard_form
4701                             && $r->[$j+1]->type == $type);
4702         }
4703         if ($extends_below && $extends_above) { # Adds to both
4704             $splice_start--;     # start replace at element below
4705             $length += 2;        # will replace on both sides
4706             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4707
4708             # The result will fill in any gap, replacing both sides, and
4709             # create one large range.
4710             @replacement = Range->new($r->[$i-1]->start,
4711                                       $r->[$j+1]->end,
4712                                       Value => $value,
4713                                       Type => $type);
4714         }
4715         else {
4716
4717             # Here we know that the result won't just be the conglomeration of
4718             # a new range with both its adjacent neighbors.  But it could
4719             # extend one of them.
4720
4721             if ($extends_below) {
4722
4723                 # Here the new element adds to the one below, but not to the
4724                 # one above.  If inserting, and only to that one range,  can
4725                 # just change its ending to include the new one.
4726                 if ($length == 0 && $clean_insert) {
4727                     $r->[$i-1]->set_end($end);
4728                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4729                     return;
4730                 }
4731                 else {
4732                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4733                     $splice_start--;        # start replace at element below
4734                     $length++;              # will replace the element below
4735                     $start = $r->[$i-1]->start;
4736                 }
4737             }
4738             elsif ($extends_above) {
4739
4740                 # Here the new element adds to the one above, but not below.
4741                 # Mirror the code above
4742                 if ($length == 0 && $clean_insert) {
4743                     $r->[$j+1]->set_start($start);
4744                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4745                     return;
4746                 }
4747                 else {
4748                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4749                     $length++;        # will replace the element above
4750                     $end = $r->[$j+1]->end;
4751                 }
4752             }
4753
4754             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4755
4756             # Finally, here we know there will have to be a splice.
4757             # If the change or delete affects only the highest portion of the
4758             # first affected range, the range will have to be split.  The
4759             # splice will remove the whole range, but will replace it by a new
4760             # range containing just the unaffected part.  So, in this case,
4761             # add to the replacement list just this unaffected portion.
4762             if (! $extends_below
4763                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4764             {
4765                 push @replacement,
4766                     Range->new($r->[$i]->start,
4767                                $start - 1,
4768                                Value => $r->[$i]->value,
4769                                Type => $r->[$i]->type);
4770             }
4771
4772             # In the case of an insert or change, but not a delete, we have to
4773             # put in the new stuff;  this comes next.
4774             if ($operation eq '+') {
4775                 push @replacement, Range->new($start,
4776                                               $end,
4777                                               Value => $value,
4778                                               Type => $type);
4779             }
4780
4781             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4782             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4783
4784             # And finally, if we're changing or deleting only a portion of the
4785             # highest affected range, it must be split, as the lowest one was.
4786             if (! $extends_above
4787                 && $j >= 0  # Remember that j can be -1 if before first
4788                             # current element
4789                 && $end >= $r->[$j]->start
4790                 && $end < $r->[$j]->end)
4791             {
4792                 push @replacement,
4793                     Range->new($end + 1,
4794                                $r->[$j]->end,
4795                                Value => $r->[$j]->value,
4796                                Type => $r->[$j]->type);
4797             }
4798         }
4799
4800         # And do the splice, as calculated above
4801         if (main::DEBUG && $to_trace) {
4802             trace "replacing $length element(s) at $i with ";
4803             foreach my $replacement (@replacement) {
4804                 trace "    $replacement";
4805             }
4806             trace "Before splice:";
4807             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4808             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4809             trace "i  =[", $i, "]", $r->[$i];
4810             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4811             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4812         }
4813
4814         my @return = splice @$r, $splice_start, $length, @replacement;
4815
4816         if (main::DEBUG && $to_trace) {
4817             trace "After splice:";
4818             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4819             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4820             trace "i  =[", $i, "]", $r->[$i];
4821             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4822             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4823             trace "removed ", @return if @return;
4824         }
4825
4826         # An actual deletion could have changed the maximum in the list.
4827         # There was no deletion if the splice didn't return something, but
4828         # otherwise recalculate it.  This is done too rarely to worry about
4829         # performance.
4830         if ($operation eq '-' && @return) {
4831             if (@$r) {
4832                 $max{$addr} = $r->[-1]->end;
4833             }
4834             else {  # Now empty
4835                 $max{$addr} = $max_init;
4836             }
4837         }
4838         return @return;
4839     }
4840
4841     sub reset_each_range {  # reset the iterator for each_range();
4842         my $self = shift;
4843         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4844
4845         no overloading;
4846         undef $each_range_iterator{pack 'J', $self};
4847         return;
4848     }
4849
4850     sub each_range {
4851         # Iterate over each range in a range list.  Results are undefined if
4852         # the range list is changed during the iteration.
4853
4854         my $self = shift;
4855         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4856
4857         my $addr = do { no overloading; pack 'J', $self; };
4858
4859         return if $self->is_empty;
4860
4861         $each_range_iterator{$addr} = -1
4862                                 if ! defined $each_range_iterator{$addr};
4863         $each_range_iterator{$addr}++;
4864         return $ranges{$addr}->[$each_range_iterator{$addr}]
4865                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4866         undef $each_range_iterator{$addr};
4867         return;
4868     }
4869
4870     sub count {        # Returns count of code points in range list
4871         my $self = shift;
4872         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4873
4874         my $addr = do { no overloading; pack 'J', $self; };
4875
4876         my $count = 0;
4877         foreach my $range (@{$ranges{$addr}}) {
4878             $count += $range->end - $range->start + 1;
4879         }
4880         return $count;
4881     }
4882
4883     sub delete_range {    # Delete a range
4884         my $self = shift;
4885         my $start = shift;
4886         my $end = shift;
4887
4888         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4889
4890         return $self->_add_delete('-', $start, $end, "");
4891     }
4892
4893     sub is_empty { # Returns boolean as to if a range list is empty
4894         my $self = shift;
4895         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4896
4897         no overloading;
4898         return scalar @{$ranges{pack 'J', $self}} == 0;
4899     }
4900
4901     sub hash {
4902         # Quickly returns a scalar suitable for separating tables into
4903         # buckets, i.e. it is a hash function of the contents of a table, so
4904         # there are relatively few conflicts.
4905
4906         my $self = shift;
4907         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4908
4909         my $addr = do { no overloading; pack 'J', $self; };
4910
4911         # These are quickly computable.  Return looks like 'min..max;count'
4912         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4913     }
4914 } # End closure for _Range_List_Base
4915
4916 package Range_List;
4917 use parent '-norequire', '_Range_List_Base';
4918
4919 # A Range_List is a range list for match tables; i.e. the range values are
4920 # not significant.  Thus a number of operations can be safely added to it,
4921 # such as inversion, intersection.  Note that union is also an unsafe
4922 # operation when range values are cared about, and that method is in the base
4923 # class, not here.  But things are set up so that that method is callable only
4924 # during initialization.  Only in this derived class, is there an operation
4925 # that combines two tables.  A Range_Map can thus be used to initialize a
4926 # Range_List, and its mappings will be in the list, but are not significant to
4927 # this class.
4928
4929 sub trace { return main::trace(@_); }
4930
4931 { # Closure
4932
4933     use overload
4934         fallback => 0,
4935         '+' => sub { my $self = shift;
4936                     my $other = shift;
4937
4938                     return $self->_union($other)
4939                 },
4940         '+=' => sub { my $self = shift;
4941                     my $other = shift;
4942                     my $reversed = shift;
4943
4944                     if ($reversed) {
4945                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4946                         . ref($other)
4947                         . ' += '
4948                         . ref($self)
4949                         . "'.  undef returned.");
4950                         return;
4951                     }
4952
4953                     return $self->_union($other)
4954                 },
4955         '&' => sub { my $self = shift;
4956                     my $other = shift;
4957
4958                     return $self->_intersect($other, 0);
4959                 },
4960         '&=' => sub { my $self = shift;
4961                     my $other = shift;
4962                     my $reversed = shift;
4963
4964                     if ($reversed) {
4965                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4966                         . ref($other)
4967                         . ' &= '
4968                         . ref($self)
4969                         . "'.  undef returned.");
4970                         return;
4971                     }
4972
4973                     return $self->_intersect($other, 0);
4974                 },
4975         '~' => "_invert",
4976         '-' => "_subtract",
4977     ;
4978
4979     sub _invert {
4980         # Returns a new Range_List that gives all code points not in $self.
4981
4982         my $self = shift;
4983
4984         my $new = Range_List->new;
4985
4986         # Go through each range in the table, finding the gaps between them
4987         my $max = -1;   # Set so no gap before range beginning at 0
4988         for my $range ($self->ranges) {
4989             my $start = $range->start;
4990             my $end   = $range->end;
4991
4992             # If there is a gap before this range, the inverse will contain
4993             # that gap.
4994             if ($start > $max + 1) {
4995                 $new->add_range($max + 1, $start - 1);
4996             }
4997             $max = $end;
4998         }
4999
5000         # And finally, add the gap from the end of the table to the max
5001         # possible code point
5002         if ($max < $MAX_WORKING_CODEPOINT) {
5003             $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
5004         }
5005         return $new;
5006     }
5007
5008     sub _subtract {
5009         # Returns a new Range_List with the argument deleted from it.  The
5010         # argument can be a single code point, a range, or something that has
5011         # a range, with the _range_list() method on it returning them
5012
5013         my $self = shift;
5014         my $other = shift;
5015         my $reversed = shift;
5016         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5017
5018         if ($reversed) {
5019             Carp::my_carp_bug("Bad news.  Can't cope with '"
5020             . ref($other)
5021             . ' - '
5022             . ref($self)
5023             . "'.  undef returned.");
5024             return;
5025         }
5026
5027         my $new = Range_List->new(Initialize => $self);
5028
5029         if (! ref $other) { # Single code point
5030             $new->delete_range($other, $other);
5031         }
5032         elsif ($other->isa('Range')) {
5033             $new->delete_range($other->start, $other->end);
5034         }
5035         elsif ($other->can('_range_list')) {
5036             foreach my $range ($other->_range_list->ranges) {
5037                 $new->delete_range($range->start, $range->end);
5038             }
5039         }
5040         else {
5041             Carp::my_carp_bug("Can't cope with a "
5042                         . ref($other)
5043                         . " argument to '-'.  Subtraction ignored."
5044                         );
5045             return $self;
5046         }
5047
5048         return $new;
5049     }
5050
5051     sub _intersect {
5052         # Returns either a boolean giving whether the two inputs' range lists
5053         # intersect (overlap), or a new Range_List containing the intersection
5054         # of the two lists.  The optional final parameter being true indicates
5055         # to do the check instead of the intersection.
5056
5057         my $a_object = shift;
5058         my $b_object = shift;
5059         my $check_if_overlapping = shift;
5060         $check_if_overlapping = 0 unless defined $check_if_overlapping;
5061         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5062
5063         if (! defined $b_object) {
5064             my $message = "";
5065             $message .= $a_object->_owner_name_of if defined $a_object;
5066             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
5067             return;
5068         }
5069
5070         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
5071         # Thus the intersection could be much more simply be written:
5072         #   return ~(~$a_object + ~$b_object);
5073         # But, this is slower, and when taking the inverse of a large
5074         # range_size_1 table, back when such tables were always stored that
5075         # way, it became prohibitively slow, hence the code was changed to the
5076         # below
5077
5078         if ($b_object->isa('Range')) {
5079             $b_object = Range_List->new(Initialize => $b_object,
5080                                         Owner => $a_object->_owner_name_of);
5081         }
5082         $b_object = $b_object->_range_list if $b_object->can('_range_list');
5083
5084         my @a_ranges = $a_object->ranges;
5085         my @b_ranges = $b_object->ranges;
5086
5087         #local $to_trace = 1 if main::DEBUG;
5088         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
5089
5090         # Start with the first range in each list
5091         my $a_i = 0;
5092         my $range_a = $a_ranges[$a_i];
5093         my $b_i = 0;
5094         my $range_b = $b_ranges[$b_i];
5095
5096         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
5097                                                 if ! $check_if_overlapping;
5098
5099         # If either list is empty, there is no intersection and no overlap
5100         if (! defined $range_a || ! defined $range_b) {
5101             return $check_if_overlapping ? 0 : $new;
5102         }
5103         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5104
5105         # Otherwise, must calculate the intersection/overlap.  Start with the
5106         # very first code point in each list
5107         my $a = $range_a->start;
5108         my $b = $range_b->start;
5109
5110         # Loop through all the ranges of each list; in each iteration, $a and
5111         # $b are the current code points in their respective lists
5112         while (1) {
5113
5114             # If $a and $b are the same code point, ...
5115             if ($a == $b) {
5116
5117                 # it means the lists overlap.  If just checking for overlap
5118                 # know the answer now,
5119                 return 1 if $check_if_overlapping;
5120
5121                 # The intersection includes this code point plus anything else
5122                 # common to both current ranges.
5123                 my $start = $a;
5124                 my $end = main::min($range_a->end, $range_b->end);
5125                 if (! $check_if_overlapping) {
5126                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5127                     $new->add_range($start, $end);
5128                 }
5129
5130                 # Skip ahead to the end of the current intersect
5131                 $a = $b = $end;
5132
5133                 # If the current intersect ends at the end of either range (as
5134                 # it must for at least one of them), the next possible one
5135                 # will be the beginning code point in it's list's next range.
5136                 if ($a == $range_a->end) {
5137                     $range_a = $a_ranges[++$a_i];
5138                     last unless defined $range_a;
5139                     $a = $range_a->start;
5140                 }
5141                 if ($b == $range_b->end) {
5142                     $range_b = $b_ranges[++$b_i];
5143                     last unless defined $range_b;
5144                     $b = $range_b->start;
5145                 }
5146
5147                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5148             }
5149             elsif ($a < $b) {
5150
5151                 # Not equal, but if the range containing $a encompasses $b,
5152                 # change $a to be the middle of the range where it does equal
5153                 # $b, so the next iteration will get the intersection
5154                 if ($range_a->end >= $b) {
5155                     $a = $b;
5156                 }
5157                 else {
5158
5159                     # Here, the current range containing $a is entirely below
5160                     # $b.  Go try to find a range that could contain $b.
5161                     $a_i = $a_object->_search_ranges($b);
5162
5163                     # If no range found, quit.
5164                     last unless defined $a_i;
5165
5166                     # The search returns $a_i, such that
5167                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5168                     # Set $a to the beginning of this new range, and repeat.
5169                     $range_a = $a_ranges[$a_i];
5170                     $a = $range_a->start;
5171                 }
5172             }
5173             else { # Here, $b < $a.
5174
5175                 # Mirror image code to the leg just above
5176                 if ($range_b->end >= $a) {
5177                     $b = $a;
5178                 }
5179                 else {
5180                     $b_i = $b_object->_search_ranges($a);
5181                     last unless defined $b_i;
5182                     $range_b = $b_ranges[$b_i];
5183                     $b = $range_b->start;
5184                 }
5185             }
5186         } # End of looping through ranges.
5187
5188         # Intersection fully computed, or now know that there is no overlap
5189         return $check_if_overlapping ? 0 : $new;
5190     }
5191
5192     sub overlaps {
5193         # Returns boolean giving whether the two arguments overlap somewhere
5194
5195         my $self = shift;
5196         my $other = shift;
5197         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5198
5199         return $self->_intersect($other, 1);
5200     }
5201
5202     sub add_range {
5203         # Add a range to the list.
5204
5205         my $self = shift;
5206         my $start = shift;
5207         my $end = shift;
5208         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5209
5210         return $self->_add_delete('+', $start, $end, "");
5211     }
5212
5213     sub matches_identically_to {
5214         # Return a boolean as to whether or not two Range_Lists match identical
5215         # sets of code points.
5216
5217         my $self = shift;
5218         my $other = shift;
5219         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5220
5221         # These are ordered in increasing real time to figure out (at least
5222         # until a patch changes that and doesn't change this)
5223         return 0 if $self->max != $other->max;
5224         return 0 if $self->min != $other->min;
5225         return 0 if $self->range_count != $other->range_count;
5226         return 0 if $self->count != $other->count;
5227
5228         # Here they could be identical because all the tests above passed.
5229         # The loop below is somewhat simpler since we know they have the same
5230         # number of elements.  Compare range by range, until reach the end or
5231         # find something that differs.
5232         my @a_ranges = $self->ranges;
5233         my @b_ranges = $other->ranges;
5234         for my $i (0 .. @a_ranges - 1) {
5235             my $a = $a_ranges[$i];
5236             my $b = $b_ranges[$i];
5237             trace "self $a; other $b" if main::DEBUG && $to_trace;
5238             return 0 if ! defined $b
5239                         || $a->start != $b->start
5240                         || $a->end != $b->end;
5241         }
5242         return 1;
5243     }
5244
5245     sub is_code_point_usable {
5246         # This used only for making the test script.  See if the input
5247         # proposed trial code point is one that Perl will handle.  If second
5248         # parameter is 0, it won't select some code points for various
5249         # reasons, noted below.
5250
5251         my $code = shift;
5252         my $try_hard = shift;
5253         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5254
5255         return 0 if $code < 0;                # Never use a negative
5256
5257         # shun null.  I'm (khw) not sure why this was done, but NULL would be
5258         # the character very frequently used.
5259         return $try_hard if $code == 0x0000;
5260
5261         # shun non-character code points.
5262         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5263         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5264
5265         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5266         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5267
5268         return 1;
5269     }
5270
5271     sub get_valid_code_point {
5272         # Return a code point that's part of the range list.  Returns nothing
5273         # if the table is empty or we can't find a suitable code point.  This
5274         # used only for making the test script.
5275
5276         my $self = shift;
5277         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5278
5279         my $addr = do { no overloading; pack 'J', $self; };
5280
5281         # On first pass, don't choose less desirable code points; if no good
5282         # one is found, repeat, allowing a less desirable one to be selected.
5283         for my $try_hard (0, 1) {
5284
5285             # Look through all the ranges for a usable code point.
5286             for my $set (reverse $self->ranges) {
5287
5288                 # Try the edge cases first, starting with the end point of the
5289                 # range.
5290                 my $end = $set->end;
5291                 return $end if is_code_point_usable($end, $try_hard);
5292                 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5293
5294                 # End point didn't, work.  Start at the beginning and try
5295                 # every one until find one that does work.
5296                 for my $trial ($set->start .. $end - 1) {
5297                     return $trial if is_code_point_usable($trial, $try_hard);
5298                 }
5299             }
5300         }
5301         return ();  # If none found, give up.
5302     }
5303
5304     sub get_invalid_code_point {
5305         # Return a code point that's not part of the table.  Returns nothing
5306         # if the table covers all code points or a suitable code point can't
5307         # be found.  This used only for making the test script.
5308
5309         my $self = shift;
5310         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5311
5312         # Just find a valid code point of the inverse, if any.
5313         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5314     }
5315 } # end closure for Range_List
5316
5317 package Range_Map;
5318 use parent '-norequire', '_Range_List_Base';
5319
5320 # A Range_Map is a range list in which the range values (called maps) are
5321 # significant, and hence shouldn't be manipulated by our other code, which
5322 # could be ambiguous or lose things.  For example, in taking the union of two
5323 # lists, which share code points, but which have differing values, which one
5324 # has precedence in the union?
5325 # It turns out that these operations aren't really necessary for map tables,
5326 # and so this class was created to make sure they aren't accidentally
5327 # applied to them.
5328
5329 { # Closure
5330
5331     sub add_map {
5332         # Add a range containing a mapping value to the list
5333
5334         my $self = shift;
5335         # Rest of parameters passed on
5336
5337         return $self->_add_delete('+', @_);
5338     }
5339
5340     sub replace_map {
5341         # Replace a range
5342
5343         my $self = shift;
5344
5345         return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
5346     }
5347
5348     sub add_duplicate {
5349         # Adds entry to a range list which can duplicate an existing entry
5350
5351         my $self = shift;
5352         my $code_point = shift;
5353         my $value = shift;
5354         my %args = @_;
5355         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5356         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5357
5358         return $self->add_map($code_point, $code_point,
5359                                 $value, Replace => $replace);
5360     }
5361 } # End of closure for package Range_Map
5362
5363 package _Base_Table;
5364
5365 # A table is the basic data structure that gets written out into a file for
5366 # use by the Perl core.  This is the abstract base class implementing the
5367 # common elements from the derived ones.  A list of the methods to be
5368 # furnished by an implementing class is just after the constructor.
5369
5370 sub standardize { return main::standardize($_[0]); }
5371 sub trace { return main::trace(@_); }
5372
5373 { # Closure
5374
5375     main::setup_package();
5376
5377     my %range_list;
5378     # Object containing the ranges of the table.
5379     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5380
5381     my %full_name;
5382     # The full table name.
5383     main::set_access('full_name', \%full_name, 'r');
5384
5385     my %name;
5386     # The table name, almost always shorter
5387     main::set_access('name', \%name, 'r');
5388
5389     my %short_name;
5390     # The shortest of all the aliases for this table, with underscores removed
5391     main::set_access('short_name', \%short_name);
5392
5393     my %nominal_short_name_length;
5394     # The length of short_name before removing underscores
5395     main::set_access('nominal_short_name_length',
5396                     \%nominal_short_name_length);
5397
5398     my %complete_name;
5399     # The complete name, including property.
5400     main::set_access('complete_name', \%complete_name, 'r');
5401
5402     my %property;
5403     # Parent property this table is attached to.
5404     main::set_access('property', \%property, 'r');
5405
5406     my %aliases;
5407     # Ordered list of alias objects of the table's name.  The first ones in
5408     # the list are output first in comments
5409     main::set_access('aliases', \%aliases, 'readable_array');
5410
5411     my %comment;
5412     # A comment associated with the table for human readers of the files
5413     main::set_access('comment', \%comment, 's');
5414
5415     my %description;
5416     # A comment giving a short description of the table's meaning for human
5417     # readers of the files.
5418     main::set_access('description', \%description, 'readable_array');
5419
5420     my %note;
5421     # A comment giving a short note about the table for human readers of the
5422     # files.
5423     main::set_access('note', \%note, 'readable_array');
5424
5425     my %fate;
5426     # Enum; there are a number of possibilities for what happens to this
5427     # table: it could be normal, or suppressed, or not for external use.  See
5428     # values at definition for $SUPPRESSED.
5429     main::set_access('fate', \%fate, 'r');
5430
5431     my %find_table_from_alias;
5432     # The parent property passes this pointer to a hash which this class adds
5433     # all its aliases to, so that the parent can quickly take an alias and
5434     # find this table.
5435     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5436
5437     my %locked;
5438     # After this table is made equivalent to another one; we shouldn't go
5439     # changing the contents because that could mean it's no longer equivalent
5440     main::set_access('locked', \%locked, 'r');
5441
5442     my %file_path;
5443     # This gives the final path to the file containing the table.  Each
5444     # directory in the path is an element in the array
5445     main::set_access('file_path', \%file_path, 'readable_array');
5446
5447     my %status;
5448     # What is the table's status, normal, $OBSOLETE, etc.  Enum
5449     main::set_access('status', \%status, 'r');
5450
5451     my %status_info;
5452     # A comment about its being obsolete, or whatever non normal status it has
5453     main::set_access('status_info', \%status_info, 'r');
5454
5455     my %caseless_equivalent;
5456     # The table this is equivalent to under /i matching, if any.
5457     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5458
5459     my %range_size_1;
5460     # Is the table to be output with each range only a single code point?
5461     # This is done to avoid breaking existing code that may have come to rely
5462     # on this behavior in previous versions of this program.)
5463     main::set_access('range_size_1', \%range_size_1, 'r', 's');
5464
5465     my %perl_extension;
5466     # A boolean set iff this table is a Perl extension to the Unicode
5467     # standard.
5468     main::set_access('perl_extension', \%perl_extension, 'r');
5469
5470     my %output_range_counts;
5471     # A boolean set iff this table is to have comments written in the
5472     # output file that contain the number of code points in the range.
5473     # The constructor can override the global flag of the same name.
5474     main::set_access('output_range_counts', \%output_range_counts, 'r');
5475
5476     my %write_as_invlist;
5477     # A boolean set iff the output file for this table is to be in the form of
5478     # an inversion list/map.
5479     main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5480
5481     my %format;
5482     # The format of the entries of the table.  This is calculated from the
5483     # data in the table (or passed in the constructor).  This is an enum e.g.,
5484     # $STRING_FORMAT.  It is marked protected as it should not be generally
5485     # used to override calculations.
5486     main::set_access('format', \%format, 'r', 'p_s');
5487
5488     my %has_dependency;
5489     # A boolean that gives whether some other table in this property is
5490     # defined as the complement of this table.  This is a crude, but currently
5491     # sufficient, mechanism to make this table not get destroyed before what
5492     # is dependent on it is.  Other dependencies could be added, so the name
5493     # was chosen to reflect a more general situation than actually is
5494     # currently the case.
5495     main::set_access('has_dependency', \%has_dependency, 'r', 's');
5496
5497     sub new {
5498         # All arguments are key => value pairs, which you can see below, most
5499         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5500         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5501         # documented in the Alias package
5502
5503         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5504
5505         my $class = shift;
5506
5507         my $self = bless \do { my $anonymous_scalar }, $class;
5508         my $addr = do { no overloading; pack 'J', $self; };
5509
5510         my %args = @_;
5511
5512         $name{$addr} = delete $args{'Name'};
5513         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5514         $full_name{$addr} = delete $args{'Full_Name'};
5515         my $complete_name = $complete_name{$addr}
5516                           = delete $args{'Complete_Name'};
5517         $format{$addr} = delete $args{'Format'};
5518         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5519         $property{$addr} = delete $args{'_Property'};
5520         $range_list{$addr} = delete $args{'_Range_List'};
5521         $status{$addr} = delete $args{'Status'} || $NORMAL;
5522         $status_info{$addr} = delete $args{'_Status_Info'} || "";
5523         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5524         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5525         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5526         $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5527         my $ucd = delete $args{'UCD'};
5528
5529         my $description = delete $args{'Description'};
5530         my $ok_as_filename = delete $args{'OK_as_Filename'};
5531         my $loose_match = delete $args{'Fuzzy'};
5532         my $note = delete $args{'Note'};
5533         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5534         my $perl_extension = delete $args{'Perl_Extension'};
5535         my $suppression_reason = delete $args{'Suppression_Reason'};
5536
5537         # Shouldn't have any left over
5538         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5539
5540         # Can't use || above because conceivably the name could be 0, and
5541         # can't use // operator in case this program gets used in Perl 5.8
5542         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5543         $output_range_counts{$addr} = $output_range_counts if
5544                                         ! defined $output_range_counts{$addr};
5545
5546         $aliases{$addr} = [ ];
5547         $comment{$addr} = [ ];
5548         $description{$addr} = [ ];
5549         $note{$addr} = [ ];
5550         $file_path{$addr} = [ ];
5551         $locked{$addr} = "";
5552         $has_dependency{$addr} = 0;
5553
5554         push @{$description{$addr}}, $description if $description;
5555         push @{$note{$addr}}, $note if $note;
5556
5557         if ($fate{$addr} == $PLACEHOLDER) {
5558
5559             # A placeholder table doesn't get documented, is a perl extension,
5560             # and quite likely will be empty
5561             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5562             $perl_extension = 1 if ! defined $perl_extension;
5563             $ucd = 0 if ! defined $ucd;
5564             push @tables_that_may_be_empty, $complete_name{$addr};
5565             $self->add_comment(<<END);
5566 This is a placeholder because it is not in Version $string_version of Unicode,
5567 but is needed by the Perl core to work gracefully.  Because it is not in this
5568 version of Unicode, it will not be listed in $pod_file.pod
5569 END
5570         }
5571         elsif (exists $why_suppressed{$complete_name}
5572                 # Don't suppress if overridden
5573                 && ! grep { $_ eq $complete_name{$addr} }
5574                                                     @output_mapped_properties)
5575         {
5576             $fate{$addr} = $SUPPRESSED;
5577         }
5578         elsif ($fate{$addr} == $SUPPRESSED) {
5579             Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5580             # Though currently unused
5581         }
5582         elsif ($suppression_reason) {
5583             Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5584         }
5585
5586         # If hasn't set its status already, see if it is on one of the
5587         # lists of properties or tables that have particular statuses; if
5588         # not, is normal.  The lists are prioritized so the most serious
5589         # ones are checked first
5590         if (! $status{$addr}) {
5591             if (exists $why_deprecated{$complete_name}) {
5592                 $status{$addr} = $DEPRECATED;
5593             }
5594             elsif (exists $why_stabilized{$complete_name}) {
5595                 $status{$addr} = $STABILIZED;
5596             }
5597             elsif (exists $why_obsolete{$complete_name}) {
5598                 $status{$addr} = $OBSOLETE;
5599             }
5600
5601             # Existence above doesn't necessarily mean there is a message
5602             # associated with it.  Use the most serious message.
5603             if ($status{$addr}) {
5604                 if ($why_deprecated{$complete_name}) {
5605                     $status_info{$addr}
5606                                 = $why_deprecated{$complete_name};
5607                 }
5608                 elsif ($why_stabilized{$complete_name}) {
5609                     $status_info{$addr}
5610                                 = $why_stabilized{$complete_name};
5611                 }
5612                 elsif ($why_obsolete{$complete_name}) {
5613                     $status_info{$addr}
5614                                 = $why_obsolete{$complete_name};
5615                 }
5616             }
5617         }
5618
5619         $perl_extension{$addr} = $perl_extension || 0;
5620
5621         # Don't list a property by default that is internal only
5622         if ($fate{$addr} > $MAP_PROXIED) {
5623             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5624             $ucd = 0 if ! defined $ucd;
5625         }
5626         else {
5627             $ucd = 1 if ! defined $ucd;
5628         }
5629
5630         # By convention what typically gets printed only or first is what's
5631         # first in the list, so put the full name there for good output
5632         # clarity.  Other routines rely on the full name being first on the
5633         # list
5634         $self->add_alias($full_name{$addr},
5635                             OK_as_Filename => $ok_as_filename,
5636                             Fuzzy => $loose_match,
5637                             Re_Pod_Entry => $make_re_pod_entry,
5638                             Status => $status{$addr},
5639                             UCD => $ucd,
5640                             );
5641
5642         # Then comes the other name, if meaningfully different.
5643         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5644             $self->add_alias($name{$addr},
5645                             OK_as_Filename => $ok_as_filename,
5646                             Fuzzy => $loose_match,
5647                             Re_Pod_Entry => $make_re_pod_entry,
5648                             Status => $status{$addr},
5649                             UCD => $ucd,
5650                             );
5651         }
5652
5653         return $self;
5654     }
5655
5656     # Here are the methods that are required to be defined by any derived
5657     # class
5658     for my $sub (qw(
5659                     handle_special_range
5660                     append_to_body
5661                     pre_body
5662                 ))
5663                 # write() knows how to write out normal ranges, but it calls
5664                 # handle_special_range() when it encounters a non-normal one.
5665                 # append_to_body() is called by it after it has handled all
5666                 # ranges to add anything after the main portion of the table.
5667                 # And finally, pre_body() is called after all this to build up
5668                 # anything that should appear before the main portion of the
5669                 # table.  Doing it this way allows things in the middle to
5670                 # affect what should appear before the main portion of the
5671                 # table.
5672     {
5673         no strict "refs";
5674         *$sub = sub {
5675             Carp::my_carp_bug( __LINE__
5676                               . ": Must create method '$sub()' for "
5677                               . ref shift);
5678             return;
5679         }
5680     }
5681
5682     use overload
5683         fallback => 0,
5684         "." => \&main::_operator_dot,
5685         ".=" => \&main::_operator_dot_equal,
5686         '!=' => \&main::_operator_not_equal,
5687         '==' => \&main::_operator_equal,
5688     ;
5689
5690     sub ranges {
5691         # Returns the array of ranges associated with this table.
5692
5693         no overloading;
5694         return $range_list{pack 'J', shift}->ranges;
5695     }
5696
5697     sub add_alias {
5698         # Add a synonym for this table.
5699
5700         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5701
5702         my $self = shift;
5703         my $name = shift;       # The name to add.
5704         my $pointer = shift;    # What the alias hash should point to.  For
5705                                 # map tables, this is the parent property;
5706                                 # for match tables, it is the table itself.
5707
5708         my %args = @_;
5709         my $loose_match = delete $args{'Fuzzy'};
5710
5711         my $ok_as_filename = delete $args{'OK_as_Filename'};
5712         $ok_as_filename = 1 unless defined $ok_as_filename;
5713
5714         # An internal name does not get documented, unless overridden by the
5715         # input; same for making tests for it.
5716         my $status = delete $args{'Status'} || (($name =~ /^_/)
5717                                                 ? $INTERNAL_ALIAS
5718                                                 : $NORMAL);
5719         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5720                                             // (($status ne $INTERNAL_ALIAS)
5721                                                ? (($name =~ /^_/) ? $NO : $YES)
5722                                                : $NO);
5723         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5724
5725         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5726
5727         # Capitalize the first letter of the alias unless it is one of the CJK
5728         # ones which specifically begins with a lower 'k'.  Do this because
5729         # Unicode has varied whether they capitalize first letters or not, and
5730         # have later changed their minds and capitalized them, but not the
5731         # other way around.  So do it always and avoid changes from release to
5732         # release
5733         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5734
5735         my $addr = do { no overloading; pack 'J', $self; };
5736
5737         # Figure out if should be loosely matched if not already specified.
5738         if (! defined $loose_match) {
5739
5740             # Is a loose_match if isn't null, and doesn't begin with an
5741             # underscore and isn't just a number
5742             if ($name ne ""
5743                 && substr($name, 0, 1) ne '_'
5744                 && $name !~ qr{^[0-9_.+-/]+$})
5745             {
5746                 $loose_match = 1;
5747             }
5748             else {
5749                 $loose_match = 0;
5750             }
5751         }
5752
5753         # If this alias has already been defined, do nothing.
5754         return if defined $find_table_from_alias{$addr}->{$name};
5755
5756         # That includes if it is standardly equivalent to an existing alias,
5757         # in which case, add this name to the list, so won't have to search
5758         # for it again.
5759         my $standard_name = main::standardize($name);
5760         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5761             $find_table_from_alias{$addr}->{$name}
5762                         = $find_table_from_alias{$addr}->{$standard_name};
5763             return;
5764         }
5765
5766         # Set the index hash for this alias for future quick reference.
5767         $find_table_from_alias{$addr}->{$name} = $pointer;
5768         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5769         local $to_trace = 0 if main::DEBUG;
5770         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5771         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5772
5773
5774         # Put the new alias at the end of the list of aliases unless the final
5775         # element begins with an underscore (meaning it is for internal perl
5776         # use) or is all numeric, in which case, put the new one before that
5777         # one.  This floats any all-numeric or underscore-beginning aliases to
5778         # the end.  This is done so that they are listed last in output lists,
5779         # to encourage the user to use a better name (either more descriptive
5780         # or not an internal-only one) instead.  This ordering is relied on
5781         # implicitly elsewhere in this program, like in short_name()
5782         my $list = $aliases{$addr};
5783         my $insert_position = (@$list == 0
5784                                 || (substr($list->[-1]->name, 0, 1) ne '_'
5785                                     && $list->[-1]->name =~ /\D/))
5786                             ? @$list
5787                             : @$list - 1;
5788         splice @$list,
5789                 $insert_position,
5790                 0,
5791                 Alias->new($name, $loose_match, $make_re_pod_entry,
5792                            $ok_as_filename, $status, $ucd);
5793
5794         # This name may be shorter than any existing ones, so clear the cache
5795         # of the shortest, so will have to be recalculated.
5796         no overloading;
5797         undef $short_name{pack 'J', $self};
5798         return;
5799     }
5800
5801     sub short_name {
5802         # Returns a name suitable for use as the base part of a file name.
5803         # That is, shorter wins.  It can return undef if there is no suitable
5804         # name.  The name has all non-essential underscores removed.
5805
5806         # The optional second parameter is a reference to a scalar in which
5807         # this routine will store the length the returned name had before the
5808         # underscores were removed, or undef if the return is undef.
5809
5810         # The shortest name can change if new aliases are added.  So using
5811         # this should be deferred until after all these are added.  The code
5812         # that does that should clear this one's cache.
5813         # Any name with alphabetics is preferred over an all numeric one, even
5814         # if longer.
5815
5816         my $self = shift;
5817         my $nominal_length_ptr = shift;
5818         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5819
5820         my $addr = do { no overloading; pack 'J', $self; };
5821
5822         # For efficiency, don't recalculate, but this means that adding new
5823         # aliases could change what the shortest is, so the code that does
5824         # that needs to undef this.
5825         if (defined $short_name{$addr}) {
5826             if ($nominal_length_ptr) {
5827                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5828             }
5829             return $short_name{$addr};
5830         }
5831
5832         # Look at each alias
5833         my $is_last_resort = 0;
5834         my $deprecated_or_discouraged
5835                                 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5836         foreach my $alias ($self->aliases()) {
5837
5838             # Don't use an alias that isn't ok to use for an external name.
5839             next if ! $alias->ok_as_filename;
5840
5841             my $name = main::Standardize($alias->name);
5842             trace $self, $name if main::DEBUG && $to_trace;
5843
5844             # Take the first one, or any non-deprecated non-discouraged one
5845             # over one that is, or a shorter one that isn't numeric.  This
5846             # relies on numeric aliases always being last in the array
5847             # returned by aliases().  Any alpha one will have precedence.
5848             if (   ! defined $short_name{$addr}
5849                 || (   $is_last_resort
5850                     && $alias->status !~ $deprecated_or_discouraged)
5851                 || ($name =~ /\D/
5852                     && length($name) < length($short_name{$addr})))
5853             {
5854                 # Remove interior underscores.
5855                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5856
5857                 $nominal_short_name_length{$addr} = length $name;
5858                 $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5859             }
5860         }
5861
5862         # If the short name isn't a nice one, perhaps an equivalent table has
5863         # a better one.
5864         if (   $self->can('children')
5865             && (   ! defined $short_name{$addr}
5866                 || $short_name{$addr} eq ""
5867                 || $short_name{$addr} eq "_"))
5868         {
5869             my $return;
5870             foreach my $follower ($self->children) {    # All equivalents
5871                 my $follower_name = $follower->short_name;
5872                 next unless defined $follower_name;
5873
5874                 # Anything (except undefined) is better than underscore or
5875                 # empty
5876                 if (! defined $return || $return eq "_") {
5877                     $return = $follower_name;
5878                     next;
5879                 }
5880
5881                 # If the new follower name isn't "_" and is shorter than the
5882                 # current best one, prefer the new one.
5883                 next if $follower_name eq "_";
5884                 next if length $follower_name > length $return;
5885                 $return = $follower_name;
5886             }
5887             $short_name{$addr} = $return if defined $return;
5888         }
5889
5890         # If no suitable external name return undef
5891         if (! defined $short_name{$addr}) {
5892             $$nominal_length_ptr = undef if $nominal_length_ptr;
5893             return;
5894         }
5895
5896         # Don't allow a null short name.
5897         if ($short_name{$addr} eq "") {
5898             $short_name{$addr} = '_';
5899             $nominal_short_name_length{$addr} = 1;
5900         }
5901
5902         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5903
5904         if ($nominal_length_ptr) {
5905             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5906         }
5907         return $short_name{$addr};
5908     }
5909
5910     sub external_name {
5911         # Returns the external name that this table should be known by.  This
5912         # is usually the short_name, but not if the short_name is undefined,
5913         # in which case the external_name is arbitrarily set to the
5914         # underscore.
5915
5916         my $self = shift;
5917         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5918
5919         my $short = $self->short_name;
5920         return $short if defined $short;
5921
5922         return '_';
5923     }
5924
5925     sub add_description { # Adds the parameter as a short description.
5926
5927         my $self = shift;
5928         my $description = shift;
5929         chomp $description;
5930         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5931
5932         no overloading;
5933         push @{$description{pack 'J', $self}}, $description;
5934
5935         return;
5936     }
5937
5938     sub add_note { # Adds the parameter as a short note.
5939
5940         my $self = shift;
5941         my $note = shift;
5942         chomp $note;
5943         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5944
5945         no overloading;
5946         push @{$note{pack 'J', $self}}, $note;
5947
5948         return;
5949     }
5950
5951     sub add_comment { # Adds the parameter as a comment.
5952
5953         return unless $debugging_build;
5954
5955         my $self = shift;
5956         my $comment = shift;
5957         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5958
5959         chomp $comment;
5960
5961         no overloading;
5962         push @{$comment{pack 'J', $self}}, $comment;
5963
5964         return;
5965     }
5966
5967     sub comment {
5968         # Return the current comment for this table.  If called in list
5969         # context, returns the array of comments.  In scalar, returns a string
5970         # of each element joined together with a period ending each.
5971
5972         my $self = shift;
5973         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5974
5975         my $addr = do { no overloading; pack 'J', $self; };
5976         my @list = @{$comment{$addr}};
5977         return @list if wantarray;
5978         my $return = "";
5979         foreach my $sentence (@list) {
5980             $return .= '.  ' if $return;
5981             $return .= $sentence;
5982             $return =~ s/\.$//;
5983         }
5984         $return .= '.' if $return;
5985         return $return;
5986     }
5987
5988     sub initialize {
5989         # Initialize the table with the argument which is any valid
5990         # initialization for range lists.
5991
5992         my $self = shift;
5993         my $addr = do { no overloading; pack 'J', $self; };
5994         my $initialization = shift;
5995         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5996
5997         # Replace the current range list with a new one of the same exact
5998         # type.
5999         my $class = ref $range_list{$addr};
6000         $range_list{$addr} = $class->new(Owner => $self,
6001                                         Initialize => $initialization);
6002         return;
6003
6004     }
6005
6006     sub header {
6007         # The header that is output for the table in the file it is written
6008         # in.
6009
6010         my $self = shift;
6011         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6012
6013         my $return = "";
6014         $return .= $DEVELOPMENT_ONLY if $compare_versions;
6015         $return .= $HEADER;
6016         return $return;
6017     }
6018
6019     sub merge_single_annotation_line ($$$) {
6020         my ($output, $annotation, $annotation_column) = @_;
6021
6022         # This appends an annotation comment, $annotation, to $output,
6023         # starting in or after column $annotation_column, removing any
6024         # pre-existing comment from $output.
6025
6026         $annotation =~ s/^ \s* \# \  //x;
6027         $output =~ s/ \s* ( \# \N* )? \n //x;
6028         $output = Text::Tabs::expand($output);
6029
6030         my $spaces = $annotation_column - length $output;
6031         $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
6032
6033         $output = sprintf "%s%*s# %s",
6034                             $output,
6035                             $spaces,
6036                             " ",
6037                             $annotation;
6038         return Text::Tabs::unexpand $output;
6039     }
6040
6041     sub write {
6042         # Write a representation of the table to its file.  It calls several
6043         # functions furnished by sub-classes of this abstract base class to
6044         # handle non-normal ranges, to add stuff before the table, and at its
6045         # end.  If the table is to be written so that adjustments are
6046         # required, this does that conversion.
6047
6048         my $self = shift;
6049         my $use_adjustments = shift; # ? output in adjusted format or not
6050         my $suppress_value = shift;  # Optional, if the value associated with
6051                                      # a range equals this one, don't write
6052                                      # the range
6053         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6054
6055         my $addr = do { no overloading; pack 'J', $self; };
6056         my $write_as_invlist = $write_as_invlist{$addr};
6057
6058         # Start with the header
6059         my @HEADER = $self->header;
6060
6061         # Then the comments
6062         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
6063                                                         if $comment{$addr};
6064
6065         # Things discovered processing the main body of the document may
6066         # affect what gets output before it, therefore pre_body() isn't called
6067         # until after all other processing of the table is done.
6068
6069         # The main body looks like a 'here' document.  If there are comments,
6070         # get rid of them when processing it.
6071         my @OUT;
6072         if ($annotate || $output_range_counts) {
6073             # Use the line below in Perls that don't have /r
6074             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
6075             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
6076         } else {
6077             push @OUT, "return <<'END';\n";
6078         }
6079
6080         if ($range_list{$addr}->is_empty) {
6081
6082             # This is a kludge for empty tables to silence a warning in
6083             # utf8.c, which can't really deal with empty tables, but it can
6084             # deal with a table that matches nothing, as the inverse of 'All'
6085             # does.
6086             push @OUT, "!Unicode::UCD::All\n";
6087         }
6088         elsif ($self->name eq 'N'
6089
6090                # To save disk space and table cache space, avoid putting out
6091                # binary N tables, but instead create a file which just inverts
6092                # the Y table.  Since the file will still exist and occupy a
6093                # certain number of blocks, might as well output the whole
6094                # thing if it all will fit in one block.   The number of
6095                # ranges below is an approximate number for that.
6096                && ($self->property->type == $BINARY
6097                    || $self->property->type == $FORCED_BINARY)
6098                # && $self->property->tables == 2  Can't do this because the
6099                #        non-binary properties, like NFDQC aren't specifiable
6100                #        by the notation
6101                && $range_list{$addr}->ranges > 15
6102                && ! $annotate)  # Under --annotate, want to see everything
6103         {
6104             push @OUT, "!Unicode::UCD::" . $self->property->name . "\n";
6105         }
6106         else {
6107             my $range_size_1 = $range_size_1{$addr};
6108
6109             # To make it more readable, use a minimum indentation
6110             my $comment_indent;
6111
6112             # These are used only in $annotate option
6113             my $format;         # e.g. $HEX_ADJUST_FORMAT
6114             my $include_name;   # ? Include the character's name in the
6115                                 # annotation?
6116             my $include_cp;     # ? Include its code point
6117
6118             if (! $annotate) {
6119                 $comment_indent = ($self->isa('Map_Table'))
6120                                   ? 24
6121                                   : ($write_as_invlist)
6122                                     ? 8
6123                                     : 16;
6124             }
6125             else {
6126                 $format = $self->format;
6127
6128                 # The name of the character is output only for tables that
6129                 # don't already include the name in the output.
6130                 my $property = $self->property;
6131                 $include_name =
6132                     !  ($property == $perl_charname
6133                         || $property == main::property_ref('Unicode_1_Name')
6134                         || $property == main::property_ref('Name')
6135                         || $property == main::property_ref('Name_Alias')
6136                        );
6137
6138                 # Don't include the code point in the annotation where all
6139                 # lines are a single code point, so it can be easily found in
6140                 # the first column
6141                 $include_cp = ! $range_size_1;
6142
6143                 if (! $self->isa('Map_Table')) {
6144                     $comment_indent = ($write_as_invlist) ? 8 : 16;
6145                 }
6146                 else {
6147                     $comment_indent = 16;
6148
6149                     # There are just a few short ranges in this table, so no
6150                     # need to include the code point in the annotation.
6151                     $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
6152
6153                     # We're trying to get this to look good, as the whole
6154                     # point is to make human-readable tables.  It is easier to
6155                     # read if almost all the annotation comments begin in the
6156                     # same column.  Map tables have varying width maps, so can
6157                     # create a jagged comment appearance.  This code does a
6158                     # preliminary pass through these tables looking for the
6159                     # maximum width map in each, and causing the comments to
6160                     # begin just to the right of that.  However, if the
6161                     # comments begin too far to the right of most lines, it's
6162                     # hard to line them up horizontally with their real data.
6163                     # Therefore we ignore the longest outliers
6164                     my $ignore_longest_X_percent = 2;  # Discard longest X%
6165
6166                     # Each key in this hash is a width of at least one of the
6167                     # maps in the table.  Its value is how many lines have
6168                     # that width.
6169                     my %widths;
6170
6171                     # We won't space things further left than one tab stop
6172                     # after the rest of the line; initializing it to that
6173                     # number saves some work.
6174                     my $max_map_width = 8;
6175
6176                     # Fill in the %widths hash
6177                     my $total = 0;
6178                     for my $set ($range_list{$addr}->ranges) {
6179                         my $value = $set->value;
6180
6181                         # These range types don't appear in the main table
6182                         next if $set->type == 0
6183                                 && defined $suppress_value
6184                                 && $value eq $suppress_value;
6185                         next if $set->type == $MULTI_CP
6186                                 || $set->type == $NULL;
6187
6188                         # Include 2 spaces before the beginning of the
6189                         # comment
6190                         my $this_width = length($value) + 2;
6191
6192                         # Ranges of the remaining non-zero types usually
6193                         # occupy just one line (maybe occasionally two, but
6194                         # this doesn't have to be dead accurate).  This is
6195                         # because these ranges are like "unassigned code
6196                         # points"
6197                         my $count = ($set->type != 0)
6198                                     ? 1
6199                                     : $set->end - $set->start + 1;
6200                         $widths{$this_width} += $count;
6201                         $total += $count;
6202                         $max_map_width = $this_width
6203                                             if $max_map_width < $this_width;
6204                     }
6205
6206                     # If the widest map gives us less than two tab stops
6207                     # worth, just take it as-is.
6208                     if ($max_map_width > 16) {
6209
6210                         # Otherwise go through %widths until we have included
6211                         # the desired percentage of lines in the whole table.
6212                         my $running_total = 0;
6213                         foreach my $width (sort { $a <=> $b } keys %widths)
6214                         {
6215                             $running_total += $widths{$width};
6216                             use integer;
6217                             if ($running_total * 100 / $total
6218                                             >= 100 - $ignore_longest_X_percent)
6219                             {
6220                                 $max_map_width = $width;
6221                                 last;
6222                             }
6223                         }
6224                     }
6225                     $comment_indent += $max_map_width;
6226                 }
6227             }
6228
6229             # Values for previous time through the loop.  Initialize to
6230             # something that won't be adjacent to the first iteration;
6231             # only $previous_end matters for that.
6232             my $previous_start;
6233             my $previous_end = -2;
6234             my $previous_value;
6235
6236             # Values for next time through the portion of the loop that splits
6237             # the range.  0 in $next_start means there is no remaining portion
6238             # to deal with.
6239             my $next_start = 0;
6240             my $next_end;
6241             my $next_value;
6242             my $offset = 0;
6243             my $invlist_count = 0;
6244
6245             my $output_value_in_hex = $self->isa('Map_Table')
6246                                 && ($self->format eq $HEX_ADJUST_FORMAT
6247                                     || $self->to_output_map == $EXTERNAL_MAP);
6248             # Use leading zeroes just for files whose format should not be
6249             # changed from what it has been.  Otherwise, they just take up
6250             # space and time to process.
6251             my $hex_format = ($self->isa('Map_Table')
6252                               && $self->to_output_map == $EXTERNAL_MAP)
6253                              ? "%04X"
6254                              : "%X";
6255
6256             # The values for some of these tables are stored in mktables as
6257             # hex strings.  Normally, these are just output as strings without
6258             # change, but when we are doing adjustments, we have to operate on
6259             # these numerically, so we convert those to decimal to do that,
6260             # and back to hex for output
6261             my $convert_map_to_from_hex = 0;
6262             my $output_map_in_hex = 0;
6263             if ($self->isa('Map_Table')) {
6264                 $convert_map_to_from_hex
6265                    = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6266                       || ($annotate && $self->format eq $HEX_FORMAT);
6267                 $output_map_in_hex = $convert_map_to_from_hex
6268                                  || $self->format eq $HEX_FORMAT;
6269             }
6270
6271             # To store any annotations about the characters.
6272             my @annotation;
6273
6274             # Output each range as part of the here document.
6275             RANGE:
6276             for my $set ($range_list{$addr}->ranges) {
6277                 if ($set->type != 0) {
6278                     $self->handle_special_range($set);
6279                     next RANGE;
6280                 }
6281                 my $start = $set->start;
6282                 my $end   = $set->end;
6283                 my $value  = $set->value;
6284
6285                 # Don't output ranges whose value is the one to suppress
6286                 next RANGE if defined $suppress_value
6287                               && $value eq $suppress_value;
6288
6289                 $value = CORE::hex $value if $convert_map_to_from_hex;
6290
6291
6292                 {   # This bare block encloses the scope where we may need to
6293                     # 'redo' to.  Consider a table that is to be written out
6294                     # using single item ranges.  This is given in the
6295                     # $range_size_1 boolean.  To accomplish this, we split the
6296                     # range each time through the loop into two portions, the
6297                     # first item, and the rest.  We handle that first item
6298                     # this time in the loop, and 'redo' to repeat the process
6299                     # for the rest of the range.
6300                     #
6301                     # We may also have to do it, with other special handling,
6302                     # if the table has adjustments.  Consider the table that
6303                     # contains the lowercasing maps.  mktables stores the
6304                     # ASCII range ones as 26 ranges:
6305                     #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6306                     # For compactness, the table that gets written has this as
6307                     # just one range
6308                     #       ( ord('A') .. ord('Z') ) => ord('a')
6309                     # and the software that reads the tables is smart enough
6310                     # to "connect the dots".  This change is accomplished in
6311                     # this loop by looking to see if the current iteration
6312                     # fits the paradigm of the previous iteration, and if so,
6313                     # we merge them by replacing the final output item with
6314                     # the merged data.  Repeated 25 times, this gets A-Z.  But
6315                     # we also have to make sure we don't screw up cases where
6316                     # we have internally stored
6317                     #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6318                     # This single internal range has to be output as 3 ranges,
6319                     # which is done by splitting, like we do for $range_size_1
6320                     # tables.  (There are very few of such ranges that need to
6321                     # be split, so the gain of doing the combining of other
6322                     # ranges far outweighs the splitting of these.)  The
6323                     # values to use for the redo at the end of this block are
6324                     # set up just below in the scalars whose names begin with
6325                     # '$next_'.
6326
6327                     if (($use_adjustments || $range_size_1) && $end != $start)
6328                     {
6329                         $next_start = $start + 1;
6330                         $next_end = $end;
6331                         $next_value = $value;
6332                         $end = $start;
6333                     }
6334
6335                     if ($use_adjustments && ! $range_size_1) {
6336
6337                         # If this range is adjacent to the previous one, and
6338                         # the values in each are integers that are also
6339                         # adjacent (differ by 1), then this range really
6340                         # extends the previous one that is already in element
6341                         # $OUT[-1].  So we pop that element, and pretend that
6342                         # the range starts with whatever it started with.
6343                         # $offset is incremented by 1 each time so that it
6344                         # gives the current offset from the first element in
6345                         # the accumulating range, and we keep in $value the
6346                         # value of that first element.
6347                         if ($start == $previous_end + 1
6348                             && $value =~ /^ -? \d+ $/xa
6349                             && $previous_value =~ /^ -? \d+ $/xa
6350                             && ($value == ($previous_value + ++$offset)))
6351                         {
6352                             pop @OUT;
6353                             $start = $previous_start;
6354                             $value = $previous_value;
6355                         }
6356                         else {
6357                             $offset = 0;
6358                             if (@annotation == 1) {
6359                                 $OUT[-1] = merge_single_annotation_line(
6360                                     $OUT[-1], $annotation[0], $comment_indent);
6361                             }
6362                             else {
6363                                 push @OUT, @annotation;
6364                             }
6365                         }
6366                         undef @annotation;
6367
6368                         # Save the current values for the next time through
6369                         # the loop.
6370                         $previous_start = $start;
6371                         $previous_end = $end;
6372                         $previous_value = $value;
6373                     }
6374
6375                     if ($write_as_invlist) {
6376                         if (   $previous_end > 0
6377                             && $output_range_counts{$addr})
6378                         {
6379                             my $complement_count = $start - $previous_end - 1;
6380                             if ($complement_count > 1) {
6381                                 $OUT[-1] = merge_single_annotation_line(
6382                                     $OUT[-1],
6383                                        "#"
6384                                      . (" " x 17)
6385                                      . "["
6386                                      .  main::clarify_code_point_count(
6387                                                             $complement_count)
6388                                       . "] in complement\n",
6389                                     $comment_indent);
6390                             }
6391                         }
6392
6393                         # Inversion list format has a single number per line,
6394                         # the starting code point of a range that matches the
6395                         # property
6396                         push @OUT, $start, "\n";
6397                         $invlist_count++;
6398
6399                         # Add a comment with the size of the range, if
6400                         # requested.
6401                         if ($output_range_counts{$addr}) {
6402                             $OUT[-1] = merge_single_annotation_line(
6403                                     $OUT[-1],
6404                                     "# ["
6405                                       . main::clarify_code_point_count($end - $start + 1)
6406                                       . "]\n",
6407                                     $comment_indent);
6408                         }
6409                     }
6410                     elsif ($start != $end) { # If there is a range
6411                         if ($end == $MAX_WORKING_CODEPOINT) {
6412                             push @OUT, sprintf "$hex_format\t$hex_format",
6413                                                 $start,
6414                                                 $MAX_PLATFORM_CODEPOINT;
6415                         }
6416                         else {
6417                             push @OUT, sprintf "$hex_format\t$hex_format",
6418                                                 $start,       $end;
6419                         }
6420                         if (length $value) {
6421                             if ($convert_map_to_from_hex) {
6422                                 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6423                             }
6424                             else {
6425                                 $OUT[-1] .= "\t$value\n";
6426                             }
6427                         }
6428
6429                         # Add a comment with the size of the range, if
6430                         # requested.
6431                         if ($output_range_counts{$addr}) {
6432                             $OUT[-1] = merge_single_annotation_line(
6433                                     $OUT[-1],
6434                                     "# ["
6435                                       . main::clarify_code_point_count($end - $start + 1)
6436                                       . "]\n",
6437                                     $comment_indent);
6438                         }
6439                     }
6440                     else { # Here to output a single code point per line.
6441
6442                         # Use any passed in subroutine to output.
6443                         if (ref $range_size_1 eq 'CODE') {
6444                             for my $i ($start .. $end) {
6445                                 push @OUT, &{$range_size_1}($i, $value);
6446                             }
6447                         }
6448                         else {
6449
6450                             # Here, caller is ok with default output.
6451                             for (my $i = $start; $i <= $end; $i++) {
6452                                 if ($convert_map_to_from_hex) {
6453                                     push @OUT,
6454                                         sprintf "$hex_format\t\t$hex_format\n",
6455                                                  $i,            $value;
6456                                 }
6457                                 else {
6458                                     push @OUT, sprintf $hex_format, $i;
6459                                     $OUT[-1] .= "\t\t$value" if $value ne "";
6460                                     $OUT[-1] .= "\n";
6461                                 }
6462                             }
6463                         }
6464                     }
6465
6466                     if ($annotate) {
6467                         for (my $i = $start; $i <= $end; $i++) {
6468                             my $annotation = "";
6469
6470                             # Get character information if don't have it already
6471                             main::populate_char_info($i)
6472                                                      if ! defined $viacode[$i];
6473                             my $type = $annotate_char_type[$i];
6474
6475                             # Figure out if should output the next code points
6476                             # as part of a range or not.  If this is not in an
6477                             # annotation range, then won't output as a range,
6478                             # so returns $i.  Otherwise use the end of the
6479                             # annotation range, but no further than the
6480                             # maximum possible end point of the loop.
6481                             my $range_end =
6482                                         $range_size_1
6483                                         ? $start
6484                                         : main::min(
6485                                           $annotate_ranges->value_of($i) || $i,
6486                                           $end);
6487
6488                             # Use a range if it is a range, and either is one
6489                             # of the special annotation ranges, or the range
6490                             # is at most 3 long.  This last case causes the
6491                             # algorithmically named code points to be output
6492                             # individually in spans of at most 3, as they are
6493                             # the ones whose $type is > 0.
6494                             if ($range_end != $i
6495                                 && ( $type < 0 || $range_end - $i > 2))
6496                             {
6497                                 # Here is to output a range.  We don't allow a
6498                                 # caller-specified output format--just use the
6499                                 # standard one.
6500                                 my $range_name = $viacode[$i];
6501
6502                                 # For the code points which end in their hex
6503                                 # value, we eliminate that from the output
6504                                 # annotation, and capitalize only the first
6505                                 # letter of each word.
6506                                 if ($type == $CP_IN_NAME) {
6507                                     my $hex = sprintf $hex_format, $i;
6508                                     $range_name =~ s/-$hex$//;
6509                                     my @words = split " ", $range_name;
6510                                     for my $word (@words) {
6511                                         $word =
6512                                           ucfirst(lc($word)) if $word ne 'CJK';
6513                                     }
6514                                     $range_name = join " ", @words;
6515                                 }
6516                                 elsif ($type == $HANGUL_SYLLABLE) {
6517                                     $range_name = "Hangul Syllable";
6518                                 }
6519
6520                                 # If the annotation would just repeat what's
6521                                 # already being output as the range, skip it.
6522                                 # (When an inversion list is being written, it
6523                                 # isn't a repeat, as that always is in
6524                                 # decimal)
6525                                 if (   $write_as_invlist
6526                                     || $i != $start
6527                                     || $range_end < $end)
6528                                 {
6529                                     if ($range_end < $MAX_WORKING_CODEPOINT)
6530                                     {
6531                                         $annotation = sprintf "%04X..%04X",
6532                                                               $i,   $range_end;
6533                                     }
6534                                     else {
6535                                         $annotation = sprintf "%04X..INFINITY",
6536                                                                $i;
6537                                     }
6538                                 }
6539                                 else { # Indent if not displaying code points
6540                                     $annotation = " " x 4;
6541                                 }
6542
6543                                 if ($range_name) {
6544                                     $annotation .= " $age[$i]" if $age[$i];
6545                                     $annotation .= " $range_name";
6546                                 }
6547
6548                                 # Include the number of code points in the
6549                                 # range
6550                                 my $count =
6551                                     main::clarify_code_point_count($range_end - $i + 1);
6552                                 $annotation .= " [$count]\n";
6553
6554                                 # Skip to the end of the range
6555                                 $i = $range_end;
6556                             }
6557                             else { # Not in a range.
6558                                 my $comment = "";
6559
6560                                 # When outputting the names of each character,
6561                                 # use the character itself if printable
6562                                 $comment .= "'" . main::display_chr($i) . "' "
6563                                                             if $printable[$i];
6564
6565                                 my $output_value = $value;
6566
6567                                 # Determine the annotation
6568                                 if ($format eq $DECOMP_STRING_FORMAT) {
6569
6570                                     # This is very specialized, with the type
6571                                     # of decomposition beginning the line
6572                                     # enclosed in <...>, and the code points
6573                                     # that the code point decomposes to
6574                                     # separated by blanks.  Create two
6575                                     # strings, one of the printable
6576                                     # characters, and one of their official
6577                                     # names.
6578                                     (my $map = $output_value)
6579                                                     =~ s/ \ * < .*? > \ +//x;
6580                                     my $tostr = "";
6581                                     my $to_name = "";
6582                                     my $to_chr = "";
6583                                     foreach my $to (split " ", $map) {
6584                                         $to = CORE::hex $to;
6585                                         $to_name .= " + " if $to_name;
6586                                         $to_chr .= main::display_chr($to);
6587                                         main::populate_char_info($to)
6588                                                     if ! defined $viacode[$to];
6589                                         $to_name .=  $viacode[$to];
6590                                     }
6591
6592                                     $comment .=
6593                                     "=> '$to_chr'; $viacode[$i] => $to_name";
6594                                 }
6595                                 else {
6596                                     $output_value += $i - $start
6597                                                    if $use_adjustments
6598                                                       # Don't try to adjust a
6599                                                       # non-integer
6600                                                    && $output_value !~ /[-\D]/;
6601
6602                                     if ($output_map_in_hex) {
6603                                         main::populate_char_info($output_value)
6604                                           if ! defined $viacode[$output_value];
6605                                         $comment .= " => '"
6606                                         . main::display_chr($output_value)
6607                                         . "'; " if $printable[$output_value];
6608                                     }
6609                                     if ($include_name && $viacode[$i]) {
6610                                         $comment .= " " if $comment;
6611                                         $comment .= $viacode[$i];
6612                                     }
6613                                     if ($output_map_in_hex) {
6614                                         $comment .=
6615                                                 " => $viacode[$output_value]"
6616                                                     if $viacode[$output_value];
6617                                         $output_value = sprintf($hex_format,
6618                                                                 $output_value);
6619                                     }
6620                                 }
6621
6622                                 if ($include_cp) {
6623                                     $annotation = sprintf "%04X %s", $i, $age[$i];
6624                                     if ($use_adjustments) {
6625                                         $annotation .= " => $output_value";
6626                                     }
6627                                 }
6628
6629                                 if ($comment ne "") {
6630                                     $annotation .= " " if $annotation ne "";
6631                                     $annotation .= $comment;
6632                                 }
6633                                 $annotation .= "\n" if $annotation ne "";
6634                             }
6635
6636                             if ($annotation ne "") {
6637                                 push @annotation, (" " x $comment_indent)
6638                                                   .  "# $annotation";
6639                             }
6640                         }
6641
6642                         # If not adjusting, we don't have to go through the
6643                         # loop again to know that the annotation comes next
6644                         # in the output.
6645                         if (! $use_adjustments) {
6646                             if (@annotation == 1) {
6647                                 $OUT[-1] = merge_single_annotation_line(
6648                                     $OUT[-1], $annotation[0], $comment_indent);
6649                             }
6650                             else {
6651                                 push @OUT, map { Text::Tabs::unexpand $_ }
6652                                                @annotation;
6653                             }
6654                             undef @annotation;
6655                         }
6656                     }
6657
6658                     # Add the beginning of the range that doesn't match the
6659                     # property, except if the just added match range extends
6660                     # to infinity.  We do this after any annotations for the
6661                     # match range.
6662                     if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6663                         push @OUT, $end + 1, "\n";
6664                         $invlist_count++;
6665                     }
6666
6667                     # If we split the range, set up so the next time through
6668                     # we get the remainder, and redo.
6669                     if ($next_start) {
6670                         $start = $next_start;
6671                         $end = $next_end;
6672                         $value = $next_value;
6673                         $next_start = 0;
6674                         redo;
6675                     }
6676                 }
6677             } # End of loop through all the table's ranges
6678
6679             push @OUT, @annotation; # Add orphaned annotation, if any
6680
6681             splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6682         }
6683
6684         # Add anything that goes after the main body, but within the here
6685         # document,
6686         my $append_to_body = $self->append_to_body;
6687         push @OUT, $append_to_body if $append_to_body;
6688
6689         # And finish the here document.
6690         push @OUT, "END\n";
6691
6692         # Done with the main portion of the body.  Can now figure out what
6693         # should appear before it in the file.
6694         my $pre_body = $self->pre_body;
6695         push @HEADER, $pre_body, "\n" if $pre_body;
6696
6697         # All these files should have a .pl suffix added to them.
6698         my @file_with_pl = @{$file_path{$addr}};
6699         $file_with_pl[-1] .= '.pl';
6700
6701         main::write(\@file_with_pl,
6702                     $annotate,      # utf8 iff annotating
6703                     \@HEADER,
6704                     \@OUT);
6705         return;
6706     }
6707
6708     sub set_status {    # Set the table's status
6709         my $self = shift;
6710         my $status = shift; # The status enum value
6711         my $info = shift;   # Any message associated with it.
6712         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6713
6714         my $addr = do { no overloading; pack 'J', $self; };
6715
6716         $status{$addr} = $status;
6717         $status_info{$addr} = $info;
6718         return;
6719     }
6720
6721     sub set_fate {  # Set the fate of a table
6722         my $self = shift;
6723         my $fate = shift;
6724         my $reason = shift;
6725         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6726
6727         my $addr = do { no overloading; pack 'J', $self; };
6728
6729         return if $fate{$addr} == $fate;    # If no-op
6730
6731         # Can only change the ordinary fate, except if going to $MAP_PROXIED
6732         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6733
6734         $fate{$addr} = $fate;
6735
6736         # Don't document anything to do with a non-normal fated table
6737         if ($fate != $ORDINARY) {
6738             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6739             foreach my $alias ($self->aliases) {
6740                 $alias->set_ucd($put_in_pod);
6741
6742                 # MAP_PROXIED doesn't affect the match tables
6743                 next if $fate == $MAP_PROXIED;
6744                 $alias->set_make_re_pod_entry($put_in_pod);
6745             }
6746         }
6747
6748         # Save the reason for suppression for output
6749         if ($fate >= $SUPPRESSED) {
6750             $reason = "" unless defined $reason;
6751             $why_suppressed{$complete_name{$addr}} = $reason;
6752         }
6753
6754         return;
6755     }
6756
6757     sub lock {
6758         # Don't allow changes to the table from now on.  This stores a stack
6759         # trace of where it was called, so that later attempts to modify it
6760         # can immediately show where it got locked.
6761
6762         my $self = shift;
6763         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6764
6765         my $addr = do { no overloading; pack 'J', $self; };
6766
6767         $locked{$addr} = "";
6768
6769         my $line = (caller(0))[2];
6770         my $i = 1;
6771
6772         # Accumulate the stack trace
6773         while (1) {
6774             my ($pkg, $file, $caller_line, $caller) = caller $i++;
6775
6776             last unless defined $caller;
6777
6778             $locked{$addr} .= "    called from $caller() at line $line\n";
6779             $line = $caller_line;
6780         }
6781         $locked{$addr} .= "    called from main at line $line\n";
6782
6783         return;
6784     }
6785
6786     sub carp_if_locked {
6787         # Return whether a table is locked or not, and, by the way, complain
6788         # if is locked
6789
6790         my $self = shift;
6791         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6792
6793         my $addr = do { no overloading; pack 'J', $self; };
6794
6795         return 0 if ! $locked{$addr};
6796         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6797         return 1;
6798     }
6799
6800     sub set_file_path { # Set the final directory path for this table
6801         my $self = shift;
6802         # Rest of parameters passed on
6803
6804         no overloading;
6805         @{$file_path{pack 'J', $self}} = @_;
6806         return
6807     }
6808
6809     # Accessors for the range list stored in this table.  First for
6810     # unconditional
6811     for my $sub (qw(
6812                     containing_range
6813                     contains
6814                     count
6815                     each_range
6816                     hash
6817                     is_empty
6818                     matches_identically_to
6819                     max
6820                     min
6821                     range_count
6822                     reset_each_range
6823                     type_of
6824                     value_of
6825                 ))
6826     {
6827         no strict "refs";
6828         *$sub = sub {
6829             use strict "refs";
6830             my $self = shift;
6831             return $self->_range_list->$sub(@_);
6832         }
6833     }
6834
6835     # Then for ones that should fail if locked
6836     for my $sub (qw(
6837                     delete_range
6838                 ))
6839     {
6840         no strict "refs";
6841         *$sub = sub {
6842             use strict "refs";
6843             my $self = shift;
6844
6845             return if $self->carp_if_locked;
6846             no overloading;
6847             return $self->_range_list->$sub(@_);
6848         }
6849     }
6850
6851 } # End closure
6852
6853 package Map_Table;
6854 use parent '-norequire', '_Base_Table';
6855
6856 # A Map Table is a table that contains the mappings from code points to
6857 # values.  There are two weird cases:
6858 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6859 #    are written in the table's file at the end of the table nonetheless.  It
6860 #    requires specially constructed code to handle these; utf8.c can not read
6861 #    these in, so they should not go in $map_directory.  As of this writing,
6862 #    the only case that these happen is for named sequences used in
6863 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
6864 #    something else could come along that uses it.
6865 # 2) Specials are anything that doesn't fit syntactically into the body of the
6866 #    table.  The ranges for these have a map type of non-zero.  The code below
6867 #    knows about and handles each possible type.   In most cases, these are
6868 #    written as part of the header.
6869 #
6870 # A map table deliberately can't be manipulated at will unlike match tables.
6871 # This is because of the ambiguities having to do with what to do with
6872 # overlapping code points.  And there just isn't a need for those things;
6873 # what one wants to do is just query, add, replace, or delete mappings, plus
6874 # write the final result.
6875 # However, there is a method to get the list of possible ranges that aren't in
6876 # this table to use for defaulting missing code point mappings.  And,
6877 # map_add_or_replace_non_nulls() does allow one to add another table to this
6878 # one, but it is clearly very specialized, and defined that the other's
6879 # non-null values replace this one's if there is any overlap.
6880
6881 sub trace { return main::trace(@_); }
6882
6883 { # Closure
6884
6885     main::setup_package();
6886
6887     my %default_map;
6888     # Many input files omit some entries; this gives what the mapping for the
6889     # missing entries should be
6890     main::set_access('default_map', \%default_map, 'r');
6891
6892     my %anomalous_entries;
6893     # Things that go in the body of the table which don't fit the normal
6894     # scheme of things, like having a range.  Not much can be done with these
6895     # once there except to output them.  This was created to handle named
6896     # sequences.
6897     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6898     main::set_access('anomalous_entries',       # Append singular, read plural
6899                     \%anomalous_entries,
6900                     'readable_array');
6901
6902     my %replacement_property;
6903     # Certain files are unused by Perl itself, and are kept only for backwards
6904     # compatibility for programs that used them before Unicode::UCD existed.
6905     # These are termed legacy properties.  At some point they may be removed,
6906     # but for now mark them as legacy.  If non empty, this is the name of the
6907     # property to use instead (i.e., the modern equivalent).
6908     main::set_access('replacement_property', \%replacement_property, 'r');
6909
6910     my %to_output_map;
6911     # Enum as to whether or not to write out this map table, and how:
6912     #   0               don't output
6913     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6914     #                   it should not be removed nor its format changed.  This
6915     #                   is done for those files that have traditionally been
6916     #                   output.  Maps of legacy-only properties default to
6917     #                   this.
6918     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6919     #                   with this file
6920     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6921     #                   outputting the actual mappings as-is, we adjust things
6922     #                   to create a much more compact table. Only those few
6923     #                   tables where the mapping is convertible at least to an
6924     #                   integer and compacting makes a big difference should
6925     #                   have this.  Hence, the default is to not do this
6926     #                   unless the table's default mapping is to $CODE_POINT,
6927     #                   and the range size is not 1.
6928     main::set_access('to_output_map', \%to_output_map, 's');
6929
6930     sub new {
6931         my $class = shift;
6932         my $name = shift;
6933
6934         my %args = @_;
6935
6936         # Optional initialization data for the table.
6937         my $initialize = delete $args{'Initialize'};
6938
6939         my $default_map = delete $args{'Default_Map'};
6940         my $property = delete $args{'_Property'};
6941         my $full_name = delete $args{'Full_Name'};
6942         my $replacement_property = delete $args{'Replacement_Property'} // "";
6943         my $to_output_map = delete $args{'To_Output_Map'};
6944
6945         # Rest of parameters passed on; legacy properties have several common
6946         # other attributes
6947         if ($replacement_property) {
6948             $args{"Fate"} = $LEGACY_ONLY;
6949             $args{"Range_Size_1"} = 1;
6950             $args{"Perl_Extension"} = 1;
6951             $args{"UCD"} = 0;
6952         }
6953
6954         my $range_list = Range_Map->new(Owner => $property);
6955
6956         my $self = $class->SUPER::new(
6957                                     Name => $name,
6958                                     Complete_Name =>  $full_name,
6959                                     Full_Name => $full_name,
6960                                     _Property => $property,
6961                                     _Range_List => $range_list,
6962                                     Write_As_Invlist => 0,
6963                                     %args);
6964
6965         my $addr = do { no overloading; pack 'J', $self; };
6966
6967         $anomalous_entries{$addr} = [];
6968         $default_map{$addr} = $default_map;
6969         $replacement_property{$addr} = $replacement_property;
6970         $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6971                                           && $replacement_property;
6972         $to_output_map{$addr} = $to_output_map;
6973
6974         $self->initialize($initialize) if defined $initialize;
6975
6976         return $self;
6977     }
6978
6979     use overload
6980         fallback => 0,
6981         qw("") => "_operator_stringify",
6982     ;
6983
6984     sub _operator_stringify {
6985         my $self = shift;
6986
6987         my $name = $self->property->full_name;
6988         $name = '""' if $name eq "";
6989         return "Map table for Property '$name'";
6990     }
6991
6992     sub add_alias {
6993         # Add a synonym for this table (which means the property itself)
6994         my $self = shift;
6995         my $name = shift;
6996         # Rest of parameters passed on.
6997
6998         $self->SUPER::add_alias($name, $self->property, @_);
6999         return;
7000     }
7001
7002     sub add_map {
7003         # Add a range of code points to the list of specially-handled code
7004         # points.  $MULTI_CP is assumed if the type of special is not passed
7005         # in.
7006
7007         my $self = shift;
7008         my $lower = shift;
7009         my $upper = shift;
7010         my $string = shift;
7011         my %args = @_;
7012
7013         my $type = delete $args{'Type'} || 0;
7014         # Rest of parameters passed on
7015
7016         # Can't change the table if locked.
7017         return if $self->carp_if_locked;
7018
7019         my $addr = do { no overloading; pack 'J', $self; };
7020
7021         $self->_range_list->add_map($lower, $upper,
7022                                     $string,
7023                                     @_,
7024                                     Type => $type);
7025         return;
7026     }
7027
7028     sub append_to_body {
7029         # Adds to the written HERE document of the table's body any anomalous
7030         # entries in the table..
7031
7032         my $self = shift;
7033         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7034
7035         my $addr = do { no overloading; pack 'J', $self; };
7036
7037         return "" unless @{$anomalous_entries{$addr}};
7038         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
7039     }
7040
7041     sub map_add_or_replace_non_nulls {
7042         # This adds the mappings in the table $other to $self.  Non-null
7043         # mappings from $other override those in $self.  It essentially merges
7044         # the two tables, with the second having priority except for null
7045         # mappings.
7046
7047         my $self = shift;
7048         my $other = shift;
7049         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7050
7051         return if $self->carp_if_locked;
7052
7053         if (! $other->isa(__PACKAGE__)) {
7054             Carp::my_carp_bug("$other should be a "
7055                         . __PACKAGE__
7056                         . ".  Not a '"
7057                         . ref($other)
7058                         . "'.  Not added;");
7059             return;
7060         }
7061
7062         my $addr = do { no overloading; pack 'J', $self; };
7063         my $other_addr = do { no overloading; pack 'J', $other; };
7064
7065         local $to_trace = 0 if main::DEBUG;
7066
7067         my $self_range_list = $self->_range_list;
7068         my $other_range_list = $other->_range_list;
7069         foreach my $range ($other_range_list->ranges) {
7070             my $value = $range->value;
7071             next if $value eq "";
7072             $self_range_list->_add_delete('+',
7073                                           $range->start,
7074                                           $range->end,
7075                                           $value,
7076                                           Type => $range->type,
7077                                           Replace => $UNCONDITIONALLY);
7078         }
7079
7080         return;
7081     }
7082
7083     sub set_default_map {
7084         # Define what code points that are missing from the input files should
7085         # map to.  The optional second parameter 'full_name' indicates to
7086         # force using the full name of the map instead of its standard name.
7087
7088         my $self = shift;
7089         my $map = shift;
7090         my $use_full_name = shift // 0;
7091         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7092
7093         if ($use_full_name && $use_full_name ne 'full_name') {
7094             Carp::my_carp_bug("Second parameter to set_default_map() if"
7095                             . " present, must be 'full_name'");
7096         }
7097
7098         my $addr = do { no overloading; pack 'J', $self; };
7099
7100         # Convert the input to the standard equivalent, if any (won't have any
7101         # for $STRING properties)
7102         my $standard = $self->property->table($map);
7103         if (defined $standard) {
7104             $map = ($use_full_name)
7105                    ? $standard->full_name
7106                    : $standard->name;
7107         }
7108
7109         # Warn if there already is a non-equivalent default map for this
7110         # property.  Note that a default map can be a ref, which means that
7111         # what it actually means is delayed until later in the program, and it
7112         # IS permissible to override it here without a message.
7113         my $default_map = $default_map{$addr};
7114         if (defined $default_map
7115             && ! ref($default_map)
7116             && $default_map ne $map
7117             && main::Standardize($map) ne $default_map)
7118         {
7119             my $property = $self->property;
7120             my $map_table = $property->table($map);
7121             my $default_table = $property->table($default_map);
7122             if (defined $map_table
7123                 && defined $default_table
7124                 && $map_table != $default_table)
7125             {
7126                 Carp::my_carp("Changing the default mapping for "
7127                             . $property
7128                             . " from $default_map to $map'");
7129             }
7130         }
7131
7132         $default_map{$addr} = $map;
7133
7134         # Don't also create any missing table for this map at this point,
7135         # because if we did, it could get done before the main table add is
7136         # done for PropValueAliases.txt; instead the caller will have to make
7137         # sure it exists, if desired.
7138         return;
7139     }
7140
7141     sub to_output_map {
7142         # Returns boolean: should we write this map table?
7143
7144         my $self = shift;
7145         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7146
7147         my $addr = do { no overloading; pack 'J', $self; };
7148
7149         # If overridden, use that
7150         return $to_output_map{$addr} if defined $to_output_map{$addr};
7151
7152         my $full_name = $self->full_name;
7153         return $global_to_output_map{$full_name}
7154                                 if defined $global_to_output_map{$full_name};
7155
7156         # If table says to output, do so; if says to suppress it, do so.
7157         my $fate = $self->fate;
7158         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
7159         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
7160         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
7161
7162         my $type = $self->property->type;
7163
7164         # Don't want to output binary map tables even for debugging.
7165         return 0 if $type == $BINARY;
7166
7167         # But do want to output string ones.  All the ones that remain to
7168         # be dealt with (i.e. which haven't explicitly been set to external)
7169         # are for internal Perl use only.  The default for those that map to
7170         # $CODE_POINT and haven't been restricted to a single element range
7171         # is to use the adjusted form.
7172         if ($type == $STRING) {
7173             return $INTERNAL_MAP if $self->range_size_1
7174                                     || $default_map{$addr} ne $CODE_POINT;
7175             return $OUTPUT_ADJUSTED;
7176         }
7177
7178         # Otherwise is an $ENUM, do output it, for Perl's purposes
7179         return $INTERNAL_MAP;
7180     }
7181
7182     sub inverse_list {
7183         # Returns a Range_List that is gaps of the current table.  That is,
7184         # the inversion
7185
7186         my $self = shift;
7187         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7188
7189         my $current = Range_List->new(Initialize => $self->_range_list,
7190                                 Owner => $self->property);
7191         return ~ $current;
7192     }
7193
7194     sub header {
7195         my $self = shift;
7196         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7197
7198         my $return = $self->SUPER::header();
7199
7200         if ($self->to_output_map >= $INTERNAL_MAP) {
7201             $return .= $INTERNAL_ONLY_HEADER;
7202         }
7203         else {
7204             my $property_name = $self->property->replacement_property;
7205
7206             # The legacy-only properties were gotten above; but there are some
7207             # other properties whose files are in current use that have fixed
7208             # formats.
7209             $property_name = $self->property->full_name unless $property_name;
7210
7211             $return .= <<END;
7212
7213 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
7214
7215 # This file is for internal use by core Perl only.  It is retained for
7216 # backwards compatibility with applications that may have come to rely on it,
7217 # but its format and even its name or existence are subject to change without
7218 # notice in a future Perl version.  Don't use it directly.  Instead, its
7219 # contents are now retrievable through a stable API in the Unicode::UCD
7220 # module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
7221 # code points can be retrieved via Unicode::UCD::charprop());
7222 END
7223         }
7224         return $return;
7225     }
7226
7227     sub set_final_comment {
7228         # Just before output, create the comment that heads the file
7229         # containing this table.
7230
7231         return unless $debugging_build;
7232
7233         my $self = shift;
7234         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7235
7236         # No sense generating a comment if aren't going to write it out.
7237         return if ! $self->to_output_map;
7238
7239         my $addr = do { no overloading; pack 'J', $self; };
7240
7241         my $property = $self->property;
7242
7243         # Get all the possible names for this property.  Don't use any that
7244         # aren't ok for use in a file name, etc.  This is perhaps causing that
7245         # flag to do double duty, and may have to be changed in the future to
7246         # have our own flag for just this purpose; but it works now to exclude
7247         # Perl generated synonyms from the lists for properties, where the
7248         # name is always the proper Unicode one.
7249         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7250
7251         my $count = $self->count;
7252         my $default_map = $default_map{$addr};
7253
7254         # The ranges that map to the default aren't output, so subtract that
7255         # to get those actually output.  A property with matching tables
7256         # already has the information calculated.
7257         if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7258             $count -= $property->table($default_map)->count;
7259         }
7260         elsif (defined $default_map) {
7261
7262             # But for $STRING properties, must calculate now.  Subtract the
7263             # count from each range that maps to the default.
7264             foreach my $range ($self->_range_list->ranges) {
7265                 if ($range->value eq $default_map) {
7266                     $count -= $range->end +1 - $range->start;
7267                 }
7268             }
7269
7270         }
7271
7272         # Get a  string version of $count with underscores in large numbers,
7273         # for clarity.
7274         my $string_count = main::clarify_code_point_count($count);
7275
7276         my $code_points = ($count == 1)
7277                         ? 'single code point'
7278                         : "$string_count code points";
7279
7280         my $mapping;
7281         my $these_mappings;
7282         my $are;
7283         if (@property_aliases <= 1) {
7284             $mapping = 'mapping';
7285             $these_mappings = 'this mapping';
7286             $are = 'is'
7287         }
7288         else {
7289             $mapping = 'synonymous mappings';
7290             $these_mappings = 'these mappings';
7291             $are = 'are'
7292         }
7293         my $cp;
7294         if ($count >= $MAX_UNICODE_CODEPOINTS) {
7295             $cp = "any code point in Unicode Version $string_version";
7296         }
7297         else {
7298             my $map_to;
7299             if ($default_map eq "") {
7300                 $map_to = 'the null string';
7301             }
7302             elsif ($default_map eq $CODE_POINT) {
7303                 $map_to = "itself";
7304             }
7305             else {
7306                 $map_to = "'$default_map'";
7307             }
7308             if ($count == 1) {
7309                 $cp = "the single code point";
7310             }
7311             else {
7312                 $cp = "one of the $code_points";
7313             }
7314             $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7315         }
7316
7317         my $comment = "";
7318
7319         my $status = $self->status;
7320         if ($status ne $NORMAL) {
7321             my $warn = uc $status_past_participles{$status};
7322             $comment .= <<END;
7323
7324 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7325  All property or property=value combinations contained in this file are $warn.
7326  See $unicode_reference_url for what this means.
7327
7328 END
7329         }
7330         $comment .= "This file returns the $mapping:\n";
7331
7332         my $ucd_accessible_name = "";
7333         my $has_underscore_name = 0;
7334         my $full_name = $self->property->full_name;
7335         for my $i (0 .. @property_aliases - 1) {
7336             my $name = $property_aliases[$i]->name;
7337             $has_underscore_name = 1 if $name =~ /^_/;
7338             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7339             if ($property_aliases[$i]->ucd) {
7340                 if ($name eq $full_name) {
7341                     $ucd_accessible_name = $full_name;
7342                 }
7343                 elsif (! $ucd_accessible_name) {
7344                     $ucd_accessible_name = $name;
7345                 }
7346             }
7347         }
7348         $comment .= "\nwhere 'cp' is $cp.";
7349         if ($ucd_accessible_name) {
7350             $comment .= "  Note that $these_mappings";
7351             if ($has_underscore_name) {
7352                 $comment .= " (except for the one(s) that begin with an underscore)";
7353             }
7354             $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7355
7356         }
7357
7358         # And append any commentary already set from the actual property.
7359         $comment .= "\n\n" . $self->comment if $self->comment;
7360         if ($self->description) {
7361             $comment .= "\n\n" . join " ", $self->description;
7362         }
7363         if ($self->note) {
7364             $comment .= "\n\n" . join " ", $self->note;
7365         }
7366         $comment .= "\n";
7367
7368         if (! $self->perl_extension) {
7369             $comment .= <<END;
7370
7371 For information about what this property really means, see:
7372 $unicode_reference_url
7373 END
7374         }
7375
7376         if ($count) {        # Format differs for empty table
7377                 $comment.= "\nThe format of the ";
7378             if ($self->range_size_1) {
7379                 $comment.= <<END;
7380 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7381 is in hex; MAPPING is what CODE_POINT maps to.
7382 END
7383             }
7384             else {
7385
7386                 # There are tables which end up only having one element per
7387                 # range, but it is not worth keeping track of for making just
7388                 # this comment a little better.
7389                 $comment .= <<END;
7390 non-comment portions of the main body of lines of this file is:
7391 START\\tSTOP\\tMAPPING where START is the starting code point of the
7392 range, in hex; STOP is the ending point, or if omitted, the range has just one
7393 code point; MAPPING is what each code point between START and STOP maps to.
7394 END
7395                 if ($self->output_range_counts) {
7396                     $comment .= <<END;
7397 Numbers in comments in [brackets] indicate how many code points are in the
7398 range (omitted when the range is a single code point or if the mapping is to
7399 the null string).
7400 END
7401                 }
7402             }
7403         }
7404         $self->set_comment(main::join_lines($comment));
7405         return;
7406     }
7407
7408     my %swash_keys; # Makes sure don't duplicate swash names.
7409
7410     # The remaining variables are temporaries used while writing each table,
7411     # to output special ranges.
7412     my @multi_code_point_maps;  # Map is to more than one code point.
7413
7414     sub handle_special_range {
7415         # Called in the middle of write when it finds a range it doesn't know
7416         # how to handle.
7417
7418         my $self = shift;
7419         my $range = shift;
7420         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7421
7422         my $addr = do { no overloading; pack 'J', $self; };
7423
7424         my $type = $range->type;
7425
7426         my $low = $range->start;
7427         my $high = $range->end;
7428         my $map = $range->value;
7429
7430         # No need to output the range if it maps to the default.
7431         return if $map eq $default_map{$addr};
7432
7433         my $property = $self->property;
7434
7435         # Switch based on the map type...
7436         if ($type == $HANGUL_SYLLABLE) {
7437
7438             # These are entirely algorithmically determinable based on
7439             # some constants furnished by Unicode; for now, just set a
7440             # flag to indicate that have them.  After everything is figured
7441             # out, we will output the code that does the algorithm.  (Don't
7442             # output them if not needed because we are suppressing this
7443             # property.)
7444             $has_hangul_syllables = 1 if $property->to_output_map;
7445         }
7446         elsif ($type == $CP_IN_NAME) {
7447
7448             # Code points whose name ends in their code point are also
7449             # algorithmically determinable, but need information about the map
7450             # to do so.  Both the map and its inverse are stored in data
7451             # structures output in the file.  They are stored in the mean time
7452             # in global lists The lists will be written out later into Name.pm,
7453             # which is created only if needed.  In order to prevent duplicates
7454             # in the list, only add to them for one property, should multiple
7455             # ones need them.
7456             if ($needing_code_points_ending_in_code_point == 0) {
7457                 $needing_code_points_ending_in_code_point = $property;
7458             }
7459             if ($property == $needing_code_points_ending_in_code_point) {
7460                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7461                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7462
7463                 my $squeezed = $map =~ s/[-\s]+//gr;
7464                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7465                                                                           $low;
7466                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7467                                                                          $high;
7468
7469                 push @code_points_ending_in_code_point, { low => $low,
7470                                                         high => $high,
7471                                                         name => $map
7472                                                         };
7473             }
7474         }
7475         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7476
7477             # Multi-code point maps and null string maps have an entry
7478             # for each code point in the range.  They use the same
7479             # output format.
7480             for my $code_point ($low .. $high) {
7481
7482                 # The pack() below can't cope with surrogates.  XXX This may
7483                 # no longer be true
7484                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7485                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7486                     next;
7487                 }
7488
7489                 # Generate the hash entries for these in the form that
7490                 # utf8.c understands.
7491                 my $tostr = "";
7492                 my $to_name = "";
7493                 my $to_chr = "";
7494                 foreach my $to (split " ", $map) {
7495                     if ($to !~ /^$code_point_re$/) {
7496                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7497                         next;
7498                     }
7499                     $tostr .= sprintf "\\x{%s}", $to;
7500                     $to = CORE::hex $to;
7501                     if ($annotate) {
7502                         $to_name .= " + " if $to_name;
7503                         $to_chr .= main::display_chr($to);
7504                         main::populate_char_info($to)
7505                                             if ! defined $viacode[$to];
7506                         $to_name .=  $viacode[$to];
7507                     }
7508                 }
7509
7510                 # The unpack yields a list of the bytes that comprise the
7511                 # UTF-8 of $code_point, which are each placed in \xZZ format
7512                 # and output in the %s to map to $tostr, so the result looks
7513                 # like:
7514                 # "\xC4\xB0" => "\x{0069}\x{0307}",
7515                 my $utf8 = sprintf(qq["%s" => "$tostr",],
7516                         join("", map { sprintf "\\x%02X", $_ }
7517                             unpack("U0C*", chr $code_point)));
7518
7519                 # Add a comment so that a human reader can more easily
7520                 # see what's going on.
7521                 push @multi_code_point_maps,
7522                         sprintf("%-45s # U+%04X", $utf8, $code_point);
7523                 if (! $annotate) {
7524                     $multi_code_point_maps[-1] .= " => $map";
7525                 }
7526                 else {
7527                     main::populate_char_info($code_point)
7528                                     if ! defined $viacode[$code_point];
7529                     $multi_code_point_maps[-1] .= " '"
7530                         . main::display_chr($code_point)
7531                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7532                 }
7533             }
7534         }
7535         else {
7536             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7537         }
7538
7539         return;
7540     }
7541
7542     sub pre_body {
7543         # Returns the string that should be output in the file before the main
7544         # body of this table.  It isn't called until the main body is
7545         # calculated, saving a pass.  The string includes some hash entries
7546         # identifying the format of the body, and what the single value should
7547         # be for all ranges missing from it.  It also includes any code points
7548         # which have map_types that don't go in the main table.
7549
7550         my $self = shift;
7551         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7552
7553         my $addr = do { no overloading; pack 'J', $self; };
7554
7555         my $name = $self->property->swash_name;
7556
7557         # Currently there is nothing in the pre_body unless a swash is being
7558         # generated.
7559         return unless defined $name;
7560
7561         if (defined $swash_keys{$name}) {
7562             Carp::my_carp(main::join_lines(<<END
7563 Already created a swash name '$name' for $swash_keys{$name}.  This means that
7564 the same name desired for $self shouldn't be used.  Bad News.  This must be
7565 fixed before production use, but proceeding anyway
7566 END
7567             ));
7568         }
7569         $swash_keys{$name} = "$self";
7570
7571         my $pre_body = "";
7572
7573         # Here we assume we were called after have gone through the whole
7574         # file.  If we actually generated anything for each map type, add its
7575         # respective header and trailer
7576         my $specials_name = "";
7577         if (@multi_code_point_maps) {
7578             $specials_name = "Unicode::UCD::ToSpec$name";
7579             $pre_body .= <<END;
7580
7581 # Some code points require special handling because their mappings are each to
7582 # multiple code points.  These do not appear in the main body, but are defined
7583 # in the hash below.
7584
7585 # Each key is the string of N bytes that together make up the UTF-8 encoding
7586 # for the code point.  (i.e. the same as looking at the code point's UTF-8
7587 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7588 \%$specials_name = (
7589 END
7590             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7591         }
7592
7593         my $format = $self->format;
7594
7595         my $return = "";
7596
7597         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7598         if ($output_adjusted) {
7599             if ($specials_name) {
7600                 $return .= <<END;
7601 # The mappings in the non-hash portion of this file must be modified to get the
7602 # correct values by adding the code point ordinal number to each one that is
7603 # numeric.
7604 END
7605             }
7606             else {
7607                 $return .= <<END;
7608 # The mappings must be modified to get the correct values by adding the code
7609 # point ordinal number to each one that is numeric.
7610 END
7611             }
7612         }
7613
7614         $return .= <<END;
7615
7616 # The name this table is to be known by, with the format of the mappings in
7617 # the main body of the table, and what all code points missing from this file
7618 # map to.
7619 \$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7620 END
7621         if ($specials_name) {
7622             $return .= <<END;
7623 \$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7624 END
7625         }
7626         my $default_map = $default_map{$addr};
7627
7628         # For $CODE_POINT default maps and using adjustments, instead the default
7629         # becomes zero.
7630         $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '"
7631                 .  (($output_adjusted && $default_map eq $CODE_POINT)
7632                    ? "0"
7633                    : $default_map)
7634                 . "';";
7635
7636         if ($default_map eq $CODE_POINT) {
7637             $return .= ' # code point maps to itself';
7638         }
7639         elsif ($default_map eq "") {
7640             $return .= ' # code point maps to the null string';
7641         }
7642         $return .= "\n";
7643
7644         $return .= $pre_body;
7645
7646         return $return;
7647     }
7648
7649     sub write {
7650         # Write the table to the file.
7651
7652         my $self = shift;
7653         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7654
7655         my $addr = do { no overloading; pack 'J', $self; };
7656
7657         # Clear the temporaries
7658         undef @multi_code_point_maps;
7659
7660         # Calculate the format of the table if not already done.
7661         my $format = $self->format;
7662         my $type = $self->property->type;
7663         my $default_map = $self->default_map;
7664         if (! defined $format) {
7665             if ($type == $BINARY) {
7666
7667                 # Don't bother checking the values, because we elsewhere
7668                 # verify that a binary table has only 2 values.
7669                 $format = $BINARY_FORMAT;
7670             }
7671             else {
7672                 my @ranges = $self->_range_list->ranges;
7673
7674                 # default an empty table based on its type and default map
7675                 if (! @ranges) {
7676
7677                     # But it turns out that the only one we can say is a
7678                     # non-string (besides binary, handled above) is when the
7679                     # table is a string and the default map is to a code point
7680                     if ($type == $STRING && $default_map eq $CODE_POINT) {
7681                         $format = $HEX_FORMAT;
7682                     }
7683                     else {
7684                         $format = $STRING_FORMAT;
7685                     }
7686                 }
7687                 else {
7688
7689                     # Start with the most restrictive format, and as we find
7690                     # something that doesn't fit with that, change to the next
7691                     # most restrictive, and so on.
7692                     $format = $DECIMAL_FORMAT;
7693                     foreach my $range (@ranges) {
7694                         next if $range->type != 0;  # Non-normal ranges don't
7695                                                     # affect the main body
7696                         my $map = $range->value;
7697                         if ($map ne $default_map) {
7698                             last if $format eq $STRING_FORMAT;  # already at
7699                                                                 # least
7700                                                                 # restrictive
7701                             $format = $INTEGER_FORMAT
7702                                                 if $format eq $DECIMAL_FORMAT
7703                                                     && $map !~ / ^ [0-9] $ /x;
7704                             $format = $FLOAT_FORMAT
7705                                             if $format eq $INTEGER_FORMAT
7706                                                 && $map !~ / ^ -? [0-9]+ $ /x;
7707                             $format = $RATIONAL_FORMAT
7708                                 if $format eq $FLOAT_FORMAT
7709                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7710                             $format = $HEX_FORMAT
7711                                 if ($format eq $RATIONAL_FORMAT
7712                                        && $map !~
7713                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7714                                         # Assume a leading zero means hex,
7715                                         # even if all digits are 0-9
7716                                     || ($format eq $INTEGER_FORMAT
7717                                         && $map =~ /^0[0-9A-F]/);
7718                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7719                                                        && $map =~ /[^0-9A-F]/;
7720                         }
7721                     }
7722                 }
7723             }
7724         } # end of calculating format
7725
7726         if ($default_map eq $CODE_POINT
7727             && $format ne $HEX_FORMAT
7728             && ! defined $self->format)    # manual settings are always
7729                                            # considered ok
7730         {
7731             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7732         }
7733
7734         # If the output is to be adjusted, the format of the table that gets
7735         # output is actually 'a' or 'ax' instead of whatever it is stored
7736         # internally as.
7737         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7738         if ($output_adjusted) {
7739             if ($default_map eq $CODE_POINT) {
7740                 $format = $HEX_ADJUST_FORMAT;
7741             }
7742             else {
7743                 $format = $ADJUST_FORMAT;
7744             }
7745         }
7746
7747         $self->_set_format($format);
7748
7749         return $self->SUPER::write(
7750             $output_adjusted,
7751             $default_map);   # don't write defaulteds
7752     }
7753
7754     # Accessors for the underlying list that should fail if locked.
7755     for my $sub (qw(
7756                     add_duplicate
7757                     replace_map
7758                 ))
7759     {
7760         no strict "refs";
7761         *$sub = sub {
7762             use strict "refs";
7763             my $self = shift;
7764
7765             return if $self->carp_if_locked;
7766             return $self->_range_list->$sub(@_);
7767         }
7768     }
7769 } # End closure for Map_Table
7770
7771 package Match_Table;
7772 use parent '-norequire', '_Base_Table';
7773
7774 # A Match table is one which is a list of all the code points that have
7775 # the same property and property value, for use in \p{property=value}
7776 # constructs in regular expressions.  It adds very little data to the base
7777 # structure, but many methods, as these lists can be combined in many ways to
7778 # form new ones.
7779 # There are only a few concepts added:
7780 # 1) Equivalents and Relatedness.
7781 #    Two tables can match the identical code points, but have different names.
7782 #    This always happens when there is a perl single form extension
7783 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7784 #    tables are set to be related, with the Perl extension being a child, and
7785 #    the Unicode property being the parent.
7786 #
7787 #    It may be that two tables match the identical code points and we don't
7788 #    know if they are related or not.  This happens most frequently when the
7789 #    Block and Script properties have the exact range.  But note that a
7790 #    revision to Unicode could add new code points to the script, which would
7791 #    now have to be in a different block (as the block was filled, or there
7792 #    would have been 'Unknown' script code points in it and they wouldn't have
7793 #    been identical).  So we can't rely on any two properties from Unicode
7794 #    always matching the same code points from release to release, and thus
7795 #    these tables are considered coincidentally equivalent--not related.  When
7796 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7797 #    'leader', and the others are 'equivalents'.  This concept is useful
7798 #    to minimize the number of tables written out.  Only one file is used for
7799 #    any identical set of code points, with entries in UCD.pl mapping all
7800 #    the involved tables to it.
7801 #
7802 #    Related tables will always be identical; we set them up to be so.  Thus
7803 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7804 #    unrelated tables.  Relatedness makes generating the documentation easier.
7805 #
7806 # 2) Complement.
7807 #    Like equivalents, two tables may be the inverses of each other, the
7808 #    intersection between them is null, and the union is every Unicode code
7809 #    point.  The two tables that occupy a binary property are necessarily like
7810 #    this.  By specifying one table as the complement of another, we can avoid
7811 #    storing it on disk (using the other table and performing a fast
7812 #    transform), and some memory and calculations.
7813 #
7814 # 3) Conflicting.  It may be that there will eventually be name clashes, with
7815 #    the same name meaning different things.  For a while, there actually were
7816 #    conflicts, but they have so far been resolved by changing Perl's or
7817 #    Unicode's definitions to match the other, but when this code was written,
7818 #    it wasn't clear that that was what was going to happen.  (Unicode changed
7819 #    because of protests during their beta period.)  Name clashes are warned
7820 #    about during compilation, and the documentation.  The generated tables
7821 #    are sane, free of name clashes, because the code suppresses the Perl
7822 #    version.  But manual intervention to decide what the actual behavior
7823 #    should be may be required should this happen.  The introductory comments
7824 #    have more to say about this.
7825 #
7826 # 4) Definition.  This is a string for human consumption that specifies the
7827 #    code points that this table matches.  This is used only for the generated
7828 #    pod file.  It may be specified explicitly, or automatically computed.
7829 #    Only the first portion of complicated definitions is computed and
7830 #    displayed.
7831
7832 sub standardize { return main::standardize($_[0]); }
7833 sub trace { return main::trace(@_); }
7834
7835
7836 { # Closure
7837
7838     main::setup_package();
7839
7840     my %leader;
7841     # The leader table of this one; initially $self.
7842     main::set_access('leader', \%leader, 'r');
7843
7844     my %equivalents;
7845     # An array of any tables that have this one as their leader
7846     main::set_access('equivalents', \%equivalents, 'readable_array');
7847
7848     my %parent;
7849     # The parent table to this one, initially $self.  This allows us to
7850     # distinguish between equivalent tables that are related (for which this
7851     # is set to), and those which may not be, but share the same output file
7852     # because they match the exact same set of code points in the current
7853     # Unicode release.
7854     main::set_access('parent', \%parent, 'r');
7855
7856     my %children;
7857     # An array of any tables that have this one as their parent
7858     main::set_access('children', \%children, 'readable_array');
7859
7860     my %conflicting;
7861     # Array of any tables that would have the same name as this one with
7862     # a different meaning.  This is used for the generated documentation.
7863     main::set_access('conflicting', \%conflicting, 'readable_array');
7864
7865     my %matches_all;
7866     # Set in the constructor for tables that are expected to match all code
7867     # points.
7868     main::set_access('matches_all', \%matches_all, 'r');
7869
7870     my %complement;
7871     # Points to the complement that this table is expressed in terms of; 0 if
7872     # none.
7873     main::set_access('complement', \%complement, 'r');
7874
7875     my %definition;
7876     # Human readable string of the first few ranges of code points matched by
7877     # this table
7878     main::set_access('definition', \%definition, 'r', 's');
7879
7880     sub new {
7881         my $class = shift;
7882
7883         my %args = @_;
7884
7885         # The property for which this table is a listing of property values.
7886         my $property = delete $args{'_Property'};
7887
7888         my $name = delete $args{'Name'};
7889         my $full_name = delete $args{'Full_Name'};
7890         $full_name = $name if ! defined $full_name;
7891
7892         # Optional
7893         my $initialize = delete $args{'Initialize'};
7894         my $matches_all = delete $args{'Matches_All'} || 0;
7895         my $format = delete $args{'Format'};
7896         my $definition = delete $args{'Definition'} // "";
7897         # Rest of parameters passed on.
7898
7899         my $range_list = Range_List->new(Initialize => $initialize,
7900                                          Owner => $property);
7901
7902         my $complete = $full_name;
7903         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7904                                               # but this helps debug if it
7905                                               # does
7906         # The complete name for a match table includes it's property in a
7907         # compound form 'property=table', except if the property is the
7908         # pseudo-property, perl, in which case it is just the single form,
7909         # 'table' (If you change the '=' must also change the ':' in lots of
7910         # places in this program that assume an equal sign)
7911         $complete = $property->full_name . "=$complete" if $property != $perl;
7912
7913         my $self = $class->SUPER::new(%args,
7914                                       Name => $name,
7915                                       Complete_Name => $complete,
7916                                       Full_Name => $full_name,
7917                                       _Property => $property,
7918                                       _Range_List => $range_list,
7919                                       Format => $EMPTY_FORMAT,
7920                                       Write_As_Invlist => 1,
7921                                       );
7922         my $addr = do { no overloading; pack 'J', $self; };
7923
7924         $conflicting{$addr} = [ ];
7925         $equivalents{$addr} = [ ];
7926         $children{$addr} = [ ];
7927         $matches_all{$addr} = $matches_all;
7928         $leader{$addr} = $self;
7929         $parent{$addr} = $self;
7930         $complement{$addr} = 0;
7931         $definition{$addr} = $definition;
7932
7933         if (defined $format && $format ne $EMPTY_FORMAT) {
7934             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7935         }
7936
7937         return $self;
7938     }
7939
7940     # See this program's beginning comment block about overloading these.
7941     use overload
7942         fallback => 0,
7943         qw("") => "_operator_stringify",
7944         '=' => sub {
7945                     my $self = shift;
7946
7947                     return if $self->carp_if_locked;
7948                     return $self;
7949                 },
7950
7951         '+' => sub {
7952                         my $self = shift;
7953                         my $other = shift;
7954
7955                         return $self->_range_list + $other;
7956                     },
7957         '&' => sub {
7958                         my $self = shift;
7959                         my $other = shift;
7960
7961                         return $self->_range_list & $other;
7962                     },
7963         '+=' => sub {
7964                         my $self = shift;
7965                         my $other = shift;
7966                         my $reversed = shift;
7967
7968                         if ($reversed) {
7969                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7970                             . ref($other)
7971                             . ' += '
7972                             . ref($self)
7973                             . "'.  undef returned.");
7974                             return;
7975                         }
7976
7977                         return if $self->carp_if_locked;
7978
7979                         my $addr = do { no overloading; pack 'J', $self; };
7980
7981                         if (ref $other) {
7982
7983                             # Change the range list of this table to be the
7984                             # union of the two.
7985                             $self->_set_range_list($self->_range_list
7986                                                     + $other);
7987                         }
7988                         else {    # $other is just a simple value
7989                             $self->add_range($other, $other);
7990                         }
7991                         return $self;
7992                     },
7993         '&=' => sub {
7994                         my $self = shift;
7995                         my $other = shift;
7996                         my $reversed = shift;
7997
7998                         if ($reversed) {
7999                             Carp::my_carp_bug("Bad news.  Can't cope with '"
8000                             . ref($other)
8001                             . ' &= '
8002                             . ref($self)
8003                             . "'.  undef returned.");
8004                             return;
8005                         }
8006
8007                         return if $self->carp_if_locked;
8008                         $self->_set_range_list($self->_range_list & $other);
8009                         return $self;
8010                     },
8011         '-' => sub { my $self = shift;
8012                     my $other = shift;
8013                     my $reversed = shift;
8014                     if ($reversed) {
8015                         Carp::my_carp_bug("Bad news.  Can't cope with '"
8016                         . ref($other)
8017                         . ' - '
8018                         . ref($self)
8019                         . "'.  undef returned.");
8020                         return;
8021                     }
8022
8023                     return $self->_range_list - $other;
8024                 },
8025         '~' => sub { my $self = shift;
8026                     return ~ $self->_range_list;
8027                 },
8028     ;
8029
8030     sub _operator_stringify {
8031         my $self = shift;
8032
8033         my $name = $self->complete_name;
8034         return "Table '$name'";
8035     }
8036
8037     sub _range_list {
8038         # Returns the range list associated with this table, which will be the
8039         # complement's if it has one.
8040
8041         my $self = shift;
8042         my $complement = $self->complement;
8043
8044         # In order to avoid re-complementing on each access, only do the
8045         # complement the first time, and store the result in this table's
8046         # range list to use henceforth.  However, this wouldn't work if the
8047         # controlling (complement) table changed after we do this, so lock it.
8048         # Currently, the value of the complement isn't needed until after it
8049         # is fully constructed, so this works.  If this were to change, the
8050         # each_range iteration functionality would no longer work on this
8051         # complement.
8052         if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
8053             $self->_set_range_list($self->SUPER::_range_list
8054                                 + ~ $complement->_range_list);
8055             $complement->lock;
8056         }
8057
8058         return $self->SUPER::_range_list;
8059     }
8060
8061     sub add_alias {
8062         # Add a synonym for this table.  See the comments in the base class
8063
8064         my $self = shift;
8065         my $name = shift;
8066         # Rest of parameters passed on.
8067
8068         $self->SUPER::add_alias($name, $self, @_);
8069         return;
8070     }
8071
8072     sub add_conflicting {
8073         # Add the name of some other object to the list of ones that name
8074         # clash with this match table.
8075
8076         my $self = shift;
8077         my $conflicting_name = shift;   # The name of the conflicting object
8078         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
8079         my $conflicting_object = shift; # Optional, the conflicting object
8080                                         # itself.  This is used to
8081                                         # disambiguate the text if the input
8082                                         # name is identical to any of the
8083                                         # aliases $self is known by.
8084                                         # Sometimes the conflicting object is
8085                                         # merely hypothetical, so this has to
8086                                         # be an optional parameter.
8087         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8088
8089         my $addr = do { no overloading; pack 'J', $self; };
8090
8091         # Check if the conflicting name is exactly the same as any existing
8092         # alias in this table (as long as there is a real object there to
8093         # disambiguate with).
8094         if (defined $conflicting_object) {
8095             foreach my $alias ($self->aliases) {
8096                 if (standardize($alias->name) eq standardize($conflicting_name)) {
8097
8098                     # Here, there is an exact match.  This results in
8099                     # ambiguous comments, so disambiguate by changing the
8100                     # conflicting name to its object's complete equivalent.
8101                     $conflicting_name = $conflicting_object->complete_name;
8102                     last;
8103                 }
8104             }
8105         }
8106
8107         # Convert to the \p{...} final name
8108         $conflicting_name = "\\$p" . "{$conflicting_name}";
8109
8110         # Only add once
8111         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
8112
8113         push @{$conflicting{$addr}}, $conflicting_name;
8114
8115         return;
8116     }
8117
8118     sub is_set_equivalent_to {
8119         # Return boolean of whether or not the other object is a table of this
8120         # type and has been marked equivalent to this one.
8121
8122         my $self = shift;
8123         my $other = shift;
8124         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8125
8126         return 0 if ! defined $other; # Can happen for incomplete early
8127                                       # releases
8128         unless ($other->isa(__PACKAGE__)) {
8129             my $ref_other = ref $other;
8130             my $ref_self = ref $self;
8131             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
8132             return 0;
8133         }
8134
8135         # Two tables are equivalent if they have the same leader.
8136         no overloading;
8137         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
8138         return;
8139     }
8140
8141     sub set_equivalent_to {
8142         # Set $self equivalent to the parameter table.
8143         # The required Related => 'x' parameter is a boolean indicating
8144         # whether these tables are related or not.  If related, $other becomes
8145         # the 'parent' of $self; if unrelated it becomes the 'leader'
8146         #
8147         # Related tables share all characteristics except names; equivalents
8148         # not quite so many.
8149         # If they are related, one must be a perl extension.  This is because
8150         # we can't guarantee that Unicode won't change one or the other in a
8151         # later release even if they are identical now.
8152
8153         my $self = shift;
8154         my $other = shift;
8155
8156         my %args = @_;
8157         my $related = delete $args{'Related'};
8158
8159         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8160
8161         return if ! defined $other;     # Keep on going; happens in some early
8162                                         # Unicode releases.
8163
8164         if (! defined $related) {
8165             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
8166             $related = 0;
8167         }
8168
8169         # If already are equivalent, no need to re-do it;  if subroutine
8170         # returns null, it found an error, also do nothing
8171         my $are_equivalent = $self->is_set_equivalent_to($other);
8172         return if ! defined $are_equivalent || $are_equivalent;
8173
8174         my $addr = do { no overloading; pack 'J', $self; };
8175         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
8176
8177         if ($related) {
8178             if ($current_leader->perl_extension) {
8179                 if ($other->perl_extension) {
8180                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
8181                     return;
8182                 }
8183             } elsif ($self->property != $other->property    # Depending on
8184                                                             # situation, might
8185                                                             # be better to use
8186                                                             # add_alias()
8187                                                             # instead for same
8188                                                             # property
8189                      && ! $other->perl_extension
8190
8191                          # We allow the sc and scx properties to be marked as
8192                          # related.  They are in fact related, and this allows
8193                          # the pod to show that better.  This test isn't valid
8194                          # if this is an early Unicode release without the scx
8195                          # property (having that also implies the sc property
8196                          # exists, so don't have to test for no 'sc')
8197                      && (   ! defined $scx
8198                          && ! (   (   $self->property == $script
8199                                    || $self->property == $scx)
8200                                && (   $self->property == $script
8201                                    || $self->property == $scx))))
8202             {
8203                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
8204                 $related = 0;
8205             }
8206         }
8207
8208         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
8209             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
8210             return;
8211         }
8212
8213         my $leader = do { no overloading; pack 'J', $current_leader; };
8214         my $other_addr = do { no overloading; pack 'J', $other; };
8215
8216         # Any tables that are equivalent to or children of this table must now
8217         # instead be equivalent to or (children) to the new leader (parent),
8218         # still equivalent.  The equivalency includes their matches_all info,
8219         # and for related tables, their fate and status.
8220         # All related tables are of necessity equivalent, but the converse
8221         # isn't necessarily true
8222         my $status = $other->status;
8223         my $status_info = $other->status_info;
8224         my $fate = $other->fate;
8225         my $matches_all = $matches_all{other_addr};
8226         my $caseless_equivalent = $other->caseless_equivalent;
8227         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
8228             next if $table == $other;
8229             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
8230
8231             my $table_addr = do { no overloading; pack 'J', $table; };
8232             $leader{$table_addr} = $other;
8233             $matches_all{$table_addr} = $matches_all;
8234             $self->_set_range_list($other->_range_list);
8235             push @{$equivalents{$other_addr}}, $table;
8236             if ($related) {
8237                 $parent{$table_addr} = $other;
8238                 push @{$children{$other_addr}}, $table;
8239                 $table->set_status($status, $status_info);
8240
8241                 # This reason currently doesn't get exposed outside; otherwise
8242                 # would have to look up the parent's reason and use it instead.
8243                 $table->set_fate($fate, "Parent's fate");
8244
8245                 $self->set_caseless_equivalent($caseless_equivalent);
8246             }
8247         }
8248
8249         # Now that we've declared these to be equivalent, any changes to one
8250         # of the tables would invalidate that equivalency.
8251         $self->lock;
8252         $other->lock;
8253         return;
8254     }
8255
8256     sub set_complement {
8257         # Set $self to be the complement of the parameter table.  $self is
8258         # locked, as what it contains should all come from the other table.
8259
8260         my $self = shift;
8261         my $other = shift;
8262
8263         my %args = @_;
8264         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8265
8266         if ($other->complement != 0) {
8267             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8268             return;
8269         }
8270         my $addr = do { no overloading; pack 'J', $self; };
8271         $complement{$addr} = $other;
8272
8273         # Be sure the other property knows we are depending on them; or the
8274         # other table if it is one in the current property.
8275         if ($self->property != $other->property) {
8276             $other->property->set_has_dependency(1);
8277         }
8278         else {
8279             $other->set_has_dependency(1);
8280         }
8281         $self->lock;
8282         return;
8283     }
8284
8285     sub add_range { # Add a range to the list for this table.
8286         my $self = shift;
8287         # Rest of parameters passed on
8288
8289         return if $self->carp_if_locked;
8290         return $self->_range_list->add_range(@_);
8291     }
8292
8293     sub header {
8294         my $self = shift;
8295         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8296
8297         # All match tables are to be used only by the Perl core.
8298         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8299     }
8300
8301     sub pre_body {  # Does nothing for match tables.
8302         return
8303     }
8304
8305     sub append_to_body {  # Does nothing for match tables.
8306         return
8307     }
8308
8309     sub set_fate {
8310         my $self = shift;
8311         my $fate = shift;
8312         my $reason = shift;
8313         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8314
8315         $self->SUPER::set_fate($fate, $reason);
8316
8317         # All children share this fate
8318         foreach my $child ($self->children) {
8319             $child->set_fate($fate, $reason);
8320         }
8321         return;
8322     }
8323
8324     sub calculate_table_definition
8325     {
8326         # Returns a human-readable string showing some or all of the code
8327         # points matched by this table.  The string will include a
8328         # bracketed-character class for all characters matched in the 00-FF
8329         # range, and the first few ranges matched beyond that.
8330         my $max_ranges = 6;
8331
8332         my $self = shift;
8333         my $definition = $self->definition || "";
8334
8335         # Skip this if already have a definition.
8336         return $definition if $definition;
8337
8338         my $lows_string = "";   # The string representation of the 0-FF
8339                                 # characters
8340         my $string_range = "";  # The string rep. of the above FF ranges
8341         my $range_count = 0;    # How many ranges in $string_rage
8342
8343         my @lows_invlist;       # The inversion list of the 0-FF code points
8344         my $first_non_control = ord(" ");   # Everything below this is a
8345                                             # control, on ASCII or EBCDIC
8346         my $max_table_code_point = $self->max;
8347
8348         # On ASCII platforms, the range 80-FF contains no printables.
8349         my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8350
8351
8352         # Look through the first few ranges matched by this table.
8353         $self->reset_each_range;    # Defensive programming
8354         while (defined (my $range = $self->each_range())) {
8355             my $start = $range->start;
8356             my $end = $range->end;
8357
8358             # Accumulate an inversion list of the 00-FF code points
8359             if ($start < 256 && ($start > 0 || $end < 256)) {
8360                 push @lows_invlist, $start;
8361                 push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8362
8363                 # Get next range if there are more ranges below 256
8364                 next if $end < 256 && $end < $max_table_code_point;
8365
8366                 # If the range straddles the 255/256 boundary, we split it
8367                 # there.  We already added above the low portion to the
8368                 # inversion list
8369                 $start = 256 if $end > 256;
8370             }
8371
8372             # Here, @lows_invlist contains the code points below 256, and
8373             # there is no other range, or the current one starts at or above
8374             # 256.  Generate the [char class] for the 0-255 ones.
8375             while (@lows_invlist) {
8376
8377                 # If this range (necessarily the first one, by the way) starts
8378                 # at 0 ...
8379                 if ($lows_invlist[0] == 0) {
8380
8381                     # If it ends within the block of controls, that means that
8382                     # some controls are in it and some aren't.  Since Unicode
8383                     # properties pretty much only know about a few of the
8384                     # controls, like \n, \t, this means that its one of them
8385                     # that isn't in the range.  Complement the inversion list
8386                     # which will likely cause these to be output using their
8387                     # mnemonics, hence being clearer.
8388                     if ($lows_invlist[1] < $first_non_control) {
8389                         $lows_string .= '^';
8390                         shift @lows_invlist;
8391                         push @lows_invlist, 256;
8392                     }
8393                     elsif ($lows_invlist[1] <= $highest_printable) {
8394
8395                         # Here, it extends into the printables block.  Split
8396                         # into two ranges so that the controls are separate.
8397                         $lows_string .= sprintf "\\x00-\\x%02x",
8398                                                     $first_non_control - 1;
8399                         $lows_invlist[0] = $first_non_control;
8400                     }
8401                 }
8402
8403                 # If the range completely contains the printables, don't
8404                 # individually spell out the printables.
8405                 if (    $lows_invlist[0] <= $first_non_control
8406                     && $lows_invlist[1] > $highest_printable)
8407                 {
8408                     $lows_string .= sprintf "\\x%02x-\\x%02x",
8409                                         $lows_invlist[0], $lows_invlist[1] - 1;
8410                     shift @lows_invlist;
8411                     shift @lows_invlist;
8412                     next;
8413                 }
8414
8415                 # Here, the range may include some but not all printables.
8416                 # Look at each one individually
8417                 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8418                     my $char = chr $ord;
8419
8420                     # If there is already something in the list, an
8421                     # alphanumeric char could be the next in sequence.  If so,
8422                     # we start or extend a range.  That is, we could have so
8423                     # far something like 'a-c', and the next char is a 'd', so
8424                     # we change it to 'a-d'.  We use native_to_unicode()
8425                     # because a-z on EBCDIC means 26 chars, and excludes the
8426                     # gap ones.
8427                     if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8428                         my $prev = substr($lows_string, -1);
8429                         if (   $prev !~ /[[:alnum:]]/
8430                             ||   utf8::native_to_unicode(ord $prev) + 1
8431                               != utf8::native_to_unicode(ord $char))
8432                         {
8433                             # Not extending the range
8434                             $lows_string .= $char;
8435                         }
8436                         elsif (   length $lows_string > 1
8437                                && substr($lows_string, -2, 1) eq '-')
8438                         {
8439                             # We had a sequence like '-c' and the current
8440                             # character is 'd'.  Extend the range.
8441                             substr($lows_string, -1, 1) = $char;
8442                         }
8443                         else {
8444                             # We had something like 'd' and this is 'e'.
8445                             # Start a range.
8446                             $lows_string .= "-$char";
8447                         }
8448                     }
8449                     elsif ($char =~ /[[:graph:]]/) {
8450
8451                         # We output a graphic char as-is, preceded by a
8452                         # backslash if it is a metacharacter
8453                         $lows_string .= '\\'
8454                                 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8455                         $lows_string .= $char;
8456                     } # Otherwise use mnemonic for any that have them
8457                     elsif ($char =~ /[\a]/) {
8458                         $lows_string .= '\a';
8459                     }
8460                     elsif ($char =~ /[\b]/) {
8461                         $lows_string .= '\b';
8462                     }
8463                     elsif ($char eq "\e") {
8464                         $lows_string .= '\e';
8465                     }
8466                     elsif ($char eq "\f") {
8467                         $lows_string .= '\f';
8468                     }
8469                     elsif ($char eq "\cK") {
8470                         $lows_string .= '\cK';
8471                     }
8472                     elsif ($char eq "\n") {
8473                         $lows_string .= '\n';
8474                     }
8475                     elsif ($char eq "\r") {
8476                         $lows_string .= '\r';
8477                     }
8478                     elsif ($char eq "\t") {
8479                         $lows_string .= '\t';
8480                     }
8481                     else {
8482
8483                         # Here is a non-graphic without a mnemonic.  We use \x
8484                         # notation.  But if the ordinal of this is one above
8485                         # the previous, create or extend the range
8486                         my $hex_representation = sprintf("%02x", ord $char);
8487                         if (   length $lows_string >= 4
8488                             && substr($lows_string, -4, 2) eq '\\x'
8489                             && hex(substr($lows_string, -2)) + 1 == ord $char)
8490                         {
8491                             if (       length $lows_string >= 5
8492                                 &&     substr($lows_string, -5, 1) eq '-'
8493                                 && (   length $lows_string == 5
8494                                     || substr($lows_string, -6, 1) ne '\\'))
8495                             {
8496                                 substr($lows_string, -2) = $hex_representation;
8497                             }
8498                             else {
8499                                 $lows_string .= '-\\x' . $hex_representation;
8500                             }
8501                         }
8502                         else {
8503                             $lows_string .= '\\x' . $hex_representation;
8504                         }
8505                     }
8506                 }
8507             }
8508
8509             # Done with assembling the string of all lows.  If there are only
8510             # lows in the property, are completely done.
8511             if ($max_table_code_point < 256) {
8512                 $self->reset_each_range;
8513                 last;
8514             }
8515
8516             # Otherwise, quit if reached max number of non-lows ranges.  If
8517             # there are lows, count them as one unit towards the maximum.
8518             $range_count++;
8519             if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8520                 $string_range .= " ...";
8521                 $self->reset_each_range;
8522                 last;
8523             }
8524
8525             # Otherwise add this range.
8526             $string_range .= ", " if $string_range ne "";
8527             if ($start == $end) {
8528                 $string_range .= sprintf("U+%04X", $start);
8529             }
8530             elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8531                 $string_range .= sprintf("U+%04X..infinity", $start);
8532             }
8533             else  {
8534                 $string_range .= sprintf("U+%04X..%04X",
8535                                         $start, $end);
8536             }
8537         }
8538
8539         # Done with all the ranges we're going to look at.  Assemble the
8540         # definition from the lows + non-lows.
8541
8542         if ($lows_string ne "" || $string_range ne "") {
8543             if ($lows_string ne "") {
8544                 $definition .= "[$lows_string]";
8545                 $definition .= ", " if $string_range;
8546             }
8547             $definition .= $string_range;
8548         }
8549
8550         return $definition;
8551     }
8552
8553     sub write {
8554         my $self = shift;
8555         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8556
8557         return $self->SUPER::write(0); # No adjustments
8558     }
8559
8560     sub set_final_comment {
8561         # This creates a comment for the file that is to hold the match table
8562         # $self.  It is somewhat convoluted to make the English read nicely,
8563         # but, heh, it's just a comment.
8564         # This should be called only with the leader match table of all the
8565         # ones that share the same file.  It lists all such tables, ordered so
8566         # that related ones are together.
8567
8568         return unless $debugging_build;
8569
8570         my $leader = shift;   # Should only be called on the leader table of
8571                               # an equivalent group
8572         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8573
8574         my $addr = do { no overloading; pack 'J', $leader; };
8575
8576         if ($leader{$addr} != $leader) {
8577             Carp::my_carp_bug(<<END
8578 set_final_comment() must be called on a leader table, which $leader is not.
8579 It is equivalent to $leader{$addr}.  No comment created
8580 END
8581             );
8582             return;
8583         }
8584
8585         # Get the number of code points matched by each of the tables in this
8586         # file, and add underscores for clarity.
8587         my $count = $leader->count;
8588         my $unicode_count;
8589         my $non_unicode_string;
8590         if ($count > $MAX_UNICODE_CODEPOINTS) {
8591             $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8592                                        - $MAX_UNICODE_CODEPOINT);
8593             $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8594         }
8595         else {
8596             $unicode_count = $count;
8597             $non_unicode_string = "";
8598         }
8599         my $string_count = main::clarify_code_point_count($unicode_count);
8600
8601         my $loose_count = 0;        # how many aliases loosely matched
8602         my $compound_name = "";     # ? Are any names compound?, and if so, an
8603                                     # example
8604         my $properties_with_compound_names = 0;    # count of these
8605
8606
8607         my %flags;              # The status flags used in the file
8608         my $total_entries = 0;  # number of entries written in the comment
8609         my $matches_comment = ""; # The portion of the comment about the
8610                                   # \p{}'s
8611         my @global_comments;    # List of all the tables' comments that are
8612                                 # there before this routine was called.
8613         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8614                                 # Unicode::UCD.  If not, then don't say it is
8615                                 # in the comment
8616
8617         # Get list of all the parent tables that are equivalent to this one
8618         # (including itself).
8619         my @parents = grep { $parent{main::objaddr $_} == $_ }
8620                             main::uniques($leader, @{$equivalents{$addr}});
8621         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8622                                               # tables
8623         for my $parent (@parents) {
8624
8625             my $property = $parent->property;
8626
8627             # Special case 'N' tables in properties with two match tables when
8628             # the other is a 'Y' one.  These are likely to be binary tables,
8629             # but not necessarily.  In either case, \P{} will match the
8630             # complement of \p{}, and so if something is a synonym of \p, the
8631             # complement of that something will be the synonym of \P.  This
8632             # would be true of any property with just two match tables, not
8633             # just those whose values are Y and N; but that would require a
8634             # little extra work, and there are none such so far in Unicode.
8635             my $perl_p = 'p';        # which is it?  \p{} or \P{}
8636             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8637
8638             if (scalar $property->tables == 2
8639                 && $parent == $property->table('N')
8640                 && defined (my $yes = $property->table('Y')))
8641             {
8642                 my $yes_addr = do { no overloading; pack 'J', $yes; };
8643                 @yes_perl_synonyms
8644                     = grep { $_->property == $perl }
8645                                     main::uniques($yes,
8646                                                 $parent{$yes_addr},
8647                                                 $parent{$yes_addr}->children);
8648
8649                 # But these synonyms are \P{} ,not \p{}
8650                 $perl_p = 'P';
8651             }
8652
8653             my @description;        # Will hold the table description
8654             my @note;               # Will hold the table notes.
8655             my @conflicting;        # Will hold the table conflicts.
8656
8657             # Look at the parent, any yes synonyms, and all the children
8658             my $parent_addr = do { no overloading; pack 'J', $parent; };
8659             for my $table ($parent,
8660                            @yes_perl_synonyms,
8661                            @{$children{$parent_addr}})
8662             {
8663                 my $table_addr = do { no overloading; pack 'J', $table; };
8664                 my $table_property = $table->property;
8665
8666                 # Tables are separated by a blank line to create a grouping.
8667                 $matches_comment .= "\n" if $matches_comment;
8668
8669                 # The table is named based on the property and value
8670                 # combination it is for, like script=greek.  But there may be
8671                 # a number of synonyms for each side, like 'sc' for 'script',
8672                 # and 'grek' for 'greek'.  Any combination of these is a valid
8673                 # name for this table.  In this case, there are three more,
8674                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8675                 # listing all possible combinations in the comment, we make
8676                 # sure that each synonym occurs at least once, and add
8677                 # commentary that the other combinations are possible.
8678                 # Because regular expressions don't recognize things like
8679                 # \p{jsn=}, only look at non-null right-hand-sides
8680                 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8681                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
8682
8683                 # The alias lists above are already ordered in the order we
8684                 # want to output them.  To ensure that each synonym is listed,
8685                 # we must use the max of the two numbers.  But if there are no
8686                 # legal synonyms (nothing in @table_aliases), then we don't
8687                 # list anything.
8688                 my $listed_combos = (@table_aliases)
8689                                     ?  main::max(scalar @table_aliases,
8690                                                  scalar @property_aliases)
8691                                     : 0;
8692                 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8693
8694                 my $property_had_compound_name = 0;
8695
8696                 for my $i (0 .. $listed_combos - 1) {
8697                     $total_entries++;
8698
8699                     # The current alias for the property is the next one on
8700                     # the list, or if beyond the end, start over.  Similarly
8701                     # for the table (\p{prop=table})
8702                     my $property_alias = $property_aliases
8703                                             [$i % @property_aliases]->name;
8704                     my $table_alias_object = $table_aliases
8705                                                         [$i % @table_aliases];
8706                     my $table_alias = $table_alias_object->name;
8707                     my $loose_match = $table_alias_object->loose_match;
8708                     $has_ucd_alias |= $table_alias_object->ucd;
8709
8710                     if ($table_alias !~ /\D/) { # Clarify large numbers.
8711                         $table_alias = main::clarify_number($table_alias)
8712                     }
8713
8714                     # Add a comment for this alias combination
8715                     my $current_match_comment;
8716                     if ($table_property == $perl) {
8717                         $current_match_comment = "\\$perl_p"
8718                                                     . "{$table_alias}";
8719                     }
8720                     else {
8721                         $current_match_comment
8722                                         = "\\p{$property_alias=$table_alias}";
8723                         $property_had_compound_name = 1;
8724                     }
8725
8726                     # Flag any abnormal status for this table.
8727                     my $flag = $property->status
8728                                 || $table->status
8729                                 || $table_alias_object->status;
8730                     if ($flag && $flag ne $PLACEHOLDER) {
8731                         $flags{$flag} = $status_past_participles{$flag};
8732                     }
8733
8734                     $loose_count++;
8735
8736                     # Pretty up the comment.  Note the \b; it says don't make
8737                     # this line a continuation.
8738                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
8739                                         $flag,
8740                                         " " x 7,
8741                                         $current_match_comment);
8742                 } # End of generating the entries for this table.
8743
8744                 # Save these for output after this group of related tables.
8745                 push @description, $table->description;
8746                 push @note, $table->note;
8747                 push @conflicting, $table->conflicting;
8748
8749                 # And this for output after all the tables.
8750                 push @global_comments, $table->comment;
8751
8752                 # Compute an alternate compound name using the final property
8753                 # synonym and the first table synonym with a colon instead of
8754                 # the equal sign used elsewhere.
8755                 if ($property_had_compound_name) {
8756                     $properties_with_compound_names ++;
8757                     if (! $compound_name || @property_aliases > 1) {
8758                         $compound_name = $property_aliases[-1]->name
8759                                         . ': '
8760                                         . $table_aliases[0]->name;
8761                     }
8762                 }
8763             } # End of looping through all children of this table
8764
8765             # Here have assembled in $matches_comment all the related tables
8766             # to the current parent (preceded by the same info for all the
8767             # previous parents).  Put out information that applies to all of
8768             # the current family.
8769             if (@conflicting) {
8770
8771                 # But output the conflicting information now, as it applies to
8772                 # just this table.
8773                 my $conflicting = join ", ", @conflicting;
8774                 if ($conflicting) {
8775                     $matches_comment .= <<END;
8776
8777     Note that contrary to what you might expect, the above is NOT the same as
8778 END
8779                     $matches_comment .= "any of: " if @conflicting > 1;
8780                     $matches_comment .= "$conflicting\n";
8781                 }
8782             }
8783             if (@description) {
8784                 $matches_comment .= "\n    Meaning: "
8785                                     . join('; ', @description)
8786                                     . "\n";
8787             }
8788             if (@note) {
8789                 $matches_comment .= "\n    Note: "
8790                                     . join("\n    ", @note)
8791                                     . "\n";
8792             }
8793         } # End of looping through all tables
8794
8795         $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8796
8797
8798         my $code_points;
8799         my $match;
8800         my $any_of_these;
8801         if ($unicode_count == 1) {
8802             $match = 'matches';
8803             $code_points = 'single code point';
8804         }
8805         else {
8806             $match = 'match';
8807             $code_points = "$string_count code points";
8808         }
8809
8810         my $synonyms;
8811         my $entries;
8812         if ($total_entries == 1) {
8813             $synonyms = "";
8814             $entries = 'entry';
8815             $any_of_these = 'this'
8816         }
8817         else {
8818             $synonyms = " any of the following regular expression constructs";
8819             $entries = 'entries';
8820             $any_of_these = 'any of these'
8821         }
8822
8823         my $comment = "";
8824         if ($has_ucd_alias) {
8825             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8826         }
8827         if ($has_unrelated) {
8828             $comment .= <<END;
8829 This file is for tables that are not necessarily related:  To conserve
8830 resources, every table that matches the identical set of code points in this
8831 version of Unicode uses this file.  Each one is listed in a separate group
8832 below.  It could be that the tables will match the same set of code points in
8833 other Unicode releases, or it could be purely coincidence that they happen to
8834 be the same in Unicode $unicode_version, and hence may not in other versions.
8835
8836 END
8837         }
8838
8839         if (%flags) {
8840             foreach my $flag (sort keys %flags) {
8841                 $comment .= <<END;
8842 '$flag' below means that this form is $flags{$flag}.
8843 END
8844                 if ($flag eq $INTERNAL_ALIAS) {
8845                     $comment .= "DO NOT USE!!!";
8846                 }
8847                 else {
8848                     $comment .= "Consult $pod_file.pod";
8849                 }
8850                 $comment .= "\n";
8851             }
8852             $comment .= "\n";
8853         }
8854
8855         if ($total_entries == 0) {
8856             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8857             $comment .= <<END;
8858 This file returns the $code_points in Unicode Version
8859 $unicode_version for
8860 $leader, but it is inaccessible through Perl regular expressions, as
8861 "\\p{prop=}" is not recognized.
8862 END
8863
8864         } else {
8865             $comment .= <<END;
8866 This file returns the $code_points in Unicode Version
8867 $unicode_version that
8868 $match$synonyms:
8869
8870 $matches_comment
8871 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8872 including if adding or subtracting white space, underscore, and hyphen
8873 characters matters or doesn't matter, and other permissible syntactic
8874 variants.  Upper/lower case distinctions never matter.
8875 END
8876
8877         }
8878         if ($compound_name) {
8879             $comment .= <<END;
8880
8881 A colon can be substituted for the equals sign, and
8882 END
8883             if ($properties_with_compound_names > 1) {
8884                 $comment .= <<END;
8885 within each group above,
8886 END
8887             }
8888             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8889
8890             # Note the \b below, it says don't make that line a continuation.
8891             $comment .= <<END;
8892 anything to the left of the equals (or colon) can be combined with anything to
8893 the right.  Thus, for example,
8894 $compound_name
8895 \bis also valid.
8896 END
8897         }
8898
8899         # And append any comment(s) from the actual tables.  They are all
8900         # gathered here, so may not read all that well.
8901         if (@global_comments) {
8902             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8903         }
8904
8905         if ($count) {   # The format differs if no code points, and needs no
8906                         # explanation in that case
8907             if ($leader->write_as_invlist) {
8908                 $comment.= <<END;
8909
8910 The first data line of this file begins with the letter V to indicate it is in
8911 inversion list format.  The number following the V gives the number of lines
8912 remaining.  Each of those remaining lines is a single number representing the
8913 starting code point of a range which goes up to but not including the number
8914 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8915 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8916 the property.  The final line's range extends to the platform's infinity.
8917 END
8918             }
8919             else {
8920                 $comment.= <<END;
8921 The format of the lines of this file is:
8922 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8923 STOP is the ending point, or if omitted, the range has just one code point.
8924 END
8925             }
8926             if ($leader->output_range_counts) {
8927                 $comment .= <<END;
8928 Numbers in comments in [brackets] indicate how many code points are in the
8929 range.
8930 END
8931             }
8932         }
8933
8934         $leader->set_comment(main::join_lines($comment));
8935         return;
8936     }
8937
8938     # Accessors for the underlying list
8939     for my $sub (qw(
8940                     get_valid_code_point
8941                     get_invalid_code_point
8942                 ))
8943     {
8944         no strict "refs";
8945         *$sub = sub {
8946             use strict "refs";
8947             my $self = shift;
8948
8949             return $self->_range_list->$sub(@_);
8950         }
8951     }
8952 } # End closure for Match_Table
8953
8954 package Property;
8955
8956 # The Property class represents a Unicode property, or the $perl
8957 # pseudo-property.  It contains a map table initialized empty at construction
8958 # time, and for properties accessible through regular expressions, various
8959 # match tables, created through the add_match_table() method, and referenced
8960 # by the table('NAME') or tables() methods, the latter returning a list of all
8961 # of the match tables.  Otherwise table operations implicitly are for the map
8962 # table.
8963 #
8964 # Most of the data in the property is actually about its map table, so it
8965 # mostly just uses that table's accessors for most methods.  The two could
8966 # have been combined into one object, but for clarity because of their
8967 # differing semantics, they have been kept separate.  It could be argued that
8968 # the 'file' and 'directory' fields should be kept with the map table.
8969 #
8970 # Each property has a type.  This can be set in the constructor, or in the
8971 # set_type accessor, but mostly it is figured out by the data.  Every property
8972 # starts with unknown type, overridden by a parameter to the constructor, or
8973 # as match tables are added, or ranges added to the map table, the data is
8974 # inspected, and the type changed.  After the table is mostly or entirely
8975 # filled, compute_type() should be called to finalize they analysis.
8976 #
8977 # There are very few operations defined.  One can safely remove a range from
8978 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8979 # table to this one, replacing any in the intersection of the two.
8980
8981 sub standardize { return main::standardize($_[0]); }
8982 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8983
8984 {   # Closure
8985
8986     # This hash will contain as keys, all the aliases of all properties, and
8987     # as values, pointers to their respective property objects.  This allows
8988     # quick look-up of a property from any of its names.
8989     my %alias_to_property_of;
8990
8991     sub dump_alias_to_property_of {
8992         # For debugging
8993
8994         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8995         return;
8996     }
8997
8998     sub property_ref {
8999         # This is a package subroutine, not called as a method.
9000         # If the single parameter is a literal '*' it returns a list of all
9001         # defined properties.
9002         # Otherwise, the single parameter is a name, and it returns a pointer
9003         # to the corresponding property object, or undef if none.
9004         #
9005         # Properties can have several different names.  The 'standard' form of
9006         # each of them is stored in %alias_to_property_of as they are defined.
9007         # But it's possible that this subroutine will be called with some
9008         # variant, so if the initial lookup fails, it is repeated with the
9009         # standardized form of the input name.  If found, besides returning the
9010         # result, the input name is added to the list so future calls won't
9011         # have to do the conversion again.
9012
9013         my $name = shift;
9014
9015         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9016
9017         if (! defined $name) {
9018             Carp::my_carp_bug("Undefined input property.  No action taken.");
9019             return;
9020         }
9021
9022         return main::uniques(values %alias_to_property_of) if $name eq '*';
9023
9024         # Return cached result if have it.
9025         my $result = $alias_to_property_of{$name};
9026         return $result if defined $result;
9027
9028         # Convert the input to standard form.
9029         my $standard_name = standardize($name);
9030
9031         $result = $alias_to_property_of{$standard_name};
9032         return unless defined $result;        # Don't cache undefs
9033
9034         # Cache the result before returning it.
9035         $alias_to_property_of{$name} = $result;
9036         return $result;
9037     }
9038
9039
9040     main::setup_package();
9041
9042     my %map;
9043     # A pointer to the map table object for this property
9044     main::set_access('map', \%map);
9045
9046     my %full_name;
9047     # The property's full name.  This is a duplicate of the copy kept in the
9048     # map table, but is needed because stringify needs it during
9049     # construction of the map table, and then would have a chicken before egg
9050     # problem.
9051     main::set_access('full_name', \%full_name, 'r');
9052
9053     my %table_ref;
9054     # This hash will contain as keys, all the aliases of any match tables
9055     # attached to this property, and as values, the pointers to their
9056     # respective tables.  This allows quick look-up of a table from any of its
9057     # names.
9058     main::set_access('table_ref', \%table_ref);
9059
9060     my %type;
9061     # The type of the property, $ENUM, $BINARY, etc
9062     main::set_access('type', \%type, 'r');
9063
9064     my %file;
9065     # The filename where the map table will go (if actually written).
9066     # Normally defaulted, but can be overridden.
9067     main::set_access('file', \%file, 'r', 's');
9068
9069     my %directory;
9070     # The directory where the map table will go (if actually written).
9071     # Normally defaulted, but can be overridden.
9072     main::set_access('directory', \%directory, 's');
9073
9074     my %pseudo_map_type;
9075     # This is used to affect the calculation of the map types for all the
9076     # ranges in the table.  It should be set to one of the values that signify
9077     # to alter the calculation.
9078     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
9079
9080     my %has_only_code_point_maps;
9081     # A boolean used to help in computing the type of data in the map table.
9082     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
9083
9084     my %unique_maps;
9085     # A list of the first few distinct mappings this property has.  This is
9086     # used to disambiguate between binary and enum property types, so don't
9087     # have to keep more than three.
9088     main::set_access('unique_maps', \%unique_maps);
9089
9090     my %pre_declared_maps;
9091     # A boolean that gives whether the input data should declare all the
9092     # tables used, or not.  If the former, unknown ones raise a warning.
9093     main::set_access('pre_declared_maps',
9094                                     \%pre_declared_maps, 'r', 's');
9095
9096     my %has_dependency;
9097     # A boolean that gives whether some table somewhere is defined as the
9098     # complement of a table in this property.  This is a crude, but currently
9099     # sufficient, mechanism to make this property not get destroyed before
9100     # what is dependent on it is.  Other dependencies could be added, so the
9101     # name was chosen to reflect a more general situation than actually is
9102     # currently the case.
9103     main::set_access('has_dependency', \%has_dependency, 'r', 's');
9104
9105     sub new {
9106         # The only required parameter is the positionally first, name.  All
9107         # other parameters are key => value pairs.  See the documentation just
9108         # above for the meanings of the ones not passed directly on to the map
9109         # table constructor.
9110
9111         my $class = shift;
9112         my $name = shift || "";
9113
9114         my $self = property_ref($name);
9115         if (defined $self) {
9116             my $options_string = join ", ", @_;
9117             $options_string = ".  Ignoring options $options_string" if $options_string;
9118             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
9119             return $self;
9120         }
9121
9122         my %args = @_;
9123
9124         $self = bless \do { my $anonymous_scalar }, $class;
9125         my $addr = do { no overloading; pack 'J', $self; };
9126
9127         $directory{$addr} = delete $args{'Directory'};
9128         $file{$addr} = delete $args{'File'};
9129         $full_name{$addr} = delete $args{'Full_Name'} || $name;
9130         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
9131         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
9132         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
9133                                     # Starting in this release, property
9134                                     # values should be defined for all
9135                                     # properties, except those overriding this
9136                                     // $v_version ge v5.1.0;
9137
9138         # Rest of parameters passed on.
9139
9140         $has_only_code_point_maps{$addr} = 1;
9141         $table_ref{$addr} = { };
9142         $unique_maps{$addr} = { };
9143         $has_dependency{$addr} = 0;
9144
9145         $map{$addr} = Map_Table->new($name,
9146                                     Full_Name => $full_name{$addr},
9147                                     _Alias_Hash => \%alias_to_property_of,
9148                                     _Property => $self,
9149                                     %args);
9150         return $self;
9151     }
9152
9153     # See this program's beginning comment block about overloading the copy
9154     # constructor.  Few operations are defined on properties, but a couple are
9155     # useful.  It is safe to take the inverse of a property, and to remove a
9156     # single code point from it.
9157     use overload
9158         fallback => 0,
9159         qw("") => "_operator_stringify",
9160         "." => \&main::_operator_dot,
9161         ".=" => \&main::_operator_dot_equal,
9162         '==' => \&main::_operator_equal,
9163         '!=' => \&main::_operator_not_equal,
9164         '=' => sub { return shift },
9165         '-=' => "_minus_and_equal",
9166     ;
9167
9168     sub _operator_stringify {
9169         return "Property '" .  shift->full_name . "'";
9170     }
9171
9172     sub _minus_and_equal {
9173         # Remove a single code point from the map table of a property.
9174
9175         my $self = shift;
9176         my $other = shift;
9177         my $reversed = shift;
9178         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9179
9180         if (ref $other) {
9181             Carp::my_carp_bug("Bad news.  Can't cope with a "
9182                         . ref($other)
9183                         . " argument to '-='.  Subtraction ignored.");
9184             return $self;
9185         }
9186         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
9187             Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
9188             . ref $self
9189             . " from a non-object.  undef returned.");
9190             return;
9191         }
9192         else {
9193             no overloading;
9194             $map{pack 'J', $self}->delete_range($other, $other);
9195         }
9196         return $self;
9197     }
9198
9199     sub add_match_table {
9200         # Add a new match table for this property, with name given by the
9201         # parameter.  It returns a pointer to the table.
9202
9203         my $self = shift;
9204         my $name = shift;
9205         my %args = @_;
9206
9207         my $addr = do { no overloading; pack 'J', $self; };
9208
9209         my $table = $table_ref{$addr}{$name};
9210         my $standard_name = main::standardize($name);
9211         if (defined $table
9212             || (defined ($table = $table_ref{$addr}{$standard_name})))
9213         {
9214             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
9215             $table_ref{$addr}{$name} = $table;
9216             return $table;
9217         }
9218         else {
9219
9220             # See if this is a perl extension, if not passed in.
9221             my $perl_extension = delete $args{'Perl_Extension'};
9222             $perl_extension
9223                         = $self->perl_extension if ! defined $perl_extension;
9224
9225             my $fate;
9226             my $suppression_reason = "";
9227             if ($self->name =~ /^_/) {
9228                 $fate = $SUPPRESSED;
9229                 $suppression_reason = "Parent property is internal only";
9230             }
9231             elsif ($self->fate >= $SUPPRESSED) {
9232                 $fate = $self->fate;
9233                 $suppression_reason = $why_suppressed{$self->complete_name};
9234
9235             }
9236             elsif ($name =~ /^_/) {
9237                 $fate = $INTERNAL_ONLY;
9238             }
9239             $table = Match_Table->new(
9240                                 Name => $name,
9241                                 Perl_Extension => $perl_extension,
9242                                 _Alias_Hash => $table_ref{$addr},
9243                                 _Property => $self,
9244                                 Fate => $fate,
9245                                 Suppression_Reason => $suppression_reason,
9246                                 Status => $self->status,
9247                                 _Status_Info => $self->status_info,
9248                                 %args);
9249             return unless defined $table;
9250         }
9251
9252         # Save the names for quick look up
9253         $table_ref{$addr}{$standard_name} = $table;
9254         $table_ref{$addr}{$name} = $table;
9255
9256         # Perhaps we can figure out the type of this property based on the
9257         # fact of adding this match table.  First, string properties don't
9258         # have match tables; second, a binary property can't have 3 match
9259         # tables
9260         if ($type{$addr} == $UNKNOWN) {
9261             $type{$addr} = $NON_STRING;
9262         }
9263         elsif ($type{$addr} == $STRING) {
9264             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
9265             $type{$addr} = $NON_STRING;
9266         }
9267         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
9268             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
9269                 if ($type{$addr} == $BINARY) {
9270                     Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
9271                 }
9272                 $type{$addr} = $ENUM;
9273             }
9274         }
9275
9276         return $table;
9277     }
9278
9279     sub delete_match_table {
9280         # Delete the table referred to by $2 from the property $1.
9281
9282         my $self = shift;
9283         my $table_to_remove = shift;
9284         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9285
9286         my $addr = do { no overloading; pack 'J', $self; };
9287
9288         # Remove all names that refer to it.
9289         foreach my $key (keys %{$table_ref{$addr}}) {
9290             delete $table_ref{$addr}{$key}
9291                                 if $table_ref{$addr}{$key} == $table_to_remove;
9292         }
9293
9294         $table_to_remove->DESTROY;
9295         return;
9296     }
9297
9298     sub table {
9299         # Return a pointer to the match table (with name given by the
9300         # parameter) associated with this property; undef if none.
9301
9302         my $self = shift;
9303         my $name = shift;
9304         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9305
9306         my $addr = do { no overloading; pack 'J', $self; };
9307
9308         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9309
9310         # If quick look-up failed, try again using the standard form of the
9311         # input name.  If that succeeds, cache the result before returning so
9312         # won't have to standardize this input name again.
9313         my $standard_name = main::standardize($name);
9314         return unless defined $table_ref{$addr}{$standard_name};
9315
9316         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9317         return $table_ref{$addr}{$name};
9318     }
9319
9320     sub tables {
9321         # Return a list of pointers to all the match tables attached to this
9322         # property
9323
9324         no overloading;
9325         return main::uniques(values %{$table_ref{pack 'J', shift}});
9326     }
9327
9328     sub directory {
9329         # Returns the directory the map table for this property should be
9330         # output in.  If a specific directory has been specified, that has
9331         # priority;  'undef' is returned if the type isn't defined;
9332         # or $map_directory for everything else.
9333
9334         my $addr = do { no overloading; pack 'J', shift; };
9335
9336         return $directory{$addr} if defined $directory{$addr};
9337         return undef if $type{$addr} == $UNKNOWN;
9338         return $map_directory;
9339     }
9340
9341     sub swash_name {
9342         # Return the name that is used to both:
9343         #   1)  Name the file that the map table is written to.
9344         #   2)  The name of swash related stuff inside that file.
9345         # The reason for this is that the Perl core historically has used
9346         # certain names that aren't the same as the Unicode property names.
9347         # To continue using these, $file is hard-coded in this file for those,
9348         # but otherwise the standard name is used.  This is different from the
9349         # external_name, so that the rest of the files, like in lib can use
9350         # the standard name always, without regard to historical precedent.
9351
9352         my $self = shift;
9353         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9354
9355         my $addr = do { no overloading; pack 'J', $self; };
9356
9357         # Swash names are used only on either
9358         # 1) legacy-only properties, because the formats for these are
9359         #    unchangeable, and they have had these lines in them; or
9360         # 2) regular or internal-only map tables
9361         # 3) otherwise there should be no access to the
9362         #    property map table from other parts of Perl.
9363         return if $map{$addr}->fate != $ORDINARY
9364                   && $map{$addr}->fate != $LEGACY_ONLY
9365                   && ! ($map{$addr}->name =~ /^_/
9366                         && $map{$addr}->fate == $INTERNAL_ONLY);
9367
9368         return $file{$addr} if defined $file{$addr};
9369         return $map{$addr}->external_name;
9370     }
9371
9372     sub to_create_match_tables {
9373         # Returns a boolean as to whether or not match tables should be
9374         # created for this property.
9375
9376         my $self = shift;
9377         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9378
9379         # The whole point of this pseudo property is match tables.
9380         return 1 if $self == $perl;
9381
9382         my $addr = do { no overloading; pack 'J', $self; };
9383
9384         # Don't generate tables of code points that match the property values
9385         # of a string property.  Such a list would most likely have many
9386         # property values, each with just one or very few code points mapping
9387         # to it.
9388         return 0 if $type{$addr} == $STRING;
9389
9390         # Otherwise, do.
9391         return 1;
9392     }
9393
9394     sub property_add_or_replace_non_nulls {
9395         # This adds the mappings in the property $other to $self.  Non-null
9396         # mappings from $other override those in $self.  It essentially merges
9397         # the two properties, with the second having priority except for null
9398         # mappings.
9399
9400         my $self = shift;
9401         my $other = shift;
9402         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9403
9404         if (! $other->isa(__PACKAGE__)) {
9405             Carp::my_carp_bug("$other should be a "
9406                             . __PACKAGE__
9407                             . ".  Not a '"
9408                             . ref($other)
9409                             . "'.  Not added;");
9410             return;
9411         }
9412
9413         no overloading;
9414         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
9415     }
9416
9417     sub set_proxy_for {
9418         # Certain tables are not generally written out to files, but
9419         # Unicode::UCD has the intelligence to know that the file for $self
9420         # can be used to reconstruct those tables.  This routine just changes
9421         # things so that UCD pod entries for those suppressed tables are
9422         # generated, so the fact that a proxy is used is invisible to the
9423         # user.
9424
9425         my $self = shift;
9426
9427         foreach my $property_name (@_) {
9428             my $ref = property_ref($property_name);
9429             next if $ref->to_output_map;
9430             $ref->set_fate($MAP_PROXIED);
9431         }
9432     }
9433
9434     sub set_type {
9435         # Set the type of the property.  Mostly this is figured out by the
9436         # data in the table.  But this is used to set it explicitly.  The
9437         # reason it is not a standard accessor is that when setting a binary
9438         # property, we need to make sure that all the true/false aliases are
9439         # present, as they were omitted in early Unicode releases.
9440
9441         my $self = shift;
9442         my $type = shift;
9443         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9444
9445         if ($type != $ENUM
9446             && $type != $BINARY
9447             && $type != $FORCED_BINARY
9448             && $type != $STRING)
9449         {
9450             Carp::my_carp("Unrecognized type '$type'.  Type not set");
9451             return;
9452         }
9453
9454         { no overloading; $type{pack 'J', $self} = $type; }
9455         return if $type != $BINARY && $type != $FORCED_BINARY;
9456
9457         my $yes = $self->table('Y');
9458         $yes = $self->table('Yes') if ! defined $yes;
9459         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9460                                                             if ! defined $yes;
9461
9462         # Add aliases in order wanted, duplicates will be ignored.  We use a
9463         # binary property present in all releases for its ordered lists of
9464         # true/false aliases.  Note, that could run into problems in
9465         # outputting things in that we don't distinguish between the name and
9466         # full name of these.  Hopefully, if the table was already created
9467         # before this code is executed, it was done with these set properly.
9468         my $bm = property_ref("Bidi_Mirrored");
9469         foreach my $alias ($bm->table("Y")->aliases) {
9470             $yes->add_alias($alias->name);
9471         }
9472         my $no = $self->table('N');
9473         $no = $self->table('No') if ! defined $no;
9474         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9475         foreach my $alias ($bm->table("N")->aliases) {
9476             $no->add_alias($alias->name);
9477         }
9478
9479         return;
9480     }
9481
9482     sub add_map {
9483         # Add a map to the property's map table.  This also keeps
9484         # track of the maps so that the property type can be determined from
9485         # its data.
9486
9487         my $self = shift;
9488         my $start = shift;  # First code point in range
9489         my $end = shift;    # Final code point in range
9490         my $map = shift;    # What the range maps to.
9491         # Rest of parameters passed on.
9492
9493         my $addr = do { no overloading; pack 'J', $self; };
9494
9495         # If haven't the type of the property, gather information to figure it
9496         # out.
9497         if ($type{$addr} == $UNKNOWN) {
9498
9499             # If the map contains an interior blank or dash, or most other
9500             # nonword characters, it will be a string property.  This
9501             # heuristic may actually miss some string properties.  If so, they
9502             # may need to have explicit set_types called for them.  This
9503             # happens in the Unihan properties.
9504             if ($map =~ / (?<= . ) [ -] (?= . ) /x
9505                 || $map =~ / [^\w.\/\ -]  /x)
9506             {
9507                 $self->set_type($STRING);
9508
9509                 # $unique_maps is used for disambiguating between ENUM and
9510                 # BINARY later; since we know the property is not going to be
9511                 # one of those, no point in keeping the data around
9512                 undef $unique_maps{$addr};
9513             }
9514             else {
9515
9516                 # Not necessarily a string.  The final decision has to be
9517                 # deferred until all the data are in.  We keep track of if all
9518                 # the values are code points for that eventual decision.
9519                 $has_only_code_point_maps{$addr} &=
9520                                             $map =~ / ^ $code_point_re $/x;
9521
9522                 # For the purposes of disambiguating between binary and other
9523                 # enumerations at the end, we keep track of the first three
9524                 # distinct property values.  Once we get to three, we know
9525                 # it's not going to be binary, so no need to track more.
9526                 if (scalar keys %{$unique_maps{$addr}} < 3) {
9527                     $unique_maps{$addr}{main::standardize($map)} = 1;
9528                 }
9529             }
9530         }
9531
9532         # Add the mapping by calling our map table's method
9533         return $map{$addr}->add_map($start, $end, $map, @_);
9534     }
9535
9536     sub compute_type {
9537         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9538         # should be called after the property is mostly filled with its maps.
9539         # We have been keeping track of what the property values have been,
9540         # and now have the necessary information to figure out the type.
9541
9542         my $self = shift;
9543         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9544
9545         my $addr = do { no overloading; pack 'J', $self; };
9546
9547         my $type = $type{$addr};
9548
9549         # If already have figured these out, no need to do so again, but we do
9550         # a double check on ENUMS to make sure that a string property hasn't
9551         # improperly been classified as an ENUM, so continue on with those.
9552         return if $type == $STRING
9553                   || $type == $BINARY
9554                   || $type == $FORCED_BINARY;
9555
9556         # If every map is to a code point, is a string property.
9557         if ($type == $UNKNOWN
9558             && ($has_only_code_point_maps{$addr}
9559                 || (defined $map{$addr}->default_map
9560                     && $map{$addr}->default_map eq "")))
9561         {
9562             $self->set_type($STRING);
9563         }
9564         else {
9565
9566             # Otherwise, it is to some sort of enumeration.  (The case where
9567             # it is a Unicode miscellaneous property, and treated like a
9568             # string in this program is handled in add_map()).  Distinguish
9569             # between binary and some other enumeration type.  Of course, if
9570             # there are more than two values, it's not binary.  But more
9571             # subtle is the test that the default mapping is defined means it
9572             # isn't binary.  This in fact may change in the future if Unicode
9573             # changes the way its data is structured.  But so far, no binary
9574             # properties ever have @missing lines for them, so the default map
9575             # isn't defined for them.  The few properties that are two-valued
9576             # and aren't considered binary have the default map defined
9577             # starting in Unicode 5.0, when the @missing lines appeared; and
9578             # this program has special code to put in a default map for them
9579             # for earlier than 5.0 releases.
9580             if ($type == $ENUM
9581                 || scalar keys %{$unique_maps{$addr}} > 2
9582                 || defined $self->default_map)
9583             {
9584                 my $tables = $self->tables;
9585                 my $count = $self->count;
9586                 if ($verbosity && $tables > 500 && $tables/$count > .1) {
9587                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9588                 }
9589                 $self->set_type($ENUM);
9590             }
9591             else {
9592                 $self->set_type($BINARY);
9593             }
9594         }
9595         undef $unique_maps{$addr};  # Garbage collect
9596         return;
9597     }
9598
9599     sub set_fate {
9600         my $self = shift;
9601         my $fate = shift;
9602         my $reason = shift;  # Ignored unless suppressing
9603         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9604
9605         my $addr = do { no overloading; pack 'J', $self; };
9606         if ($fate >= $SUPPRESSED) {
9607             $why_suppressed{$self->complete_name} = $reason;
9608         }
9609
9610         # Each table shares the property's fate, except that MAP_PROXIED
9611         # doesn't affect match tables
9612         $map{$addr}->set_fate($fate, $reason);
9613         if ($fate != $MAP_PROXIED) {
9614             foreach my $table ($map{$addr}, $self->tables) {
9615                 $table->set_fate($fate, $reason);
9616             }
9617         }
9618         return;
9619     }
9620
9621
9622     # Most of the accessors for a property actually apply to its map table.
9623     # Setup up accessor functions for those, referring to %map
9624     for my $sub (qw(
9625                     add_alias
9626                     add_anomalous_entry
9627                     add_comment
9628                     add_conflicting
9629                     add_description
9630                     add_duplicate
9631                     add_note
9632                     aliases
9633                     comment
9634                     complete_name
9635                     containing_range
9636                     count
9637                     default_map
9638                     definition
9639                     delete_range
9640                     description
9641                     each_range
9642                     external_name
9643                     fate
9644                     file_path
9645                     format
9646                     initialize
9647                     inverse_list
9648                     is_empty
9649                     replacement_property
9650                     name
9651                     note
9652                     perl_extension
9653                     property
9654                     range_count
9655                     ranges
9656                     range_size_1
9657                     replace_map
9658                     reset_each_range
9659                     set_comment
9660                     set_default_map
9661                     set_file_path
9662                     set_final_comment
9663                     _set_format
9664                     set_range_size_1
9665                     set_status
9666                     set_to_output_map
9667                     short_name
9668                     status
9669                     status_info
9670                     to_output_map
9671                     type_of
9672                     value_of
9673                     write
9674                 ))
9675                     # 'property' above is for symmetry, so that one can take
9676                     # the property of a property and get itself, and so don't
9677                     # have to distinguish between properties and tables in
9678                     # calling code
9679     {
9680         no strict "refs";
9681         *$sub = sub {
9682             use strict "refs";
9683             my $self = shift;
9684             no overloading;
9685             return $map{pack 'J', $self}->$sub(@_);
9686         }
9687     }
9688
9689
9690 } # End closure
9691
9692 package main;
9693
9694 sub display_chr {
9695     # Converts an ordinal printable character value to a displayable string,
9696     # using a dotted circle to hold combining characters.
9697
9698     my $ord = shift;
9699     my $chr = chr $ord;
9700     return $chr if $ccc->table(0)->contains($ord);
9701     return "\x{25CC}$chr";
9702 }
9703
9704 sub join_lines($) {
9705     # Returns lines of the input joined together, so that they can be folded
9706     # properly.
9707     # This causes continuation lines to be joined together into one long line
9708     # for folding.  A continuation line is any line that doesn't begin with a
9709     # space or "\b" (the latter is stripped from the output).  This is so
9710     # lines can be be in a HERE document so as to fit nicely in the terminal
9711     # width, but be joined together in one long line, and then folded with
9712     # indents, '#' prefixes, etc, properly handled.
9713     # A blank separates the joined lines except if there is a break; an extra
9714     # blank is inserted after a period ending a line.
9715
9716     # Initialize the return with the first line.
9717     my ($return, @lines) = split "\n", shift;
9718
9719     # If the first line is null, it was an empty line, add the \n back in
9720     $return = "\n" if $return eq "";
9721
9722     # Now join the remainder of the physical lines.
9723     for my $line (@lines) {
9724
9725         # An empty line means wanted a blank line, so add two \n's to get that
9726         # effect, and go to the next line.
9727         if (length $line == 0) {
9728             $return .= "\n\n";
9729             next;
9730         }
9731
9732         # Look at the last character of what we have so far.
9733         my $previous_char = substr($return, -1, 1);
9734
9735         # And at the next char to be output.
9736         my $next_char = substr($line, 0, 1);
9737
9738         if ($previous_char ne "\n") {
9739
9740             # Here didn't end wth a nl.  If the next char a blank or \b, it
9741             # means that here there is a break anyway.  So add a nl to the
9742             # output.
9743             if ($next_char eq " " || $next_char eq "\b") {
9744                 $previous_char = "\n";
9745                 $return .= $previous_char;
9746             }
9747
9748             # Add an extra space after periods.
9749             $return .= " " if $previous_char eq '.';
9750         }
9751
9752         # Here $previous_char is still the latest character to be output.  If
9753         # it isn't a nl, it means that the next line is to be a continuation
9754         # line, with a blank inserted between them.
9755         $return .= " " if $previous_char ne "\n";
9756
9757         # Get rid of any \b
9758         substr($line, 0, 1) = "" if $next_char eq "\b";
9759
9760         # And append this next line.
9761         $return .= $line;
9762     }
9763
9764     return $return;
9765 }
9766
9767 sub simple_fold($;$$$) {
9768     # Returns a string of the input (string or an array of strings) folded
9769     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9770     # a \n
9771     # This is tailored for the kind of text written by this program,
9772     # especially the pod file, which can have very long names with
9773     # underscores in the middle, or words like AbcDefgHij....  We allow
9774     # breaking in the middle of such constructs if the line won't fit
9775     # otherwise.  The break in such cases will come either just after an
9776     # underscore, or just before one of the Capital letters.
9777
9778     local $to_trace = 0 if main::DEBUG;
9779
9780     my $line = shift;
9781     my $prefix = shift;     # Optional string to prepend to each output
9782                             # line
9783     $prefix = "" unless defined $prefix;
9784
9785     my $hanging_indent = shift; # Optional number of spaces to indent
9786                                 # continuation lines
9787     $hanging_indent = 0 unless $hanging_indent;
9788
9789     my $right_margin = shift;   # Optional number of spaces to narrow the
9790                                 # total width by.
9791     $right_margin = 0 unless defined $right_margin;
9792
9793     # Call carp with the 'nofold' option to avoid it from trying to call us
9794     # recursively
9795     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
9796
9797     # The space available doesn't include what's automatically prepended
9798     # to each line, or what's reserved on the right.
9799     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9800     # XXX Instead of using the 'nofold' perhaps better to look up the stack
9801
9802     if (DEBUG && $hanging_indent >= $max) {
9803         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9804         $hanging_indent = 0;
9805     }
9806
9807     # First, split into the current physical lines.
9808     my @line;
9809     if (ref $line) {        # Better be an array, because not bothering to
9810                             # test
9811         foreach my $line (@{$line}) {
9812             push @line, split /\n/, $line;
9813         }
9814     }
9815     else {
9816         @line = split /\n/, $line;
9817     }
9818
9819     #local $to_trace = 1 if main::DEBUG;
9820     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9821
9822     # Look at each current physical line.
9823     for (my $i = 0; $i < @line; $i++) {
9824         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9825         #local $to_trace = 1 if main::DEBUG;
9826         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9827
9828         # Remove prefix, because will be added back anyway, don't want
9829         # doubled prefix
9830         $line[$i] =~ s/^$prefix//;
9831
9832         # Remove trailing space
9833         $line[$i] =~ s/\s+\Z//;
9834
9835         # If the line is too long, fold it.
9836         if (length $line[$i] > $max) {
9837             my $remainder;
9838
9839             # Here needs to fold.  Save the leading space in the line for
9840             # later.
9841             $line[$i] =~ /^ ( \s* )/x;
9842             my $leading_space = $1;
9843             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9844
9845             # If character at final permissible position is white space,
9846             # fold there, which will delete that white space
9847             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9848                 $remainder = substr($line[$i], $max);
9849                 $line[$i] = substr($line[$i], 0, $max - 1);
9850             }
9851             else {
9852
9853                 # Otherwise fold at an acceptable break char closest to
9854                 # the max length.  Look at just the maximal initial
9855                 # segment of the line
9856                 my $segment = substr($line[$i], 0, $max - 1);
9857                 if ($segment =~
9858                     /^ ( .{$hanging_indent}   # Don't look before the
9859                                               #  indent.
9860                         \ *                   # Don't look in leading
9861                                               #  blanks past the indent
9862                             [^ ] .*           # Find the right-most
9863                         (?:                   #  acceptable break:
9864                             [ \s = ]          # space or equal
9865                             | - (?! [.0-9] )  # or non-unary minus.
9866                         )                     # $1 includes the character
9867                     )/x)
9868                 {
9869                     # Split into the initial part that fits, and remaining
9870                     # part of the input
9871                     $remainder = substr($line[$i], length $1);
9872                     $line[$i] = $1;
9873                     trace $line[$i] if DEBUG && $to_trace;
9874                     trace $remainder if DEBUG && $to_trace;
9875                 }
9876
9877                 # If didn't find a good breaking spot, see if there is a
9878                 # not-so-good breaking spot.  These are just after
9879                 # underscores or where the case changes from lower to
9880                 # upper.  Use \a as a soft hyphen, but give up
9881                 # and don't break the line if there is actually a \a
9882                 # already in the input.  We use an ascii character for the
9883                 # soft-hyphen to avoid any attempt by miniperl to try to
9884                 # access the files that this program is creating.
9885                 elsif ($segment !~ /\a/
9886                        && ($segment =~ s/_/_\a/g
9887                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9888                 {
9889                     # Here were able to find at least one place to insert
9890                     # our substitute soft hyphen.  Find the right-most one
9891                     # and replace it by a real hyphen.
9892                     trace $segment if DEBUG && $to_trace;
9893                     substr($segment,
9894                             rindex($segment, "\a"),
9895                             1) = '-';
9896
9897                     # Then remove the soft hyphen substitutes.
9898                     $segment =~ s/\a//g;
9899                     trace $segment if DEBUG && $to_trace;
9900
9901                     # And split into the initial part that fits, and
9902                     # remainder of the line
9903                     my $pos = rindex($segment, '-');
9904                     $remainder = substr($line[$i], $pos);
9905                     trace $remainder if DEBUG && $to_trace;
9906                     $line[$i] = substr($segment, 0, $pos + 1);
9907                 }
9908             }
9909
9910             # Here we know if we can fold or not.  If we can, $remainder
9911             # is what remains to be processed in the next iteration.
9912             if (defined $remainder) {
9913                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9914
9915                 # Insert the folded remainder of the line as a new element
9916                 # of the array.  (It may still be too long, but we will
9917                 # deal with that next time through the loop.)  Omit any
9918                 # leading space in the remainder.
9919                 $remainder =~ s/^\s+//;
9920                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9921
9922                 # But then indent by whichever is larger of:
9923                 # 1) the leading space on the input line;
9924                 # 2) the hanging indent.
9925                 # This preserves indentation in the original line.
9926                 my $lead = ($leading_space)
9927                             ? length $leading_space
9928                             : $hanging_indent;
9929                 $lead = max($lead, $hanging_indent);
9930                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9931             }
9932         }
9933
9934         # Ready to output the line. Get rid of any trailing space
9935         # And prefix by the required $prefix passed in.
9936         $line[$i] =~ s/\s+$//;
9937         $line[$i] = "$prefix$line[$i]\n";
9938     } # End of looping through all the lines.
9939
9940     return join "", @line;
9941 }
9942
9943 sub property_ref {  # Returns a reference to a property object.
9944     return Property::property_ref(@_);
9945 }
9946
9947 sub force_unlink ($) {
9948     my $filename = shift;
9949     return unless file_exists($filename);
9950     return if CORE::unlink($filename);
9951
9952     # We might need write permission
9953     chmod 0777, $filename;
9954     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9955     return;
9956 }
9957
9958 sub write ($$@) {
9959     # Given a filename and references to arrays of lines, write the lines of
9960     # each array to the file
9961     # Filename can be given as an arrayref of directory names
9962
9963     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9964
9965     my $file  = shift;
9966     my $use_utf8 = shift;
9967
9968     # Get into a single string if an array, and get rid of, in Unix terms, any
9969     # leading '.'
9970     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9971     $file = File::Spec->canonpath($file);
9972
9973     # If has directories, make sure that they all exist
9974     (undef, my $directories, undef) = File::Spec->splitpath($file);
9975     File::Path::mkpath($directories) if $directories && ! -d $directories;
9976
9977     push @files_actually_output, $file;
9978
9979     force_unlink ($file);
9980
9981     my $OUT;
9982     if (not open $OUT, ">", $file) {
9983         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9984         return;
9985     }
9986
9987     binmode $OUT, ":utf8" if $use_utf8;
9988
9989     while (defined (my $lines_ref = shift)) {
9990         unless (@$lines_ref) {
9991             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9992         }
9993
9994         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9995     }
9996     close $OUT or die Carp::my_carp("close '$file' failed: $!");
9997
9998     print "$file written.\n" if $verbosity >= $VERBOSE;
9999
10000     return;
10001 }
10002
10003
10004 sub Standardize($) {
10005     # This converts the input name string into a standardized equivalent to
10006     # use internally.
10007
10008     my $name = shift;
10009     unless (defined $name) {
10010       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
10011       return;
10012     }
10013
10014     # Remove any leading or trailing white space
10015     $name =~ s/^\s+//g;
10016     $name =~ s/\s+$//g;
10017
10018     # Convert interior white space and hyphens into underscores.
10019     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
10020
10021     # Capitalize the letter following an underscore, and convert a sequence of
10022     # multiple underscores to a single one
10023     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
10024
10025     # And capitalize the first letter, but not for the special cjk ones.
10026     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
10027     return $name;
10028 }
10029
10030 sub standardize ($) {
10031     # Returns a lower-cased standardized name, without underscores.  This form
10032     # is chosen so that it can distinguish between any real versus superficial
10033     # Unicode name differences.  It relies on the fact that Unicode doesn't
10034     # have interior underscores, white space, nor dashes in any
10035     # stricter-matched name.  It should not be used on Unicode code point
10036     # names (the Name property), as they mostly, but not always follow these
10037     # rules.
10038
10039     my $name = Standardize(shift);
10040     return if !defined $name;
10041
10042     $name =~ s/ (?<= .) _ (?= . ) //xg;
10043     return lc $name;
10044 }
10045
10046 sub UCD_name ($$) {
10047     # Returns the name that Unicode::UCD will use to find a table.  XXX
10048     # perhaps this function should be placed somewhere, like UCD.pm so that
10049     # Unicode::UCD can use it directly without duplicating code that can get
10050     # out-of sync.
10051
10052     my $table = shift;
10053     my $alias = shift;
10054     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10055
10056     my $property = $table->property;
10057     $property = ($property == $perl)
10058                 ? ""                # 'perl' is never explicitly stated
10059                 : standardize($property->name) . '=';
10060     if ($alias->loose_match) {
10061         return $property . standardize($alias->name);
10062     }
10063     else {
10064         return lc ($property . $alias->name);
10065     }
10066
10067     return;
10068 }
10069
10070 {   # Closure
10071
10072     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
10073     %main::already_output = ();
10074
10075     $main::simple_dumper_nesting = 0;
10076
10077     sub simple_dumper {
10078         # Like Simple Data::Dumper. Good enough for our needs. We can't use
10079         # the real thing as we have to run under miniperl.
10080
10081         # It is designed so that on input it is at the beginning of a line,
10082         # and the final thing output in any call is a trailing ",\n".
10083
10084         my $item = shift;
10085         my $indent = shift;
10086         $indent = "" if ! $debugging_build || ! defined $indent;
10087
10088         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10089
10090         # nesting level is localized, so that as the call stack pops, it goes
10091         # back to the prior value.
10092         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
10093         local %main::already_output = %main::already_output;
10094         $main::simple_dumper_nesting++;
10095         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
10096
10097         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10098
10099         # Determine the indent for recursive calls.
10100         my $next_indent = $indent . $indent_increment;
10101
10102         my $output;
10103         if (! ref $item) {
10104
10105             # Dump of scalar: just output it in quotes if not a number.  To do
10106             # so we must escape certain characters, and therefore need to
10107             # operate on a copy to avoid changing the original
10108             my $copy = $item;
10109             $copy = $UNDEF unless defined $copy;
10110
10111             # Quote non-integers (integers also have optional leading '-')
10112             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
10113
10114                 # Escape apostrophe and backslash
10115                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
10116                 $copy = "'$copy'";
10117             }
10118             $output = "$indent$copy,\n";
10119         }
10120         else {
10121
10122             # Keep track of cycles in the input, and refuse to infinitely loop
10123             my $addr = do { no overloading; pack 'J', $item; };
10124             if (defined $main::already_output{$addr}) {
10125                 return "${indent}ALREADY OUTPUT: $item\n";
10126             }
10127             $main::already_output{$addr} = $item;
10128
10129             if (ref $item eq 'ARRAY') {
10130                 my $using_brackets;
10131                 $output = $indent;
10132                 if ($main::simple_dumper_nesting > 1) {
10133                     $output .= '[';
10134                     $using_brackets = 1;
10135                 }
10136                 else {
10137                     $using_brackets = 0;
10138                 }
10139
10140                 # If the array is empty, put the closing bracket on the same
10141                 # line.  Otherwise, recursively add each array element
10142                 if (@$item == 0) {
10143                     $output .= " ";
10144                 }
10145                 else {
10146                     $output .= "\n";
10147                     for (my $i = 0; $i < @$item; $i++) {
10148
10149                         # Indent array elements one level
10150                         $output .= &simple_dumper($item->[$i], $next_indent);
10151                         next if ! $debugging_build;
10152                         $output =~ s/\n$//;      # Remove any trailing nl so
10153                         $output .= " # [$i]\n";  # as to add a comment giving
10154                                                  # the array index
10155                     }
10156                     $output .= $indent;     # Indent closing ']' to orig level
10157                 }
10158                 $output .= ']' if $using_brackets;
10159                 $output .= ",\n";
10160             }
10161             elsif (ref $item eq 'HASH') {
10162                 my $is_first_line;
10163                 my $using_braces;
10164                 my $body_indent;
10165
10166                 # No surrounding braces at top level
10167                 $output .= $indent;
10168                 if ($main::simple_dumper_nesting > 1) {
10169                     $output .= "{\n";
10170                     $is_first_line = 0;
10171                     $body_indent = $next_indent;
10172                     $next_indent .= $indent_increment;
10173                     $using_braces = 1;
10174                 }
10175                 else {
10176                     $is_first_line = 1;
10177                     $body_indent = $indent;
10178                     $using_braces = 0;
10179                 }
10180
10181                 # Output hashes sorted alphabetically instead of apparently
10182                 # random.  Use caseless alphabetic sort
10183                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
10184                 {
10185                     if ($is_first_line) {
10186                         $is_first_line = 0;
10187                     }
10188                     else {
10189                         $output .= "$body_indent";
10190                     }
10191
10192                     # The key must be a scalar, but this recursive call quotes
10193                     # it
10194                     $output .= &simple_dumper($key);
10195
10196                     # And change the trailing comma and nl to the hash fat
10197                     # comma for clarity, and so the value can be on the same
10198                     # line
10199                     $output =~ s/,\n$/ => /;
10200
10201                     # Recursively call to get the value's dump.
10202                     my $next = &simple_dumper($item->{$key}, $next_indent);
10203
10204                     # If the value is all on one line, remove its indent, so
10205                     # will follow the => immediately.  If it takes more than
10206                     # one line, start it on a new line.
10207                     if ($next !~ /\n.*\n/) {
10208                         $next =~ s/^ *//;
10209                     }
10210                     else {
10211                         $output .= "\n";
10212                     }
10213                     $output .= $next;
10214                 }
10215
10216                 $output .= "$indent},\n" if $using_braces;
10217             }
10218             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
10219                 $output = $indent . ref($item) . "\n";
10220                 # XXX see if blessed
10221             }
10222             elsif ($item->can('dump')) {
10223
10224                 # By convention in this program, objects furnish a 'dump'
10225                 # method.  Since not doing any output at this level, just pass
10226                 # on the input indent
10227                 $output = $item->dump($indent);
10228             }
10229             else {
10230                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
10231             }
10232         }
10233         return $output;
10234     }
10235 }
10236
10237 sub dump_inside_out {
10238     # Dump inside-out hashes in an object's state by converting them to a
10239     # regular hash and then calling simple_dumper on that.
10240
10241     my $object = shift;
10242     my $fields_ref = shift;
10243
10244     my $addr = do { no overloading; pack 'J', $object; };
10245
10246     my %hash;
10247     foreach my $key (keys %$fields_ref) {
10248         $hash{$key} = $fields_ref->{$key}{$addr};
10249     }
10250
10251     return simple_dumper(\%hash, @_);
10252 }
10253
10254 sub _operator_dot {
10255     # Overloaded '.' method that is common to all packages.  It uses the
10256     # package's stringify method.
10257
10258     my $self = shift;
10259     my $other = shift;
10260     my $reversed = shift;
10261     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10262
10263     $other = "" unless defined $other;
10264
10265     foreach my $which (\$self, \$other) {
10266         next unless ref $$which;
10267         if ($$which->can('_operator_stringify')) {
10268             $$which = $$which->_operator_stringify;
10269         }
10270         else {
10271             my $ref = ref $$which;
10272             my $addr = do { no overloading; pack 'J', $$which; };
10273             $$which = "$ref ($addr)";
10274         }
10275     }
10276     return ($reversed)
10277             ? "$other$self"
10278             : "$self$other";
10279 }
10280
10281 sub _operator_dot_equal {
10282     # Overloaded '.=' method that is common to all packages.
10283
10284     my $self = shift;
10285     my $other = shift;
10286     my $reversed = shift;
10287     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10288
10289     $other = "" unless defined $other;
10290
10291     if ($reversed) {
10292         return $other .= "$self";
10293     }
10294     else {
10295         return "$self" . "$other";
10296     }
10297 }
10298
10299 sub _operator_equal {
10300     # Generic overloaded '==' routine.  To be equal, they must be the exact
10301     # same object
10302
10303     my $self = shift;
10304     my $other = shift;
10305
10306     return 0 unless defined $other;
10307     return 0 unless ref $other;
10308     no overloading;
10309     return $self == $other;
10310 }
10311
10312 sub _operator_not_equal {
10313     my $self = shift;
10314     my $other = shift;
10315
10316     return ! _operator_equal($self, $other);
10317 }
10318
10319 sub substitute_PropertyAliases($) {
10320     # Deal with early releases that don't have the crucial PropertyAliases.txt
10321     # file.
10322
10323     my $file_object = shift;
10324     $file_object->insert_lines(get_old_property_aliases());
10325
10326     process_PropertyAliases($file_object);
10327 }
10328
10329
10330 sub process_PropertyAliases($) {
10331     # This reads in the PropertyAliases.txt file, which contains almost all
10332     # the character properties in Unicode and their equivalent aliases:
10333     # scf       ; Simple_Case_Folding         ; sfc
10334     #
10335     # Field 0 is the preferred short name for the property.
10336     # Field 1 is the full name.
10337     # Any succeeding ones are other accepted names.
10338
10339     my $file= shift;
10340     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10341
10342     # Add any cjk properties that may have been defined.
10343     $file->insert_lines(@cjk_properties);
10344
10345     while ($file->next_line) {
10346
10347         my @data = split /\s*;\s*/;
10348
10349         my $full = $data[1];
10350
10351         # This line is defective in early Perls.  The property in Unihan.txt
10352         # is kRSUnicode.
10353         if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
10354             push @data, qw(cjkRSUnicode kRSUnicode);
10355         }
10356
10357         my $this = Property->new($data[0], Full_Name => $full);
10358
10359         $this->set_fate($SUPPRESSED, $why_suppressed{$full})
10360                                                     if $why_suppressed{$full};
10361
10362         # Start looking for more aliases after these two.
10363         for my $i (2 .. @data - 1) {
10364             $this->add_alias($data[$i]);
10365         }
10366
10367     }
10368
10369     my $scf = property_ref("Simple_Case_Folding");
10370     $scf->add_alias("scf");
10371     $scf->add_alias("sfc");
10372
10373     return;
10374 }
10375
10376 sub finish_property_setup {
10377     # Finishes setting up after PropertyAliases.
10378
10379     my $file = shift;
10380     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10381
10382     # This entry was missing from this file in earlier Unicode versions
10383     if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10384         Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10385     }
10386
10387     # These are used so much, that we set globals for them.
10388     $gc = property_ref('General_Category');
10389     $block = property_ref('Block');
10390     $script = property_ref('Script');
10391     $age = property_ref('Age');
10392
10393     # Perl adds this alias.
10394     $gc->add_alias('Category');
10395
10396     # Unicode::Normalize expects this file with this name and directory.
10397     $ccc = property_ref('Canonical_Combining_Class');
10398     if (defined $ccc) {
10399         $ccc->set_file('CombiningClass');
10400         $ccc->set_directory(File::Spec->curdir());
10401     }
10402
10403     # These two properties aren't actually used in the core, but unfortunately
10404     # the names just above that are in the core interfere with these, so
10405     # choose different names.  These aren't a problem unless the map tables
10406     # for these files get written out.
10407     my $lowercase = property_ref('Lowercase');
10408     $lowercase->set_file('IsLower') if defined $lowercase;
10409     my $uppercase = property_ref('Uppercase');
10410     $uppercase->set_file('IsUpper') if defined $uppercase;
10411
10412     # Set up the hard-coded default mappings, but only on properties defined
10413     # for this release
10414     foreach my $property (keys %default_mapping) {
10415         my $property_object = property_ref($property);
10416         next if ! defined $property_object;
10417         my $default_map = $default_mapping{$property};
10418         $property_object->set_default_map($default_map);
10419
10420         # A map of <code point> implies the property is string.
10421         if ($property_object->type == $UNKNOWN
10422             && $default_map eq $CODE_POINT)
10423         {
10424             $property_object->set_type($STRING);
10425         }
10426     }
10427
10428     # The following use the Multi_Default class to create objects for
10429     # defaults.
10430
10431     # Bidi class has a complicated default, but the derived file takes care of
10432     # the complications, leaving just 'L'.
10433     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10434         property_ref('Bidi_Class')->set_default_map('L');
10435     }
10436     else {
10437         my $default;
10438
10439         # The derived file was introduced in 3.1.1.  The values below are
10440         # taken from table 3-8, TUS 3.0
10441         my $default_R =
10442             'my $default = Range_List->new;
10443              $default->add_range(0x0590, 0x05FF);
10444              $default->add_range(0xFB1D, 0xFB4F);'
10445         ;
10446
10447         # The defaults apply only to unassigned characters
10448         $default_R .= '$gc->table("Unassigned") & $default;';
10449
10450         if ($v_version lt v3.0.0) {
10451             $default = Multi_Default->new(R => $default_R, 'L');
10452         }
10453         else {
10454
10455             # AL apparently not introduced until 3.0:  TUS 2.x references are
10456             # not on-line to check it out
10457             my $default_AL =
10458                 'my $default = Range_List->new;
10459                  $default->add_range(0x0600, 0x07BF);
10460                  $default->add_range(0xFB50, 0xFDFF);
10461                  $default->add_range(0xFE70, 0xFEFF);'
10462             ;
10463
10464             # Non-character code points introduced in this release; aren't AL
10465             if ($v_version ge 3.1.0) {
10466                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10467             }
10468             $default_AL .= '$gc->table("Unassigned") & $default';
10469             $default = Multi_Default->new(AL => $default_AL,
10470                                           R => $default_R,
10471                                           'L');
10472         }
10473         property_ref('Bidi_Class')->set_default_map($default);
10474     }
10475
10476     # Joining type has a complicated default, but the derived file takes care
10477     # of the complications, leaving just 'U' (or Non_Joining), except the file
10478     # is bad in 3.1.0
10479     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10480         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10481             property_ref('Joining_Type')->set_default_map('Non_Joining');
10482         }
10483         else {
10484
10485             # Otherwise, there are not one, but two possibilities for the
10486             # missing defaults: T and U.
10487             # The missing defaults that evaluate to T are given by:
10488             # T = Mn + Cf - ZWNJ - ZWJ
10489             # where Mn and Cf are the general category values. In other words,
10490             # any non-spacing mark or any format control character, except
10491             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10492             # WIDTH JOINER (joining type C).
10493             my $default = Multi_Default->new(
10494                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10495                'Non_Joining');
10496             property_ref('Joining_Type')->set_default_map($default);
10497         }
10498     }
10499
10500     # Line break has a complicated default in early releases. It is 'Unknown'
10501     # for non-assigned code points; 'AL' for assigned.
10502     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10503         my $lb = property_ref('Line_Break');
10504         if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10505             $lb->set_default_map('Unknown');
10506         }
10507         else {
10508             my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10509                                              'Unknown',
10510                                             );
10511             $lb->set_default_map($default);
10512         }
10513     }
10514
10515     # For backwards compatibility with applications that may read the mapping
10516     # file directly (it was documented in 5.12 and 5.14 as being thusly
10517     # usable), keep it from being adjusted.  (range_size_1 is
10518     # used to force the traditional format.)
10519     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10520         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10521         $nfkc_cf->set_range_size_1(1);
10522     }
10523     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10524         $bmg->set_to_output_map($EXTERNAL_MAP);
10525         $bmg->set_range_size_1(1);
10526     }
10527
10528     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10529
10530     return;
10531 }
10532
10533 sub get_old_property_aliases() {
10534     # Returns what would be in PropertyAliases.txt if it existed in very old
10535     # versions of Unicode.  It was derived from the one in 3.2, and pared
10536     # down based on the data that was actually in the older releases.
10537     # An attempt was made to use the existence of files to mean inclusion or
10538     # not of various aliases, but if this was not sufficient, using version
10539     # numbers was resorted to.
10540
10541     my @return;
10542
10543     # These are to be used in all versions (though some are constructed by
10544     # this program if missing)
10545     push @return, split /\n/, <<'END';
10546 bc        ; Bidi_Class
10547 Bidi_M    ; Bidi_Mirrored
10548 cf        ; Case_Folding
10549 ccc       ; Canonical_Combining_Class
10550 dm        ; Decomposition_Mapping
10551 dt        ; Decomposition_Type
10552 gc        ; General_Category
10553 isc       ; ISO_Comment
10554 lc        ; Lowercase_Mapping
10555 na        ; Name
10556 na1       ; Unicode_1_Name
10557 nt        ; Numeric_Type
10558 nv        ; Numeric_Value
10559 scf       ; Simple_Case_Folding
10560 slc       ; Simple_Lowercase_Mapping
10561 stc       ; Simple_Titlecase_Mapping
10562 suc       ; Simple_Uppercase_Mapping
10563 tc        ; Titlecase_Mapping
10564 uc        ; Uppercase_Mapping
10565 END
10566
10567     if (-e 'Blocks.txt') {
10568         push @return, "blk       ; Block\n";
10569     }
10570     if (-e 'ArabicShaping.txt') {
10571         push @return, split /\n/, <<'END';
10572 jg        ; Joining_Group
10573 jt        ; Joining_Type
10574 END
10575     }
10576     if (-e 'PropList.txt') {
10577
10578         # This first set is in the original old-style proplist.
10579         push @return, split /\n/, <<'END';
10580 Bidi_C    ; Bidi_Control
10581 Dash      ; Dash
10582 Dia       ; Diacritic
10583 Ext       ; Extender
10584 Hex       ; Hex_Digit
10585 Hyphen    ; Hyphen
10586 IDC       ; ID_Continue
10587 Ideo      ; Ideographic
10588 Join_C    ; Join_Control
10589 Math      ; Math
10590 QMark     ; Quotation_Mark
10591 Term      ; Terminal_Punctuation
10592 WSpace    ; White_Space
10593 END
10594         # The next sets were added later
10595         if ($v_version ge v3.0.0) {
10596             push @return, split /\n/, <<'END';
10597 Upper     ; Uppercase
10598 Lower     ; Lowercase
10599 END
10600         }
10601         if ($v_version ge v3.0.1) {
10602             push @return, split /\n/, <<'END';
10603 NChar     ; Noncharacter_Code_Point
10604 END
10605         }
10606         # The next sets were added in the new-style
10607         if ($v_version ge v3.1.0) {
10608             push @return, split /\n/, <<'END';
10609 OAlpha    ; Other_Alphabetic
10610 OLower    ; Other_Lowercase
10611 OMath     ; Other_Math
10612 OUpper    ; Other_Uppercase
10613 END
10614         }
10615         if ($v_version ge v3.1.1) {
10616             push @return, "AHex      ; ASCII_Hex_Digit\n";
10617         }
10618     }
10619     if (-e 'EastAsianWidth.txt') {
10620         push @return, "ea        ; East_Asian_Width\n";
10621     }
10622     if (-e 'CompositionExclusions.txt') {
10623         push @return, "CE        ; Composition_Exclusion\n";
10624     }
10625     if (-e 'LineBreak.txt') {
10626         push @return, "lb        ; Line_Break\n";
10627     }
10628     if (-e 'BidiMirroring.txt') {
10629         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10630     }
10631     if (-e 'Scripts.txt') {
10632         push @return, "sc        ; Script\n";
10633     }
10634     if (-e 'DNormalizationProps.txt') {
10635         push @return, split /\n/, <<'END';
10636 Comp_Ex   ; Full_Composition_Exclusion
10637 FC_NFKC   ; FC_NFKC_Closure
10638 NFC_QC    ; NFC_Quick_Check
10639 NFD_QC    ; NFD_Quick_Check
10640 NFKC_QC   ; NFKC_Quick_Check
10641 NFKD_QC   ; NFKD_Quick_Check
10642 XO_NFC    ; Expands_On_NFC
10643 XO_NFD    ; Expands_On_NFD
10644 XO_NFKC   ; Expands_On_NFKC
10645 XO_NFKD   ; Expands_On_NFKD
10646 END
10647     }
10648     if (-e 'DCoreProperties.txt') {
10649         push @return, split /\n/, <<'END';
10650 Alpha     ; Alphabetic
10651 IDS       ; ID_Start
10652 XIDC      ; XID_Continue
10653 XIDS      ; XID_Start
10654 END
10655         # These can also appear in some versions of PropList.txt
10656         push @return, "Lower     ; Lowercase\n"
10657                                     unless grep { $_ =~ /^Lower\b/} @return;
10658         push @return, "Upper     ; Uppercase\n"
10659                                     unless grep { $_ =~ /^Upper\b/} @return;
10660     }
10661
10662     # This flag requires the DAge.txt file to be copied into the directory.
10663     if (DEBUG && $compare_versions) {
10664         push @return, 'age       ; Age';
10665     }
10666
10667     return @return;
10668 }
10669
10670 sub substitute_PropValueAliases($) {
10671     # Deal with early releases that don't have the crucial
10672     # PropValueAliases.txt file.
10673
10674     my $file_object = shift;
10675     $file_object->insert_lines(get_old_property_value_aliases());
10676
10677     process_PropValueAliases($file_object);
10678 }
10679
10680 sub process_PropValueAliases {
10681     # This file contains values that properties look like:
10682     # bc ; AL        ; Arabic_Letter
10683     # blk; n/a       ; Greek_And_Coptic                 ; Greek
10684     #
10685     # Field 0 is the property.
10686     # Field 1 is the short name of a property value or 'n/a' if no
10687     #                short name exists;
10688     # Field 2 is the full property value name;
10689     # Any other fields are more synonyms for the property value.
10690     # Purely numeric property values are omitted from the file; as are some
10691     # others, fewer and fewer in later releases
10692
10693     # Entries for the ccc property have an extra field before the
10694     # abbreviation:
10695     # ccc;   0; NR   ; Not_Reordered
10696     # It is the numeric value that the names are synonyms for.
10697
10698     # There are comment entries for values missing from this file:
10699     # # @missing: 0000..10FFFF; ISO_Comment; <none>
10700     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10701
10702     my $file= shift;
10703     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10704
10705     if ($v_version lt 4.0.0) {
10706         $file->insert_lines(split /\n/, <<'END'
10707 Hangul_Syllable_Type; L                                ; Leading_Jamo
10708 Hangul_Syllable_Type; LV                               ; LV_Syllable
10709 Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10710 Hangul_Syllable_Type; NA                               ; Not_Applicable
10711 Hangul_Syllable_Type; T                                ; Trailing_Jamo
10712 Hangul_Syllable_Type; V                                ; Vowel_Jamo
10713 END
10714         );
10715     }
10716     if ($v_version lt 4.1.0) {
10717         $file->insert_lines(split /\n/, <<'END'
10718 _Perl_GCB; CN                               ; Control
10719 _Perl_GCB; CR                               ; CR
10720 _Perl_GCB; EX                               ; Extend
10721 _Perl_GCB; L                                ; L
10722 _Perl_GCB; LF                               ; LF
10723 _Perl_GCB; LV                               ; LV
10724 _Perl_GCB; LVT                              ; LVT
10725 _Perl_GCB; T                                ; T
10726 _Perl_GCB; V                                ; V
10727 _Perl_GCB; XX                               ; Other
10728 END
10729         );
10730     }
10731
10732     # Add any explicit cjk values
10733     $file->insert_lines(@cjk_property_values);
10734
10735     # This line is used only for testing the code that checks for name
10736     # conflicts.  There is a script Inherited, and when this line is executed
10737     # it causes there to be a name conflict with the 'Inherited' that this
10738     # program generates for this block property value
10739     #$file->insert_lines('blk; n/a; Herited');
10740
10741     # Process each line of the file ...
10742     while ($file->next_line) {
10743
10744         # Fix typo in input file
10745         s/CCC133/CCC132/g if $v_version eq v6.1.0;
10746
10747         my ($property, @data) = split /\s*;\s*/;
10748
10749         # The ccc property has an extra field at the beginning, which is the
10750         # numeric value.  Move it to be after the other two, mnemonic, fields,
10751         # so that those will be used as the property value's names, and the
10752         # number will be an extra alias.  (Rightmost splice removes field 1-2,
10753         # returning them in a slice; left splice inserts that before anything,
10754         # thus shifting the former field 0 to after them.)
10755         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10756
10757         if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10758             my $new_style = $data[1] =~ s/-/_/gr;
10759             splice @data, 1, 0, $new_style;
10760         }
10761
10762         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10763         # there is no short name, use the full one in element 1
10764         if ($data[0] eq "n/a") {
10765             $data[0] = $data[1];
10766         }
10767         elsif ($data[0] ne $data[1]
10768                && standardize($data[0]) eq standardize($data[1])
10769                && $data[1] !~ /[[:upper:]]/)
10770         {
10771             # Also, there is a bug in the file in which "n/a" is omitted, and
10772             # the two fields are identical except for case, and the full name
10773             # is all lower case.  Copy the "short" name unto the full one to
10774             # give it some upper case.
10775
10776             $data[1] = $data[0];
10777         }
10778
10779         # Earlier releases had the pseudo property 'qc' that should expand to
10780         # the ones that replace it below.
10781         if ($property eq 'qc') {
10782             if (lc $data[0] eq 'y') {
10783                 $file->insert_lines('NFC_QC; Y      ; Yes',
10784                                     'NFD_QC; Y      ; Yes',
10785                                     'NFKC_QC; Y     ; Yes',
10786                                     'NFKD_QC; Y     ; Yes',
10787                                     );
10788             }
10789             elsif (lc $data[0] eq 'n') {
10790                 $file->insert_lines('NFC_QC; N      ; No',
10791                                     'NFD_QC; N      ; No',
10792                                     'NFKC_QC; N     ; No',
10793                                     'NFKD_QC; N     ; No',
10794                                     );
10795             }
10796             elsif (lc $data[0] eq 'm') {
10797                 $file->insert_lines('NFC_QC; M      ; Maybe',
10798                                     'NFKC_QC; M     ; Maybe',
10799                                     );
10800             }
10801             else {
10802                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
10803             }
10804             next;
10805         }
10806
10807         # The first field is the short name, 2nd is the full one.
10808         my $property_object = property_ref($property);
10809         my $table = $property_object->add_match_table($data[0],
10810                                                 Full_Name => $data[1]);
10811
10812         # Start looking for more aliases after these two.
10813         for my $i (2 .. @data - 1) {
10814             $table->add_alias($data[$i]);
10815         }
10816     } # End of looping through the file
10817
10818     # As noted in the comments early in the program, it generates tables for
10819     # the default values for all releases, even those for which the concept
10820     # didn't exist at the time.  Here we add those if missing.
10821     if (defined $age && ! defined $age->table('Unassigned')) {
10822         $age->add_match_table('Unassigned');
10823     }
10824     $block->add_match_table('No_Block') if -e 'Blocks.txt'
10825                                     && ! defined $block->table('No_Block');
10826
10827
10828     # Now set the default mappings of the properties from the file.  This is
10829     # done after the loop because a number of properties have only @missings
10830     # entries in the file, and may not show up until the end.
10831     my @defaults = $file->get_missings;
10832     foreach my $default_ref (@defaults) {
10833         my $default = $default_ref->[0];
10834         my $property = property_ref($default_ref->[1]);
10835         $property->set_default_map($default);
10836     }
10837     return;
10838 }
10839
10840 sub get_old_property_value_aliases () {
10841     # Returns what would be in PropValueAliases.txt if it existed in very old
10842     # versions of Unicode.  It was derived from the one in 3.2, and pared
10843     # down.  An attempt was made to use the existence of files to mean
10844     # inclusion or not of various aliases, but if this was not sufficient,
10845     # using version numbers was resorted to.
10846
10847     my @return = split /\n/, <<'END';
10848 bc ; AN        ; Arabic_Number
10849 bc ; B         ; Paragraph_Separator
10850 bc ; CS        ; Common_Separator
10851 bc ; EN        ; European_Number
10852 bc ; ES        ; European_Separator
10853 bc ; ET        ; European_Terminator
10854 bc ; L         ; Left_To_Right
10855 bc ; ON        ; Other_Neutral
10856 bc ; R         ; Right_To_Left
10857 bc ; WS        ; White_Space
10858
10859 Bidi_M; N; No; F; False
10860 Bidi_M; Y; Yes; T; True
10861
10862 # The standard combining classes are very much different in v1, so only use
10863 # ones that look right (not checked thoroughly)
10864 ccc;   0; NR   ; Not_Reordered
10865 ccc;   1; OV   ; Overlay
10866 ccc;   7; NK   ; Nukta
10867 ccc;   8; KV   ; Kana_Voicing
10868 ccc;   9; VR   ; Virama
10869 ccc; 202; ATBL ; Attached_Below_Left
10870 ccc; 216; ATAR ; Attached_Above_Right
10871 ccc; 218; BL   ; Below_Left
10872 ccc; 220; B    ; Below
10873 ccc; 222; BR   ; Below_Right
10874 ccc; 224; L    ; Left
10875 ccc; 228; AL   ; Above_Left
10876 ccc; 230; A    ; Above
10877 ccc; 232; AR   ; Above_Right
10878 ccc; 234; DA   ; Double_Above
10879
10880 dt ; can       ; canonical
10881 dt ; enc       ; circle
10882 dt ; fin       ; final
10883 dt ; font      ; font
10884 dt ; fra       ; fraction
10885 dt ; init      ; initial
10886 dt ; iso       ; isolated
10887 dt ; med       ; medial
10888 dt ; n/a       ; none
10889 dt ; nb        ; noBreak
10890 dt ; sqr       ; square
10891 dt ; sub       ; sub
10892 dt ; sup       ; super
10893
10894 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10895 gc ; Cc        ; Control
10896 gc ; Cn        ; Unassigned
10897 gc ; Co        ; Private_Use
10898 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10899 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10900 gc ; Ll        ; Lowercase_Letter
10901 gc ; Lm        ; Modifier_Letter
10902 gc ; Lo        ; Other_Letter
10903 gc ; Lu        ; Uppercase_Letter
10904 gc ; M         ; Mark                             # Mc | Me | Mn
10905 gc ; Mc        ; Spacing_Mark
10906 gc ; Mn        ; Nonspacing_Mark
10907 gc ; N         ; Number                           # Nd | Nl | No
10908 gc ; Nd        ; Decimal_Number
10909 gc ; No        ; Other_Number
10910 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10911 gc ; Pd        ; Dash_Punctuation
10912 gc ; Pe        ; Close_Punctuation
10913 gc ; Po        ; Other_Punctuation
10914 gc ; Ps        ; Open_Punctuation
10915 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10916 gc ; Sc        ; Currency_Symbol
10917 gc ; Sm        ; Math_Symbol
10918 gc ; So        ; Other_Symbol
10919 gc ; Z         ; Separator                        # Zl | Zp | Zs
10920 gc ; Zl        ; Line_Separator
10921 gc ; Zp        ; Paragraph_Separator
10922 gc ; Zs        ; Space_Separator
10923
10924 nt ; de        ; Decimal
10925 nt ; di        ; Digit
10926 nt ; n/a       ; None
10927 nt ; nu        ; Numeric
10928 END
10929
10930     if (-e 'ArabicShaping.txt') {
10931         push @return, split /\n/, <<'END';
10932 jg ; n/a       ; AIN
10933 jg ; n/a       ; ALEF
10934 jg ; n/a       ; DAL
10935 jg ; n/a       ; GAF
10936 jg ; n/a       ; LAM
10937 jg ; n/a       ; MEEM
10938 jg ; n/a       ; NO_JOINING_GROUP
10939 jg ; n/a       ; NOON
10940 jg ; n/a       ; QAF
10941 jg ; n/a       ; SAD
10942 jg ; n/a       ; SEEN
10943 jg ; n/a       ; TAH
10944 jg ; n/a       ; WAW
10945
10946 jt ; C         ; Join_Causing
10947 jt ; D         ; Dual_Joining
10948 jt ; L         ; Left_Joining
10949 jt ; R         ; Right_Joining
10950 jt ; U         ; Non_Joining
10951 jt ; T         ; Transparent
10952 END
10953         if ($v_version ge v3.0.0) {
10954             push @return, split /\n/, <<'END';
10955 jg ; n/a       ; ALAPH
10956 jg ; n/a       ; BEH
10957 jg ; n/a       ; BETH
10958 jg ; n/a       ; DALATH_RISH
10959 jg ; n/a       ; E
10960 jg ; n/a       ; FEH
10961 jg ; n/a       ; FINAL_SEMKATH
10962 jg ; n/a       ; GAMAL
10963 jg ; n/a       ; HAH
10964 jg ; n/a       ; HAMZA_ON_HEH_GOAL
10965 jg ; n/a       ; HE
10966 jg ; n/a       ; HEH
10967 jg ; n/a       ; HEH_GOAL
10968 jg ; n/a       ; HETH
10969 jg ; n/a       ; KAF
10970 jg ; n/a       ; KAPH
10971 jg ; n/a       ; KNOTTED_HEH
10972 jg ; n/a       ; LAMADH
10973 jg ; n/a       ; MIM
10974 jg ; n/a       ; NUN
10975 jg ; n/a       ; PE
10976 jg ; n/a       ; QAPH
10977 jg ; n/a       ; REH
10978 jg ; n/a       ; REVERSED_PE
10979 jg ; n/a       ; SADHE
10980 jg ; n/a       ; SEMKATH
10981 jg ; n/a       ; SHIN
10982 jg ; n/a       ; SWASH_KAF
10983 jg ; n/a       ; TAW
10984 jg ; n/a       ; TEH_MARBUTA
10985 jg ; n/a       ; TETH
10986 jg ; n/a       ; YEH
10987 jg ; n/a       ; YEH_BARREE
10988 jg ; n/a       ; YEH_WITH_TAIL
10989 jg ; n/a       ; YUDH
10990 jg ; n/a       ; YUDH_HE
10991 jg ; n/a       ; ZAIN
10992 END
10993         }
10994     }
10995
10996
10997     if (-e 'EastAsianWidth.txt') {
10998         push @return, split /\n/, <<'END';
10999 ea ; A         ; Ambiguous
11000 ea ; F         ; Fullwidth
11001 ea ; H         ; Halfwidth
11002 ea ; N         ; Neutral
11003 ea ; Na        ; Narrow
11004 ea ; W         ; Wide
11005 END
11006     }
11007
11008     if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
11009         my @lb = split /\n/, <<'END';
11010 lb ; AI        ; Ambiguous
11011 lb ; AL        ; Alphabetic
11012 lb ; B2        ; Break_Both
11013 lb ; BA        ; Break_After
11014 lb ; BB        ; Break_Before
11015 lb ; BK        ; Mandatory_Break
11016 lb ; CB        ; Contingent_Break
11017 lb ; CL        ; Close_Punctuation
11018 lb ; CM        ; Combining_Mark
11019 lb ; CR        ; Carriage_Return
11020 lb ; EX        ; Exclamation
11021 lb ; GL        ; Glue
11022 lb ; HY        ; Hyphen
11023 lb ; ID        ; Ideographic
11024 lb ; IN        ; Inseperable
11025 lb ; IS        ; Infix_Numeric
11026 lb ; LF        ; Line_Feed
11027 lb ; NS        ; Nonstarter
11028 lb ; NU        ; Numeric
11029 lb ; OP        ; Open_Punctuation
11030 lb ; PO        ; Postfix_Numeric
11031 lb ; PR        ; Prefix_Numeric
11032 lb ; QU        ; Quotation
11033 lb ; SA        ; Complex_Context
11034 lb ; SG        ; Surrogate
11035 lb ; SP        ; Space
11036 lb ; SY        ; Break_Symbols
11037 lb ; XX        ; Unknown
11038 lb ; ZW        ; ZWSpace
11039 END
11040         # If this Unicode version predates the lb property, we use our
11041         # substitute one
11042         if (-e 'LBsubst.txt') {
11043             $_ = s/^lb/_Perl_LB/r for @lb;
11044         }
11045         push @return, @lb;
11046     }
11047
11048     if (-e 'DNormalizationProps.txt') {
11049         push @return, split /\n/, <<'END';
11050 qc ; M         ; Maybe
11051 qc ; N         ; No
11052 qc ; Y         ; Yes
11053 END
11054     }
11055
11056     if (-e 'Scripts.txt') {
11057         push @return, split /\n/, <<'END';
11058 sc ; Arab      ; Arabic
11059 sc ; Armn      ; Armenian
11060 sc ; Beng      ; Bengali
11061 sc ; Bopo      ; Bopomofo
11062 sc ; Cans      ; Canadian_Aboriginal
11063 sc ; Cher      ; Cherokee
11064 sc ; Cyrl      ; Cyrillic
11065 sc ; Deva      ; Devanagari
11066 sc ; Dsrt      ; Deseret
11067 sc ; Ethi      ; Ethiopic
11068 sc ; Geor      ; Georgian
11069 sc ; Goth      ; Gothic
11070 sc ; Grek      ; Greek
11071 sc ; Gujr      ; Gujarati
11072 sc ; Guru      ; Gurmukhi
11073 sc ; Hang      ; Hangul
11074 sc ; Hani      ; Han
11075 sc ; Hebr      ; Hebrew
11076 sc ; Hira      ; Hiragana
11077 sc ; Ital      ; Old_Italic
11078 sc ; Kana      ; Katakana
11079 sc ; Khmr      ; Khmer
11080 sc ; Knda      ; Kannada
11081 sc ; Laoo      ; Lao
11082 sc ; Latn      ; Latin
11083 sc ; Mlym      ; Malayalam
11084 sc ; Mong      ; Mongolian
11085 sc ; Mymr      ; Myanmar
11086 sc ; Ogam      ; Ogham
11087 sc ; Orya      ; Oriya
11088 sc ; Qaai      ; Inherited
11089 sc ; Runr      ; Runic
11090 sc ; Sinh      ; Sinhala
11091 sc ; Syrc      ; Syriac
11092 sc ; Taml      ; Tamil
11093 sc ; Telu      ; Telugu
11094 sc ; Thaa      ; Thaana
11095 sc ; Thai      ; Thai
11096 sc ; Tibt      ; Tibetan
11097 sc ; Yiii      ; Yi
11098 sc ; Zyyy      ; Common
11099 END
11100     }
11101
11102     if ($v_version ge v2.0.0) {
11103         push @return, split /\n/, <<'END';
11104 dt ; com       ; compat
11105 dt ; nar       ; narrow
11106 dt ; sml       ; small
11107 dt ; vert      ; vertical
11108 dt ; wide      ; wide
11109
11110 gc ; Cf        ; Format
11111 gc ; Cs        ; Surrogate
11112 gc ; Lt        ; Titlecase_Letter
11113 gc ; Me        ; Enclosing_Mark
11114 gc ; Nl        ; Letter_Number
11115 gc ; Pc        ; Connector_Punctuation
11116 gc ; Sk        ; Modifier_Symbol
11117 END
11118     }
11119     if ($v_version ge v2.1.2) {
11120         push @return, "bc ; S         ; Segment_Separator\n";
11121     }
11122     if ($v_version ge v2.1.5) {
11123         push @return, split /\n/, <<'END';
11124 gc ; Pf        ; Final_Punctuation
11125 gc ; Pi        ; Initial_Punctuation
11126 END
11127     }
11128     if ($v_version ge v2.1.8) {
11129         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
11130     }
11131
11132     if ($v_version ge v3.0.0) {
11133         push @return, split /\n/, <<'END';
11134 bc ; AL        ; Arabic_Letter
11135 bc ; BN        ; Boundary_Neutral
11136 bc ; LRE       ; Left_To_Right_Embedding
11137 bc ; LRO       ; Left_To_Right_Override
11138 bc ; NSM       ; Nonspacing_Mark
11139 bc ; PDF       ; Pop_Directional_Format
11140 bc ; RLE       ; Right_To_Left_Embedding
11141 bc ; RLO       ; Right_To_Left_Override
11142
11143 ccc; 233; DB   ; Double_Below
11144 END
11145     }
11146
11147     if ($v_version ge v3.1.0) {
11148         push @return, "ccc; 226; R    ; Right\n";
11149     }
11150
11151     return @return;
11152 }
11153
11154 sub process_NormalizationsTest {
11155
11156     # Each line looks like:
11157     #      source code point; NFC; NFD; NFKC; NFKD
11158     # e.g.
11159     #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
11160
11161     my $file= shift;
11162     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11163
11164     # Process each line of the file ...
11165     while ($file->next_line) {
11166
11167         next if /^@/;
11168
11169         my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
11170
11171         foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
11172             $$var = pack "U0U*", map { hex } split " ", $$var;
11173             $$var =~ s/(\\)/$1$1/g;
11174         }
11175
11176         push @normalization_tests,
11177                 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
11178     } # End of looping through the file
11179 }
11180
11181 sub output_perl_charnames_line ($$) {
11182
11183     # Output the entries in Perl_charnames specially, using 5 digits instead
11184     # of four.  This makes the entries a constant length, and simplifies
11185     # charnames.pm which this table is for.  Unicode can have 6 digit
11186     # ordinals, but they are all private use or noncharacters which do not
11187     # have names, so won't be in this table.
11188
11189     return sprintf "%05X\t%s\n", $_[0], $_[1];
11190 }
11191
11192 { # Closure
11193
11194     # These are constants to the $property_info hash in this subroutine, to
11195     # avoid using a quoted-string which might have a typo.
11196     my $TYPE  = 'type';
11197     my $DEFAULT_MAP = 'default_map';
11198     my $DEFAULT_TABLE = 'default_table';
11199     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
11200     my $MISSINGS = 'missings';
11201
11202     sub process_generic_property_file {
11203         # This processes a file containing property mappings and puts them
11204         # into internal map tables.  It should be used to handle any property
11205         # files that have mappings from a code point or range thereof to
11206         # something else.  This means almost all the UCD .txt files.
11207         # each_line_handlers() should be set to adjust the lines of these
11208         # files, if necessary, to what this routine understands:
11209         #
11210         # 0374          ; NFD_QC; N
11211         # 003C..003E    ; Math
11212         #
11213         # the fields are: "codepoint-range ; property; map"
11214         #
11215         # meaning the codepoints in the range all have the value 'map' under
11216         # 'property'.
11217         # Beginning and trailing white space in each field are not significant.
11218         # Note there is not a trailing semi-colon in the above.  A trailing
11219         # semi-colon means the map is a null-string.  An omitted map, as
11220         # opposed to a null-string, is assumed to be 'Y', based on Unicode
11221         # table syntax.  (This could have been hidden from this routine by
11222         # doing it in the $file object, but that would require parsing of the
11223         # line there, so would have to parse it twice, or change the interface
11224         # to pass this an array.  So not done.)
11225         #
11226         # The map field may begin with a sequence of commands that apply to
11227         # this range.  Each such command begins and ends with $CMD_DELIM.
11228         # These are used to indicate, for example, that the mapping for a
11229         # range has a non-default type.
11230         #
11231         # This loops through the file, calling its next_line() method, and
11232         # then taking the map and adding it to the property's table.
11233         # Complications arise because any number of properties can be in the
11234         # file, in any order, interspersed in any way.  The first time a
11235         # property is seen, it gets information about that property and
11236         # caches it for quick retrieval later.  It also normalizes the maps
11237         # so that only one of many synonyms is stored.  The Unicode input
11238         # files do use some multiple synonyms.
11239
11240         my $file = shift;
11241         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11242
11243         my %property_info;               # To keep track of what properties
11244                                          # have already had entries in the
11245                                          # current file, and info about each,
11246                                          # so don't have to recompute.
11247         my $property_name;               # property currently being worked on
11248         my $property_type;               # and its type
11249         my $previous_property_name = ""; # name from last time through loop
11250         my $property_object;             # pointer to the current property's
11251                                          # object
11252         my $property_addr;               # the address of that object
11253         my $default_map;                 # the string that code points missing
11254                                          # from the file map to
11255         my $default_table;               # For non-string properties, a
11256                                          # reference to the match table that
11257                                          # will contain the list of code
11258                                          # points that map to $default_map.
11259
11260         # Get the next real non-comment line
11261         LINE:
11262         while ($file->next_line) {
11263
11264             # Default replacement type; means that if parts of the range have
11265             # already been stored in our tables, the new map overrides them if
11266             # they differ more than cosmetically
11267             my $replace = $IF_NOT_EQUIVALENT;
11268             my $map_type;            # Default type for the map of this range
11269
11270             #local $to_trace = 1 if main::DEBUG;
11271             trace $_ if main::DEBUG && $to_trace;
11272
11273             # Split the line into components
11274             my ($range, $property_name, $map, @remainder)
11275                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11276
11277             # If more or less on the line than we are expecting, warn and skip
11278             # the line
11279             if (@remainder) {
11280                 $file->carp_bad_line('Extra fields');
11281                 next LINE;
11282             }
11283             elsif ( ! defined $property_name) {
11284                 $file->carp_bad_line('Missing property');
11285                 next LINE;
11286             }
11287
11288             # Examine the range.
11289             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
11290             {
11291                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
11292                 next LINE;
11293             }
11294             my $low = hex $1;
11295             my $high = (defined $2) ? hex $2 : $low;
11296
11297             # If changing to a new property, get the things constant per
11298             # property
11299             if ($previous_property_name ne $property_name) {
11300
11301                 $property_object = property_ref($property_name);
11302                 if (! defined $property_object) {
11303                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
11304                     next LINE;
11305                 }
11306                 { no overloading; $property_addr = pack 'J', $property_object; }
11307
11308                 # Defer changing names until have a line that is acceptable
11309                 # (the 'next' statement above means is unacceptable)
11310                 $previous_property_name = $property_name;
11311
11312                 # If not the first time for this property, retrieve info about
11313                 # it from the cache
11314                 if (defined ($property_info{$property_addr}{$TYPE})) {
11315                     $property_type = $property_info{$property_addr}{$TYPE};
11316                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
11317                     $map_type
11318                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
11319                     $default_table
11320                             = $property_info{$property_addr}{$DEFAULT_TABLE};
11321                 }
11322                 else {
11323
11324                     # Here, is the first time for this property.  Set up the
11325                     # cache.
11326                     $property_type = $property_info{$property_addr}{$TYPE}
11327                                    = $property_object->type;
11328                     $map_type
11329                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
11330                         = $property_object->pseudo_map_type;
11331
11332                     # The Unicode files are set up so that if the map is not
11333                     # defined, it is a binary property
11334                     if (! defined $map && $property_type != $BINARY) {
11335                         if ($property_type != $UNKNOWN
11336                             && $property_type != $NON_STRING)
11337                         {
11338                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
11339                         }
11340                         else {
11341                             $property_object->set_type($BINARY);
11342                             $property_type
11343                                 = $property_info{$property_addr}{$TYPE}
11344                                 = $BINARY;
11345                         }
11346                     }
11347
11348                     # Get any @missings default for this property.  This
11349                     # should precede the first entry for the property in the
11350                     # input file, and is located in a comment that has been
11351                     # stored by the Input_file class until we access it here.
11352                     # It's possible that there is more than one such line
11353                     # waiting for us; collect them all, and parse
11354                     my @missings_list = $file->get_missings
11355                                             if $file->has_missings_defaults;
11356                     foreach my $default_ref (@missings_list) {
11357                         my $default = $default_ref->[0];
11358                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
11359
11360                         # For string properties, the default is just what the
11361                         # file says, but non-string properties should already
11362                         # have set up a table for the default property value;
11363                         # use the table for these, so can resolve synonyms
11364                         # later to a single standard one.
11365                         if ($property_type == $STRING
11366                             || $property_type == $UNKNOWN)
11367                         {
11368                             $property_info{$addr}{$MISSINGS} = $default;
11369                         }
11370                         else {
11371                             $property_info{$addr}{$MISSINGS}
11372                                         = $property_object->table($default);
11373                         }
11374                     }
11375
11376                     # Finished storing all the @missings defaults in the input
11377                     # file so far.  Get the one for the current property.
11378                     my $missings = $property_info{$property_addr}{$MISSINGS};
11379
11380                     # But we likely have separately stored what the default
11381                     # should be.  (This is to accommodate versions of the
11382                     # standard where the @missings lines are absent or
11383                     # incomplete.)  Hopefully the two will match.  But check
11384                     # it out.
11385                     $default_map = $property_object->default_map;
11386
11387                     # If the map is a ref, it means that the default won't be
11388                     # processed until later, so undef it, so next few lines
11389                     # will redefine it to something that nothing will match
11390                     undef $default_map if ref $default_map;
11391
11392                     # Create a $default_map if don't have one; maybe a dummy
11393                     # that won't match anything.
11394                     if (! defined $default_map) {
11395
11396                         # Use any @missings line in the file.
11397                         if (defined $missings) {
11398                             if (ref $missings) {
11399                                 $default_map = $missings->full_name;
11400                                 $default_table = $missings;
11401                             }
11402                             else {
11403                                 $default_map = $missings;
11404                             }
11405
11406                             # And store it with the property for outside use.
11407                             $property_object->set_default_map($default_map);
11408                         }
11409                         else {
11410
11411                             # Neither an @missings nor a default map.  Create
11412                             # a dummy one, so won't have to test definedness
11413                             # in the main loop.
11414                             $default_map = '_Perl This will never be in a file
11415                                             from Unicode';
11416                         }
11417                     }
11418
11419                     # Here, we have $default_map defined, possibly in terms of
11420                     # $missings, but maybe not, and possibly is a dummy one.
11421                     if (defined $missings) {
11422
11423                         # Make sure there is no conflict between the two.
11424                         # $missings has priority.
11425                         if (ref $missings) {
11426                             $default_table
11427                                         = $property_object->table($default_map);
11428                             if (! defined $default_table
11429                                 || $default_table != $missings)
11430                             {
11431                                 if (! defined $default_table) {
11432                                     $default_table = $UNDEF;
11433                                 }
11434                                 $file->carp_bad_line(<<END
11435 The \@missings line for $property_name in $file says that missings default to
11436 $missings, but we expect it to be $default_table.  $missings used.
11437 END
11438                                 );
11439                                 $default_table = $missings;
11440                                 $default_map = $missings->full_name;
11441                             }
11442                             $property_info{$property_addr}{$DEFAULT_TABLE}
11443                                                         = $default_table;
11444                         }
11445                         elsif ($default_map ne $missings) {
11446                             $file->carp_bad_line(<<END
11447 The \@missings line for $property_name in $file says that missings default to
11448 $missings, but we expect it to be $default_map.  $missings used.
11449 END
11450                             );
11451                             $default_map = $missings;
11452                         }
11453                     }
11454
11455                     $property_info{$property_addr}{$DEFAULT_MAP}
11456                                                     = $default_map;
11457
11458                     # If haven't done so already, find the table corresponding
11459                     # to this map for non-string properties.
11460                     if (! defined $default_table
11461                         && $property_type != $STRING
11462                         && $property_type != $UNKNOWN)
11463                     {
11464                         $default_table = $property_info{$property_addr}
11465                                                         {$DEFAULT_TABLE}
11466                                     = $property_object->table($default_map);
11467                     }
11468                 } # End of is first time for this property
11469             } # End of switching properties.
11470
11471             # Ready to process the line.
11472             # The Unicode files are set up so that if the map is not defined,
11473             # it is a binary property with value 'Y'
11474             if (! defined $map) {
11475                 $map = 'Y';
11476             }
11477             else {
11478
11479                 # If the map begins with a special command to us (enclosed in
11480                 # delimiters), extract the command(s).
11481                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11482                     my $command = $1;
11483                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11484                         $replace = $1;
11485                     }
11486                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11487                         $map_type = $1;
11488                     }
11489                     else {
11490                         $file->carp_bad_line("Unknown command line: '$1'");
11491                         next LINE;
11492                     }
11493                 }
11494             }
11495
11496             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11497             {
11498
11499                 # Here, we have a map to a particular code point, and the
11500                 # default map is to a code point itself.  If the range
11501                 # includes the particular code point, change that portion of
11502                 # the range to the default.  This makes sure that in the final
11503                 # table only the non-defaults are listed.
11504                 my $decimal_map = hex $map;
11505                 if ($low <= $decimal_map && $decimal_map <= $high) {
11506
11507                     # If the range includes stuff before or after the map
11508                     # we're changing, split it and process the split-off parts
11509                     # later.
11510                     if ($low < $decimal_map) {
11511                         $file->insert_adjusted_lines(
11512                                             sprintf("%04X..%04X; %s; %s",
11513                                                     $low,
11514                                                     $decimal_map - 1,
11515                                                     $property_name,
11516                                                     $map));
11517                     }
11518                     if ($high > $decimal_map) {
11519                         $file->insert_adjusted_lines(
11520                                             sprintf("%04X..%04X; %s; %s",
11521                                                     $decimal_map + 1,
11522                                                     $high,
11523                                                     $property_name,
11524                                                     $map));
11525                     }
11526                     $low = $high = $decimal_map;
11527                     $map = $CODE_POINT;
11528                 }
11529             }
11530
11531             # If we can tell that this is a synonym for the default map, use
11532             # the default one instead.
11533             if ($property_type != $STRING
11534                 && $property_type != $UNKNOWN)
11535             {
11536                 my $table = $property_object->table($map);
11537                 if (defined $table && $table == $default_table) {
11538                     $map = $default_map;
11539                 }
11540             }
11541
11542             # And figure out the map type if not known.
11543             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11544                 if ($map eq "") {   # Nulls are always $NULL map type
11545                     $map_type = $NULL;
11546                 } # Otherwise, non-strings, and those that don't allow
11547                   # $MULTI_CP, and those that aren't multiple code points are
11548                   # 0
11549                 elsif
11550                    (($property_type != $STRING && $property_type != $UNKNOWN)
11551                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11552                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11553                 {
11554                     $map_type = 0;
11555                 }
11556                 else {
11557                     $map_type = $MULTI_CP;
11558                 }
11559             }
11560
11561             $property_object->add_map($low, $high,
11562                                         $map,
11563                                         Type => $map_type,
11564                                         Replace => $replace);
11565         } # End of loop through file's lines
11566
11567         return;
11568     }
11569 }
11570
11571 { # Closure for UnicodeData.txt handling
11572
11573     # This file was the first one in the UCD; its design leads to some
11574     # awkwardness in processing.  Here is a sample line:
11575     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11576     # The fields in order are:
11577     my $i = 0;            # The code point is in field 0, and is shifted off.
11578     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11579     my $CATEGORY = $i++;  # category (e.g. "Lu")
11580     my $CCC = $i++;       # Canonical combining class (e.g. "230")
11581     my $BIDI = $i++;      # directional class (e.g. "L")
11582     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11583     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11584     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11585                                          # Dual-use in this program; see below
11586     my $NUMERIC = $i++;   # numeric value
11587     my $MIRRORED = $i++;  # ? mirrored
11588     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11589     my $COMMENT = $i++;   # iso comment
11590     my $UPPER = $i++;     # simple uppercase mapping
11591     my $LOWER = $i++;     # simple lowercase mapping
11592     my $TITLE = $i++;     # simple titlecase mapping
11593     my $input_field_count = $i;
11594
11595     # This routine in addition outputs these extra fields:
11596
11597     my $DECOMP_TYPE = $i++; # Decomposition type
11598
11599     # These fields are modifications of ones above, and are usually
11600     # suppressed; they must come last, as for speed, the loop upper bound is
11601     # normally set to ignore them
11602     my $NAME = $i++;        # This is the strict name field, not the one that
11603                             # charnames uses.
11604     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11605                             # by Unicode::Normalize
11606     my $last_field = $i - 1;
11607
11608     # All these are read into an array for each line, with the indices defined
11609     # above.  The empty fields in the example line above indicate that the
11610     # value is defaulted.  The handler called for each line of the input
11611     # changes these to their defaults.
11612
11613     # Here are the official names of the properties, in a parallel array:
11614     my @field_names;
11615     $field_names[$BIDI] = 'Bidi_Class';
11616     $field_names[$CATEGORY] = 'General_Category';
11617     $field_names[$CCC] = 'Canonical_Combining_Class';
11618     $field_names[$CHARNAME] = 'Perl_Charnames';
11619     $field_names[$COMMENT] = 'ISO_Comment';
11620     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11621     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11622     $field_names[$LOWER] = 'Lowercase_Mapping';
11623     $field_names[$MIRRORED] = 'Bidi_Mirrored';
11624     $field_names[$NAME] = 'Name';
11625     $field_names[$NUMERIC] = 'Numeric_Value';
11626     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11627     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11628     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11629     $field_names[$TITLE] = 'Titlecase_Mapping';
11630     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11631     $field_names[$UPPER] = 'Uppercase_Mapping';
11632
11633     # Some of these need a little more explanation:
11634     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11635     #   property, but is used in calculating the Numeric_Type.  Perl however,
11636     #   creates a file from this field, so a Perl property is created from it.
11637     # Similarly, the Other_Digit field is used only for calculating the
11638     #   Numeric_Type, and so it can be safely re-used as the place to store
11639     #   the value for Numeric_Type; hence it is referred to as
11640     #   $NUMERIC_TYPE_OTHER_DIGIT.
11641     # The input field named $PERL_DECOMPOSITION is a combination of both the
11642     #   decomposition mapping and its type.  Perl creates a file containing
11643     #   exactly this field, so it is used for that.  The two properties are
11644     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11645     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11646     #   output it), as Perl doesn't use it directly.
11647     # The input field named here $CHARNAME is used to construct the
11648     #   Perl_Charnames property, which is a combination of the Name property
11649     #   (which the input field contains), and the Unicode_1_Name property, and
11650     #   others from other files.  Since, the strict Name property is not used
11651     #   by Perl, this field is used for the table that Perl does use.  The
11652     #   strict Name property table is usually suppressed (unless the lists are
11653     #   changed to output it), so it is accumulated in a separate field,
11654     #   $NAME, which to save time is discarded unless the table is actually to
11655     #   be output
11656
11657     # This file is processed like most in this program.  Control is passed to
11658     # process_generic_property_file() which calls filter_UnicodeData_line()
11659     # for each input line.  This filter converts the input into line(s) that
11660     # process_generic_property_file() understands.  There is also a setup
11661     # routine called before any of the file is processed, and a handler for
11662     # EOF processing, all in this closure.
11663
11664     # A huge speed-up occurred at the cost of some added complexity when these
11665     # routines were altered to buffer the outputs into ranges.  Almost all the
11666     # lines of the input file apply to just one code point, and for most
11667     # properties, the map for the next code point up is the same as the
11668     # current one.  So instead of creating a line for each property for each
11669     # input line, filter_UnicodeData_line() remembers what the previous map
11670     # of a property was, and doesn't generate a line to pass on until it has
11671     # to, as when the map changes; and that passed-on line encompasses the
11672     # whole contiguous range of code points that have the same map for that
11673     # property.  This means a slight amount of extra setup, and having to
11674     # flush these buffers on EOF, testing if the maps have changed, plus
11675     # remembering state information in the closure.  But it means a lot less
11676     # real time in not having to change the data base for each property on
11677     # each line.
11678
11679     # Another complication is that there are already a few ranges designated
11680     # in the input.  There are two lines for each, with the same maps except
11681     # the code point and name on each line.  This was actually the hardest
11682     # thing to design around.  The code points in those ranges may actually
11683     # have real maps not given by these two lines.  These maps will either
11684     # be algorithmically determinable, or be in the extracted files furnished
11685     # with the UCD.  In the event of conflicts between these extracted files,
11686     # and this one, Unicode says that this one prevails.  But it shouldn't
11687     # prevail for conflicts that occur in these ranges.  The data from the
11688     # extracted files prevails in those cases.  So, this program is structured
11689     # so that those files are processed first, storing maps.  Then the other
11690     # files are processed, generally overwriting what the extracted files
11691     # stored.  But just the range lines in this input file are processed
11692     # without overwriting.  This is accomplished by adding a special string to
11693     # the lines output to tell process_generic_property_file() to turn off the
11694     # overwriting for just this one line.
11695     # A similar mechanism is used to tell it that the map is of a non-default
11696     # type.
11697
11698     sub setup_UnicodeData { # Called before any lines of the input are read
11699         my $file = shift;
11700         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11701
11702         # Create a new property specially located that is a combination of
11703         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11704         # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11705         # first, and starting in v6.1, is the same as the 'Name_Alias
11706         # property.)  A comment for the new property will later be constructed
11707         # based on the actual properties present and used
11708         $perl_charname = Property->new('Perl_Charnames',
11709                        Default_Map => "",
11710                        Directory => File::Spec->curdir(),
11711                        File => 'Name',
11712                        Fate => $INTERNAL_ONLY,
11713                        Perl_Extension => 1,
11714                        Range_Size_1 => \&output_perl_charnames_line,
11715                        Type => $STRING,
11716                        );
11717         $perl_charname->set_proxy_for('Name');
11718
11719         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11720                                         Directory => File::Spec->curdir(),
11721                                         File => 'Decomposition',
11722                                         Format => $DECOMP_STRING_FORMAT,
11723                                         Fate => $INTERNAL_ONLY,
11724                                         Perl_Extension => 1,
11725                                         Default_Map => $CODE_POINT,
11726
11727                                         # normalize.pm can't cope with these
11728                                         Output_Range_Counts => 0,
11729
11730                                         # This is a specially formatted table
11731                                         # explicitly for normalize.pm, which
11732                                         # is expecting a particular format,
11733                                         # which means that mappings containing
11734                                         # multiple code points are in the main
11735                                         # body of the table
11736                                         Map_Type => $COMPUTE_NO_MULTI_CP,
11737                                         Type => $STRING,
11738                                         To_Output_Map => $INTERNAL_MAP,
11739                                         );
11740         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11741         $Perl_decomp->add_comment(join_lines(<<END
11742 This mapping is a combination of the Unicode 'Decomposition_Type' and
11743 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11744 identical to the official Unicode 'Decomposition_Mapping' property except for
11745 two things:
11746  1) It omits the algorithmically determinable Hangul syllable decompositions,
11747 which normalize.pm handles algorithmically.
11748  2) It contains the decomposition type as well.  Non-canonical decompositions
11749 begin with a word in angle brackets, like <super>, which denotes the
11750 compatible decomposition type.  If the map does not begin with the <angle
11751 brackets>, the decomposition is canonical.
11752 END
11753         ));
11754
11755         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11756                                         Default_Map => "",
11757                                         Perl_Extension => 1,
11758                                         Directory => $map_directory,
11759                                         Type => $STRING,
11760                                         To_Output_Map => $OUTPUT_ADJUSTED,
11761                                         );
11762         $Decimal_Digit->add_comment(join_lines(<<END
11763 This file gives the mapping of all code points which represent a single
11764 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11765 points, and the mapping of each non-initial element of each range is actually
11766 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11767 These code points are those that have Numeric_Type=Decimal; not special
11768 things, like subscripts nor Roman numerals.
11769 END
11770         ));
11771
11772         # These properties are not used for generating anything else, and are
11773         # usually not output.  By making them last in the list, we can just
11774         # change the high end of the loop downwards to avoid the work of
11775         # generating a table(s) that is/are just going to get thrown away.
11776         if (! property_ref('Decomposition_Mapping')->to_output_map
11777             && ! property_ref('Name')->to_output_map)
11778         {
11779             $last_field = min($NAME, $DECOMP_MAP) - 1;
11780         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11781             $last_field = $DECOMP_MAP;
11782         } elsif (property_ref('Name')->to_output_map) {
11783             $last_field = $NAME;
11784         }
11785         return;
11786     }
11787
11788     my $first_time = 1;                 # ? Is this the first line of the file
11789     my $in_range = 0;                   # ? Are we in one of the file's ranges
11790     my $previous_cp;                    # hex code point of previous line
11791     my $decimal_previous_cp = -1;       # And its decimal equivalent
11792     my @start;                          # For each field, the current starting
11793                                         # code point in hex for the range
11794                                         # being accumulated.
11795     my @fields;                         # The input fields;
11796     my @previous_fields;                # And those from the previous call
11797
11798     sub filter_UnicodeData_line {
11799         # Handle a single input line from UnicodeData.txt; see comments above
11800         # Conceptually this takes a single line from the file containing N
11801         # properties, and converts it into N lines with one property per line,
11802         # which is what the final handler expects.  But there are
11803         # complications due to the quirkiness of the input file, and to save
11804         # time, it accumulates ranges where the property values don't change
11805         # and only emits lines when necessary.  This is about an order of
11806         # magnitude fewer lines emitted.
11807
11808         my $file = shift;
11809         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11810
11811         # $_ contains the input line.
11812         # -1 in split means retain trailing null fields
11813         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11814
11815         #local $to_trace = 1 if main::DEBUG;
11816         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11817         if (@fields > $input_field_count) {
11818             $file->carp_bad_line('Extra fields');
11819             $_ = "";
11820             return;
11821         }
11822
11823         my $decimal_cp = hex $cp;
11824
11825         # We have to output all the buffered ranges when the next code point
11826         # is not exactly one after the previous one, which means there is a
11827         # gap in the ranges.
11828         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11829
11830         # The decomposition mapping field requires special handling.  It looks
11831         # like either:
11832         #
11833         # <compat> 0032 0020
11834         # 0041 0300
11835         #
11836         # The decomposition type is enclosed in <brackets>; if missing, it
11837         # means the type is canonical.  There are two decomposition mapping
11838         # tables: the one for use by Perl's normalize.pm has a special format
11839         # which is this field intact; the other, for general use is of
11840         # standard format.  In either case we have to find the decomposition
11841         # type.  Empty fields have None as their type, and map to the code
11842         # point itself
11843         if ($fields[$PERL_DECOMPOSITION] eq "") {
11844             $fields[$DECOMP_TYPE] = 'None';
11845             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11846         }
11847         else {
11848             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11849                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
11850             if (! defined $fields[$DECOMP_TYPE]) {
11851                 $fields[$DECOMP_TYPE] = 'Canonical';
11852                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11853             }
11854             else {
11855                 $fields[$DECOMP_MAP] = $map;
11856             }
11857         }
11858
11859         # The 3 numeric fields also require special handling.  The 2 digit
11860         # fields must be either empty or match the number field.  This means
11861         # that if it is empty, they must be as well, and the numeric type is
11862         # None, and the numeric value is 'Nan'.
11863         # The decimal digit field must be empty or match the other digit
11864         # field.  If the decimal digit field is non-empty, the code point is
11865         # a decimal digit, and the other two fields will have the same value.
11866         # If it is empty, but the other digit field is non-empty, the code
11867         # point is an 'other digit', and the number field will have the same
11868         # value as the other digit field.  If the other digit field is empty,
11869         # but the number field is non-empty, the code point is a generic
11870         # numeric type.
11871         if ($fields[$NUMERIC] eq "") {
11872             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11873                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11874             ) {
11875                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11876             }
11877             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11878             $fields[$NUMERIC] = 'NaN';
11879         }
11880         else {
11881             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11882             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11883                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11884                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11885                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11886             }
11887             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11888                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11889                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11890             }
11891             else {
11892                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11893
11894                 # Rationals require extra effort.
11895                 if ($fields[$NUMERIC] =~ qr{/}) {
11896                     reduce_fraction(\$fields[$NUMERIC]);
11897                     register_fraction($fields[$NUMERIC])
11898                 }
11899             }
11900         }
11901
11902         # For the properties that have empty fields in the file, and which
11903         # mean something different from empty, change them to that default.
11904         # Certain fields just haven't been empty so far in any Unicode
11905         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11906         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11907         # the defaults; which are very unlikely to ever change.
11908         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11909         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11910
11911         # UAX44 says that if title is empty, it is the same as whatever upper
11912         # is,
11913         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11914
11915         # There are a few pairs of lines like:
11916         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11917         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11918         # that define ranges.  These should be processed after the fields are
11919         # adjusted above, as they may override some of them; but mostly what
11920         # is left is to possibly adjust the $CHARNAME field.  The names of all the
11921         # paired lines start with a '<', but this is also true of '<control>,
11922         # which isn't one of these special ones.
11923         if ($fields[$CHARNAME] eq '<control>') {
11924
11925             # Some code points in this file have the pseudo-name
11926             # '<control>', but the official name for such ones is the null
11927             # string.
11928             $fields[$NAME] = $fields[$CHARNAME] = "";
11929
11930             # We had better not be in between range lines.
11931             if ($in_range) {
11932                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11933                 $in_range = 0;
11934             }
11935         }
11936         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11937
11938             # Here is a non-range line.  We had better not be in between range
11939             # lines.
11940             if ($in_range) {
11941                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11942                 $in_range = 0;
11943             }
11944             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11945
11946                 # These are code points whose names end in their code points,
11947                 # which means the names are algorithmically derivable from the
11948                 # code points.  To shorten the output Name file, the algorithm
11949                 # for deriving these is placed in the file instead of each
11950                 # code point, so they have map type $CP_IN_NAME
11951                 $fields[$CHARNAME] = $CMD_DELIM
11952                                  . $MAP_TYPE_CMD
11953                                  . '='
11954                                  . $CP_IN_NAME
11955                                  . $CMD_DELIM
11956                                  . $fields[$CHARNAME];
11957             }
11958             $fields[$NAME] = $fields[$CHARNAME];
11959         }
11960         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11961             $fields[$CHARNAME] = $fields[$NAME] = $1;
11962
11963             # Here we are at the beginning of a range pair.
11964             if ($in_range) {
11965                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11966             }
11967             $in_range = 1;
11968
11969             # Because the properties in the range do not overwrite any already
11970             # in the db, we must flush the buffers of what's already there, so
11971             # they get handled in the normal scheme.
11972             $force_output = 1;
11973
11974         }
11975         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11976             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11977             $_ = "";
11978             return;
11979         }
11980         else { # Here, we are at the last line of a range pair.
11981
11982             if (! $in_range) {
11983                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11984                 $_ = "";
11985                 return;
11986             }
11987             $in_range = 0;
11988
11989             $fields[$NAME] = $fields[$CHARNAME];
11990
11991             # Check that the input is valid: that the closing of the range is
11992             # the same as the beginning.
11993             foreach my $i (0 .. $last_field) {
11994                 next if $fields[$i] eq $previous_fields[$i];
11995                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11996             }
11997
11998             # The processing differs depending on the type of range,
11999             # determined by its $CHARNAME
12000             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
12001
12002                 # Check that the data looks right.
12003                 if ($decimal_previous_cp != $SBase) {
12004                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
12005                 }
12006                 if ($decimal_cp != $SBase + $SCount - 1) {
12007                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
12008                 }
12009
12010                 # The Hangul syllable range has a somewhat complicated name
12011                 # generation algorithm.  Each code point in it has a canonical
12012                 # decomposition also computable by an algorithm.  The
12013                 # perl decomposition map table built from these is used only
12014                 # by normalize.pm, which has the algorithm built in it, so the
12015                 # decomposition maps are not needed, and are large, so are
12016                 # omitted from it.  If the full decomposition map table is to
12017                 # be output, the decompositions are generated for it, in the
12018                 # EOF handling code for this input file.
12019
12020                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
12021
12022                 # This range is stored in our internal structure with its
12023                 # own map type, different from all others.
12024                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12025                                         = $CMD_DELIM
12026                                           . $MAP_TYPE_CMD
12027                                           . '='
12028                                           . $HANGUL_SYLLABLE
12029                                           . $CMD_DELIM
12030                                           . $fields[$CHARNAME];
12031             }
12032             elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
12033
12034                 # All the CJK ranges like this have the name given as a
12035                 # special case in the next code line.  And for the others, we
12036                 # hope that Unicode continues to use the correct name in
12037                 # future releases, so we don't have to make further special
12038                 # cases.
12039                 my $name = ($fields[$CHARNAME] =~ /^CJK/)
12040                            ? 'CJK UNIFIED IDEOGRAPH'
12041                            : uc $fields[$CHARNAME];
12042
12043                 # The name for these contains the code point itself, and all
12044                 # are defined to have the same base name, regardless of what
12045                 # is in the file.  They are stored in our internal structure
12046                 # with a map type of $CP_IN_NAME
12047                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12048                                         = $CMD_DELIM
12049                                            . $MAP_TYPE_CMD
12050                                            . '='
12051                                            . $CP_IN_NAME
12052                                            . $CMD_DELIM
12053                                            . $name;
12054
12055             }
12056             elsif ($fields[$CATEGORY] eq 'Co'
12057                      || $fields[$CATEGORY] eq 'Cs')
12058             {
12059                 # The names of all the code points in these ranges are set to
12060                 # null, as there are no names for the private use and
12061                 # surrogate code points.
12062
12063                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
12064             }
12065             else {
12066                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
12067             }
12068
12069             # The first line of the range caused everything else to be output,
12070             # and then its values were stored as the beginning values for the
12071             # next set of ranges, which this one ends.  Now, for each value,
12072             # add a command to tell the handler that these values should not
12073             # replace any existing ones in our database.
12074             foreach my $i (0 .. $last_field) {
12075                 $previous_fields[$i] = $CMD_DELIM
12076                                         . $REPLACE_CMD
12077                                         . '='
12078                                         . $NO
12079                                         . $CMD_DELIM
12080                                         . $previous_fields[$i];
12081             }
12082
12083             # And change things so it looks like the entire range has been
12084             # gone through with this being the final part of it.  Adding the
12085             # command above to each field will cause this range to be flushed
12086             # during the next iteration, as it guaranteed that the stored
12087             # field won't match whatever value the next one has.
12088             $previous_cp = $cp;
12089             $decimal_previous_cp = $decimal_cp;
12090
12091             # We are now set up for the next iteration; so skip the remaining
12092             # code in this subroutine that does the same thing, but doesn't
12093             # know about these ranges.
12094             $_ = "";
12095
12096             return;
12097         }
12098
12099         # On the very first line, we fake it so the code below thinks there is
12100         # nothing to output, and initialize so that when it does get output it
12101         # uses the first line's values for the lowest part of the range.
12102         # (One could avoid this by using peek(), but then one would need to
12103         # know the adjustments done above and do the same ones in the setup
12104         # routine; not worth it)
12105         if ($first_time) {
12106             $first_time = 0;
12107             @previous_fields = @fields;
12108             @start = ($cp) x scalar @fields;
12109             $decimal_previous_cp = $decimal_cp - 1;
12110         }
12111
12112         # For each field, output the stored up ranges that this code point
12113         # doesn't fit in.  Earlier we figured out if all ranges should be
12114         # terminated because of changing the replace or map type styles, or if
12115         # there is a gap between this new code point and the previous one, and
12116         # that is stored in $force_output.  But even if those aren't true, we
12117         # need to output the range if this new code point's value for the
12118         # given property doesn't match the stored range's.
12119         #local $to_trace = 1 if main::DEBUG;
12120         foreach my $i (0 .. $last_field) {
12121             my $field = $fields[$i];
12122             if ($force_output || $field ne $previous_fields[$i]) {
12123
12124                 # Flush the buffer of stored values.
12125                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12126
12127                 # Start a new range with this code point and its value
12128                 $start[$i] = $cp;
12129                 $previous_fields[$i] = $field;
12130             }
12131         }
12132
12133         # Set the values for the next time.
12134         $previous_cp = $cp;
12135         $decimal_previous_cp = $decimal_cp;
12136
12137         # The input line has generated whatever adjusted lines are needed, and
12138         # should not be looked at further.
12139         $_ = "";
12140         return;
12141     }
12142
12143     sub EOF_UnicodeData {
12144         # Called upon EOF to flush the buffers, and create the Hangul
12145         # decomposition mappings if needed.
12146
12147         my $file = shift;
12148         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12149
12150         # Flush the buffers.
12151         foreach my $i (0 .. $last_field) {
12152             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12153         }
12154
12155         if (-e 'Jamo.txt') {
12156
12157             # The algorithm is published by Unicode, based on values in
12158             # Jamo.txt, (which should have been processed before this
12159             # subroutine), and the results left in %Jamo
12160             unless (%Jamo) {
12161                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
12162                 return;
12163             }
12164
12165             # If the full decomposition map table is being output, insert
12166             # into it the Hangul syllable mappings.  This is to avoid having
12167             # to publish a subroutine in it to compute them.  (which would
12168             # essentially be this code.)  This uses the algorithm published by
12169             # Unicode.  (No hangul syllables in version 1)
12170             if ($v_version ge v2.0.0
12171                 && property_ref('Decomposition_Mapping')->to_output_map) {
12172                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
12173                     use integer;
12174                     my $SIndex = $S - $SBase;
12175                     my $L = $LBase + $SIndex / $NCount;
12176                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
12177                     my $T = $TBase + $SIndex % $TCount;
12178
12179                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
12180                     my $decomposition = sprintf("%04X %04X", $L, $V);
12181                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
12182                     $file->insert_adjusted_lines(
12183                                 sprintf("%04X; Decomposition_Mapping; %s",
12184                                         $S,
12185                                         $decomposition));
12186                 }
12187             }
12188         }
12189
12190         return;
12191     }
12192
12193     sub filter_v1_ucd {
12194         # Fix UCD lines in version 1.  This is probably overkill, but this
12195         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
12196         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
12197         #       removed.  This program retains them
12198         # 2)    didn't include ranges, which it should have, and which are now
12199         #       added in @corrected_lines below.  It was hand populated by
12200         #       taking the data from Version 2, verified by analyzing
12201         #       DAge.txt.
12202         # 3)    There is a syntax error in the entry for U+09F8 which could
12203         #       cause problems for Unicode::UCD, and so is changed.  It's
12204         #       numeric value was simply a minus sign, without any number.
12205         #       (Eventually Unicode changed the code point to non-numeric.)
12206         # 4)    The decomposition types often don't match later versions
12207         #       exactly, and the whole syntax of that field is different; so
12208         #       the syntax is changed as well as the types to their later
12209         #       terminology.  Otherwise normalize.pm would be very unhappy
12210         # 5)    Many ccc classes are different.  These are left intact.
12211         # 6)    U+FF10..U+FF19 are missing their numeric values in all three
12212         #       fields.  These are unchanged because it doesn't really cause
12213         #       problems for Perl.
12214         # 7)    A number of code points, such as controls, don't have their
12215         #       Unicode Version 1 Names in this file.  These are added.
12216         # 8)    A number of Symbols were marked as Lm.  This changes those in
12217         #       the Latin1 range, so that regexes work.
12218         # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
12219         #       referred to by their lc equivalents.  Not fixed.
12220
12221         my @corrected_lines = split /\n/, <<'END';
12222 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
12223 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12224 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
12225 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
12226 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
12227 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12228 END
12229
12230         my $file = shift;
12231         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12232
12233         #local $to_trace = 1 if main::DEBUG;
12234         trace $_ if main::DEBUG && $to_trace;
12235
12236         # -1 => retain trailing null fields
12237         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12238
12239         # At the first place that is wrong in the input, insert all the
12240         # corrections, replacing the wrong line.
12241         if ($code_point eq '4E00') {
12242             my @copy = @corrected_lines;
12243             $_ = shift @copy;
12244             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12245
12246             $file->insert_lines(@copy);
12247         }
12248         elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
12249
12250             # There are no Lm characters in Latin1; these should be 'Sk', but
12251             # there isn't that in V1.
12252             $fields[$CATEGORY] = 'So';
12253         }
12254
12255         if ($fields[$NUMERIC] eq '-') {
12256             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
12257         }
12258
12259         if  ($fields[$PERL_DECOMPOSITION] ne "") {
12260
12261             # Several entries have this change to superscript 2 or 3 in the
12262             # middle.  Convert these to the modern version, which is to use
12263             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
12264             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
12265             # 'HHHH HHHH 00B3 HHHH'.
12266             # It turns out that all of these that don't have another
12267             # decomposition defined at the beginning of the line have the
12268             # <square> decomposition in later releases.
12269             if ($code_point ne '00B2' && $code_point ne '00B3') {
12270                 if  ($fields[$PERL_DECOMPOSITION]
12271                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
12272                 {
12273                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
12274                         $fields[$PERL_DECOMPOSITION] = '<square> '
12275                         . $fields[$PERL_DECOMPOSITION];
12276                     }
12277                 }
12278             }
12279
12280             # If is like '<+circled> 0052 <-circled>', convert to
12281             # '<circled> 0052'
12282             $fields[$PERL_DECOMPOSITION] =~
12283                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
12284
12285             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
12286             $fields[$PERL_DECOMPOSITION] =~
12287                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
12288             or $fields[$PERL_DECOMPOSITION] =~
12289                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
12290             or $fields[$PERL_DECOMPOSITION] =~
12291                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
12292             or $fields[$PERL_DECOMPOSITION] =~
12293                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
12294
12295             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
12296             $fields[$PERL_DECOMPOSITION] =~
12297                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12298
12299             # Change names to modern form.
12300             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12301             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12302             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12303             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12304
12305             # One entry has weird braces
12306             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12307
12308             # One entry at U+2116 has an extra <sup>
12309             $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12310         }
12311
12312         $_ = join ';', $code_point, @fields;
12313         trace $_ if main::DEBUG && $to_trace;
12314         return;
12315     }
12316
12317     sub filter_bad_Nd_ucd {
12318         # Early versions specified a value in the decimal digit field even
12319         # though the code point wasn't a decimal digit.  Clear the field in
12320         # that situation, so that the main code doesn't think it is a decimal
12321         # digit.
12322
12323         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12324         if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12325             $fields[$PERL_DECIMAL_DIGIT] = "";
12326             $_ = join ';', $code_point, @fields;
12327         }
12328         return;
12329     }
12330
12331     my @U1_control_names = split /\n/, <<'END';
12332 NULL
12333 START OF HEADING
12334 START OF TEXT
12335 END OF TEXT
12336 END OF TRANSMISSION
12337 ENQUIRY
12338 ACKNOWLEDGE
12339 BELL
12340 BACKSPACE
12341 HORIZONTAL TABULATION
12342 LINE FEED
12343 VERTICAL TABULATION
12344 FORM FEED
12345 CARRIAGE RETURN
12346 SHIFT OUT
12347 SHIFT IN
12348 DATA LINK ESCAPE
12349 DEVICE CONTROL ONE
12350 DEVICE CONTROL TWO
12351 DEVICE CONTROL THREE
12352 DEVICE CONTROL FOUR
12353 NEGATIVE ACKNOWLEDGE
12354 SYNCHRONOUS IDLE
12355 END OF TRANSMISSION BLOCK
12356 CANCEL
12357 END OF MEDIUM
12358 SUBSTITUTE
12359 ESCAPE
12360 FILE SEPARATOR
12361 GROUP SEPARATOR
12362 RECORD SEPARATOR
12363 UNIT SEPARATOR
12364 DELETE
12365 BREAK PERMITTED HERE
12366 NO BREAK HERE
12367 INDEX
12368 NEXT LINE
12369 START OF SELECTED AREA
12370 END OF SELECTED AREA
12371 CHARACTER TABULATION SET
12372 CHARACTER TABULATION WITH JUSTIFICATION
12373 LINE TABULATION SET
12374 PARTIAL LINE DOWN
12375 PARTIAL LINE UP
12376 REVERSE LINE FEED
12377 SINGLE SHIFT TWO
12378 SINGLE SHIFT THREE
12379 DEVICE CONTROL STRING
12380 PRIVATE USE ONE
12381 PRIVATE USE TWO
12382 SET TRANSMIT STATE
12383 CANCEL CHARACTER
12384 MESSAGE WAITING
12385 START OF GUARDED AREA
12386 END OF GUARDED AREA
12387 START OF STRING
12388 SINGLE CHARACTER INTRODUCER
12389 CONTROL SEQUENCE INTRODUCER
12390 STRING TERMINATOR
12391 OPERATING SYSTEM COMMAND
12392 PRIVACY MESSAGE
12393 APPLICATION PROGRAM COMMAND
12394 END
12395
12396     sub filter_early_U1_names {
12397         # Very early versions did not have the Unicode_1_name field specified.
12398         # They differed in which ones were present; make sure a U1 name
12399         # exists, so that Unicode::UCD::charinfo will work
12400
12401         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12402
12403
12404         # @U1_control names above are entirely positional, so we pull them out
12405         # in the exact order required, with gaps for the ones that don't have
12406         # names.
12407         if ($code_point =~ /^00[01]/
12408             || $code_point eq '007F'
12409             || $code_point =~ /^008[2-9A-F]/
12410             || $code_point =~ /^009[0-8A-F]/)
12411         {
12412             my $u1_name = shift @U1_control_names;
12413             $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12414             $_ = join ';', $code_point, @fields;
12415         }
12416         return;
12417     }
12418
12419     sub filter_v2_1_5_ucd {
12420         # A dozen entries in this 2.1.5 file had the mirrored and numeric
12421         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12422         # column appears to be N, swap it back.
12423
12424         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12425         if ($fields[$NUMERIC] eq 'N') {
12426             $fields[$NUMERIC] = $fields[$MIRRORED];
12427             $fields[$MIRRORED] = 'N';
12428             $_ = join ';', $code_point, @fields;
12429         }
12430         return;
12431     }
12432
12433     sub filter_v6_ucd {
12434
12435         # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12436         # it wasn't accepted, to allow for some deprecation cycles.  This
12437         # function is not called after 5.16
12438
12439         return if $_ !~ /^(?:0007|1F514|070F);/;
12440
12441         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12442         if ($code_point eq '0007') {
12443             $fields[$CHARNAME] = "";
12444         }
12445         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12446                             # http://www.unicode.org/versions/corrigendum8.html
12447             $fields[$BIDI] = "AL";
12448         }
12449         elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12450             $fields[$CHARNAME] = "";
12451         }
12452
12453         $_ = join ';', $code_point, @fields;
12454
12455         return;
12456     }
12457 } # End closure for UnicodeData
12458
12459 sub process_GCB_test {
12460
12461     my $file = shift;
12462     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12463
12464     while ($file->next_line) {
12465         push @backslash_X_tests, $_;
12466     }
12467
12468     return;
12469 }
12470
12471 sub process_LB_test {
12472
12473     my $file = shift;
12474     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12475
12476     while ($file->next_line) {
12477         push @LB_tests, $_;
12478     }
12479
12480     return;
12481 }
12482
12483 sub process_SB_test {
12484
12485     my $file = shift;
12486     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12487
12488     while ($file->next_line) {
12489         push @SB_tests, $_;
12490     }
12491
12492     return;
12493 }
12494
12495 sub process_WB_test {
12496
12497     my $file = shift;
12498     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12499
12500     while ($file->next_line) {
12501         push @WB_tests, $_;
12502     }
12503
12504     return;
12505 }
12506
12507 sub process_NamedSequences {
12508     # NamedSequences.txt entries are just added to an array.  Because these
12509     # don't look like the other tables, they have their own handler.
12510     # An example:
12511     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12512     #
12513     # This just adds the sequence to an array for later handling
12514
12515     my $file = shift;
12516     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12517
12518     while ($file->next_line) {
12519         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12520         if (@remainder) {
12521             $file->carp_bad_line(
12522                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12523             next;
12524         }
12525
12526         # Code points below 0x0100 need to be converted to native
12527         $sequence =~ s{ \b 00 ( [0-9A-F]{2} ) \b }
12528                       { sprintf("%04X", utf8::unicode_to_native(hex $1)) }gxe
12529                                                         if NON_ASCII_PLATFORM;
12530
12531         # Note single \t in keeping with special output format of
12532         # Perl_charnames.  But it turns out that the code points don't have to
12533         # be 5 digits long, like the rest, based on the internal workings of
12534         # charnames.pm.  This could be easily changed for consistency.
12535         push @named_sequences, "$sequence\t$name";
12536     }
12537     return;
12538 }
12539
12540 { # Closure
12541
12542     my $first_range;
12543
12544     sub  filter_early_ea_lb {
12545         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12546         # third field be the name of the code point, which can be ignored in
12547         # most cases.  But it can be meaningful if it marks a range:
12548         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12549         # 3400;W;<CJK Ideograph Extension A, First>
12550         #
12551         # We need to see the First in the example above to know it's a range.
12552         # They did not use the later range syntaxes.  This routine changes it
12553         # to use the modern syntax.
12554         # $1 is the Input_file object.
12555
12556         my @fields = split /\s*;\s*/;
12557         if ($fields[2] =~ /^<.*, First>/) {
12558             $first_range = $fields[0];
12559             $_ = "";
12560         }
12561         elsif ($fields[2] =~ /^<.*, Last>/) {
12562             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12563         }
12564         else {
12565             undef $first_range;
12566             $_ = "$fields[0]; $fields[1]";
12567         }
12568
12569         return;
12570     }
12571 }
12572
12573 sub filter_substitute_lb {
12574     # Used on Unicodes that predate the LB property, where there is a
12575     # substitute file.  This just does the regular ea_lb handling for such
12576     # files, and then substitutes the long property value name for the short
12577     # one that comes with the file.  (The other break files have the long
12578     # names in them, so this is the odd one out.)  The reason for doing this
12579     # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12580     # also fixes the typo 'Inseperable' that leads to problems.
12581
12582     filter_early_ea_lb;
12583     return unless $_;
12584
12585     my @fields = split /\s*;\s*/;
12586     $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12587     $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12588     $_ = join '; ', @fields;
12589 }
12590
12591 sub filter_old_style_arabic_shaping {
12592     # Early versions used a different term for the later one.
12593
12594     my @fields = split /\s*;\s*/;
12595     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12596     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12597     $_ = join ';', @fields;
12598     return;
12599 }
12600
12601 { # Closure
12602     my $lc; # Table for lowercase mapping
12603     my $tc;
12604     my $uc;
12605     my %special_casing_code_points;
12606
12607     sub setup_special_casing {
12608         # SpecialCasing.txt contains the non-simple case change mappings.  The
12609         # simple ones are in UnicodeData.txt, which should already have been
12610         # read in to the full property data structures, so as to initialize
12611         # these with the simple ones.  Then the SpecialCasing.txt entries
12612         # add or overwrite the ones which have different full mappings.
12613
12614         # This routine sees if the simple mappings are to be output, and if
12615         # so, copies what has already been put into the full mapping tables,
12616         # while they still contain only the simple mappings.
12617
12618         # The reason it is done this way is that the simple mappings are
12619         # probably not going to be output, so it saves work to initialize the
12620         # full tables with the simple mappings, and then overwrite those
12621         # relatively few entries in them that have different full mappings,
12622         # and thus skip the simple mapping tables altogether.
12623
12624         my $file= shift;
12625         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12626
12627         $lc = property_ref('lc');
12628         $tc = property_ref('tc');
12629         $uc = property_ref('uc');
12630
12631         # For each of the case change mappings...
12632         foreach my $full_casing_table ($lc, $tc, $uc) {
12633             my $full_casing_name = $full_casing_table->name;
12634             my $full_casing_full_name = $full_casing_table->full_name;
12635             unless (defined $full_casing_table
12636                     && ! $full_casing_table->is_empty)
12637             {
12638                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12639             }
12640
12641             # Create a table in the old-style format and with the original
12642             # file name for backwards compatibility with applications that
12643             # read it directly.  The new tables contain both the simple and
12644             # full maps, and the old are missing simple maps when there is a
12645             # conflicting full one.  Probably it would have been ok to add
12646             # those to the legacy version, as was already done in 5.14 to the
12647             # case folding one, but this was not done, out of an abundance of
12648             # caution.  The tables are set up here before we deal with the
12649             # full maps so that as we handle those, we can override the simple
12650             # maps for them in the legacy table, and merely add them in the
12651             # new-style one.
12652             my $legacy = Property->new("Legacy_" . $full_casing_full_name,
12653                                 File => $full_casing_full_name
12654                                                           =~ s/case_Mapping//r,
12655                                 Format => $HEX_FORMAT,
12656                                 Default_Map => $CODE_POINT,
12657                                 Initialize => $full_casing_table,
12658                                 Replacement_Property => $full_casing_full_name,
12659             );
12660
12661             $full_casing_table->add_comment(join_lines( <<END
12662 This file includes both the simple and full case changing maps.  The simple
12663 ones are in the main body of the table below, and the full ones adding to or
12664 overriding them are in the hash.
12665 END
12666             ));
12667
12668             # The simple version's name in each mapping merely has an 's' in
12669             # front of the full one's
12670             my $simple_name = 's' . $full_casing_name;
12671             my $simple = property_ref($simple_name);
12672             $simple->initialize($full_casing_table) if $simple->to_output_map();
12673         }
12674
12675         return;
12676     }
12677
12678     sub filter_2_1_8_special_casing_line {
12679
12680         # This version had duplicate entries in this file.  Delete all but the
12681         # first one
12682         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12683                                               # fields
12684         if (exists $special_casing_code_points{$fields[0]}) {
12685             $_ = "";
12686             return;
12687         }
12688
12689         $special_casing_code_points{$fields[0]} = 1;
12690         filter_special_casing_line(@_);
12691     }
12692
12693     sub filter_special_casing_line {
12694         # Change the format of $_ from SpecialCasing.txt into something that
12695         # the generic handler understands.  Each input line contains three
12696         # case mappings.  This will generate three lines to pass to the
12697         # generic handler for each of those.
12698
12699         # The input syntax (after stripping comments and trailing white space
12700         # is like one of the following (with the final two being entries that
12701         # we ignore):
12702         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12703         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12704         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12705         # Note the trailing semi-colon, unlike many of the input files.  That
12706         # means that there will be an extra null field generated by the split
12707
12708         my $file = shift;
12709         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12710
12711         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12712                                               # fields
12713
12714         # field #4 is when this mapping is conditional.  If any of these get
12715         # implemented, it would be by hard-coding in the casing functions in
12716         # the Perl core, not through tables.  But if there is a new condition
12717         # we don't know about, output a warning.  We know about all the
12718         # conditions through 6.0
12719         if ($fields[4] ne "") {
12720             my @conditions = split ' ', $fields[4];
12721             if ($conditions[0] ne 'tr'  # We know that these languages have
12722                                         # conditions, and some are multiple
12723                 && $conditions[0] ne 'az'
12724                 && $conditions[0] ne 'lt'
12725
12726                 # And, we know about a single condition Final_Sigma, but
12727                 # nothing else.
12728                 && ($v_version gt v5.2.0
12729                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12730             {
12731                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12732             }
12733             elsif ($conditions[0] ne 'Final_Sigma') {
12734
12735                     # Don't print out a message for Final_Sigma, because we
12736                     # have hard-coded handling for it.  (But the standard
12737                     # could change what the rule should be, but it wouldn't
12738                     # show up here anyway.
12739
12740                     print "# SKIPPING Special Casing: $_\n"
12741                                                     if $verbosity >= $VERBOSE;
12742             }
12743             $_ = "";
12744             return;
12745         }
12746         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12747             $file->carp_bad_line('Extra fields');
12748             $_ = "";
12749             return;
12750         }
12751
12752         my $decimal_code_point = hex $fields[0];
12753
12754         # Loop to handle each of the three mappings in the input line, in
12755         # order, with $i indicating the current field number.
12756         my $i = 0;
12757         for my $object ($lc, $tc, $uc) {
12758             $i++;   # First time through, $i = 0 ... 3rd time = 3
12759
12760             my $value = $object->value_of($decimal_code_point);
12761             $value = ($value eq $CODE_POINT)
12762                       ? $decimal_code_point
12763                       : hex $value;
12764
12765             # If this isn't a multi-character mapping, it should already have
12766             # been read in.
12767             if ($fields[$i] !~ / /) {
12768                 if ($value != hex $fields[$i]) {
12769                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
12770                                   . $object->name
12771                                   . "(0x$fields[0]) is $value"
12772                                   . " and SpecialCasing.txt thinks it is "
12773                                   . hex($fields[$i])
12774                                   . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12775                 }
12776             }
12777             else {
12778
12779                 # The mapping goes into both the legacy table, in which it
12780                 # replaces the simple one...
12781                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
12782                                              . $object->full_name
12783                                              . "; $fields[$i]");
12784
12785                 # ... and the regular table, in which it is additional,
12786                 # beyond the simple mapping.
12787                 $file->insert_adjusted_lines("$fields[0]; "
12788                                              . $object->name
12789                                             . "; "
12790                                             . $CMD_DELIM
12791                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12792                                             . $CMD_DELIM
12793                                             . $fields[$i]);
12794             }
12795         }
12796
12797         # Everything has been handled by the insert_adjusted_lines()
12798         $_ = "";
12799
12800         return;
12801     }
12802 }
12803
12804 sub filter_old_style_case_folding {
12805     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12806     # and later style.  Different letters were used in the earlier.
12807
12808     my $file = shift;
12809     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12810
12811     my @fields = split /\s*;\s*/;
12812
12813     if ($fields[1] eq 'L') {
12814         $fields[1] = 'C';             # L => C always
12815     }
12816     elsif ($fields[1] eq 'E') {
12817         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12818             $fields[1] = 'F'
12819         }
12820         else {
12821             $fields[1] = 'C'
12822         }
12823     }
12824     else {
12825         $file->carp_bad_line("Expecting L or E in second field");
12826         $_ = "";
12827         return;
12828     }
12829     $_ = join("; ", @fields) . ';';
12830     return;
12831 }
12832
12833 { # Closure for case folding
12834
12835     # Create the map for simple only if are going to output it, for otherwise
12836     # it takes no part in anything we do.
12837     my $to_output_simple;
12838
12839     sub setup_case_folding($) {
12840         # Read in the case foldings in CaseFolding.txt.  This handles both
12841         # simple and full case folding.
12842
12843         $to_output_simple
12844                         = property_ref('Simple_Case_Folding')->to_output_map;
12845
12846         if (! $to_output_simple) {
12847             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12848         }
12849
12850         # If we ever wanted to show that these tables were combined, a new
12851         # property method could be created, like set_combined_props()
12852         property_ref('Case_Folding')->add_comment(join_lines( <<END
12853 This file includes both the simple and full case folding maps.  The simple
12854 ones are in the main body of the table below, and the full ones adding to or
12855 overriding them are in the hash.
12856 END
12857         ));
12858         return;
12859     }
12860
12861     sub filter_case_folding_line {
12862         # Called for each line in CaseFolding.txt
12863         # Input lines look like:
12864         # 0041; C; 0061; # LATIN CAPITAL LETTER A
12865         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12866         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12867         #
12868         # 'C' means that folding is the same for both simple and full
12869         # 'F' that it is only for full folding
12870         # 'S' that it is only for simple folding
12871         # 'T' is locale-dependent, and ignored
12872         # 'I' is a type of 'F' used in some early releases.
12873         # Note the trailing semi-colon, unlike many of the input files.  That
12874         # means that there will be an extra null field generated by the split
12875         # below, which we ignore and hence is not an error.
12876
12877         my $file = shift;
12878         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12879
12880         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12881         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12882             $file->carp_bad_line('Extra fields');
12883             $_ = "";
12884             return;
12885         }
12886
12887         if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12888             $_ = "";
12889             return;
12890         }
12891
12892         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12893         # I are all full foldings; S is single-char.  For S, there is always
12894         # an F entry, so we must allow multiple values for the same code
12895         # point.  Fortunately this table doesn't need further manipulation
12896         # which would preclude using multiple-values.  The S is now included
12897         # so that _swash_inversion_hash() is able to construct closures
12898         # without having to worry about F mappings.
12899         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12900             $_ = "$range; Case_Folding; "
12901                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12902         }
12903         else {
12904             $_ = "";
12905             $file->carp_bad_line('Expecting C F I S or T in second field');
12906         }
12907
12908         # C and S are simple foldings, but simple case folding is not needed
12909         # unless we explicitly want its map table output.
12910         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12911             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12912         }
12913
12914         return;
12915     }
12916
12917 } # End case fold closure
12918
12919 sub filter_jamo_line {
12920     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12921     # from this file that is used in generating the Name property for Jamo
12922     # code points.  But, it also is used to convert early versions' syntax
12923     # into the modern form.  Here are two examples:
12924     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12925     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12926     #
12927     # The input is $_, the output is $_ filtered.
12928
12929     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12930
12931     # Let the caller handle unexpected input.  In earlier versions, there was
12932     # a third field which is supposed to be a comment, but did not have a '#'
12933     # before it.
12934     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12935
12936     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12937                                 # beginning.
12938
12939     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12940     $fields[1] = 'R' if $fields[0] eq '1105';
12941
12942     # Add to structure so can generate Names from it.
12943     my $cp = hex $fields[0];
12944     my $short_name = $fields[1];
12945     $Jamo{$cp} = $short_name;
12946     if ($cp <= $LBase + $LCount) {
12947         $Jamo_L{$short_name} = $cp - $LBase;
12948     }
12949     elsif ($cp <= $VBase + $VCount) {
12950         $Jamo_V{$short_name} = $cp - $VBase;
12951     }
12952     elsif ($cp <= $TBase + $TCount) {
12953         $Jamo_T{$short_name} = $cp - $TBase;
12954     }
12955     else {
12956         Carp::my_carp_bug("Unexpected Jamo code point in $_");
12957     }
12958
12959
12960     # Reassemble using just the first two fields to look like a typical
12961     # property file line
12962     $_ = "$fields[0]; $fields[1]";
12963
12964     return;
12965 }
12966
12967 sub register_fraction($) {
12968     # This registers the input rational number so that it can be passed on to
12969     # Unicode::UCD, both in rational and floating forms.
12970
12971     my $rational = shift;
12972
12973     my $floating = eval $rational;
12974
12975     my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
12976
12977     # See if the denominator is a power of 2.
12978     $rational =~ m!.*/(.*)!;
12979     my $denominator = $1;
12980     if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
12981
12982         # Here the denominator is a power of 2.  This means it has an exact
12983         # representation in binary, so rounding could go either way.  It turns
12984         # out that Windows doesn't necessarily round towards even, so output
12985         # an extra entry.  This happens when the final digit we output is even
12986         # and the next digits would be 50* to the precision of the machine.
12987         my $extra_digit_float = sprintf "%e", $floating;
12988         my $q = $E_FLOAT_PRECISION - 1;
12989         if ($extra_digit_float =~ / ( .* \. \d{$q} )
12990                                     ( [02468] ) 5 0* ( e .*)
12991                                   /ix)
12992         {
12993             push @floats, $1 . ($2 + 1) . $3;
12994         }
12995     }
12996
12997     foreach my $float (@floats) {
12998         # Strip off any leading zeros beyond 2 digits to make it C99
12999         # compliant.  (Windows has 3 digit exponents, contrary to C99)
13000         $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
13001
13002         if (   defined $nv_floating_to_rational{$float}
13003             && $nv_floating_to_rational{$float} ne $rational)
13004         {
13005             die Carp::my_carp_bug("Both '$rational' and"
13006                             . " '$nv_floating_to_rational{$float}' evaluate to"
13007                             . " the same floating point number."
13008                             . "  \$E_FLOAT_PRECISION must be increased");
13009         }
13010         $nv_floating_to_rational{$float} = $rational;
13011     }
13012     return;
13013 }
13014
13015 sub gcd($$) {   # Greatest-common-divisor; from
13016                 # http://en.wikipedia.org/wiki/Euclidean_algorithm
13017     my ($a, $b) = @_;
13018
13019     use integer;
13020
13021     while ($b != 0) {
13022        my $temp = $b;
13023        $b = $a % $b;
13024        $a = $temp;
13025     }
13026     return $a;
13027 }
13028
13029 sub reduce_fraction($) {
13030     my $fraction_ref = shift;
13031
13032     # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
13033     # hence this is needed.  The argument is a reference to the
13034     # string denoting the fraction, which must be of the form:
13035     if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
13036         Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
13037         return;
13038     }
13039
13040     my $sign = $1;
13041     my $numerator = $2;
13042     my $denominator = $3;
13043
13044     use integer;
13045
13046     # Find greatest common divisor
13047     my $gcd = gcd($numerator, $denominator);
13048
13049     # And reduce using the gcd.
13050     if ($gcd != 1) {
13051         $numerator    /= $gcd;
13052         $denominator  /= $gcd;
13053         $$fraction_ref = "$sign$numerator/$denominator";
13054     }
13055
13056     return;
13057 }
13058
13059 sub filter_numeric_value_line {
13060     # DNumValues contains lines of a different syntax than the typical
13061     # property file:
13062     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
13063     #
13064     # This routine transforms $_ containing the anomalous syntax to the
13065     # typical, by filtering out the extra columns, and convert early version
13066     # decimal numbers to strings that look like rational numbers.
13067
13068     my $file = shift;
13069     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13070
13071     # Starting in 5.1, there is a rational field.  Just use that, omitting the
13072     # extra columns.  Otherwise convert the decimal number in the second field
13073     # to a rational, and omit extraneous columns.
13074     my @fields = split /\s*;\s*/, $_, -1;
13075     my $rational;
13076
13077     if ($v_version ge v5.1.0) {
13078         if (@fields != 4) {
13079             $file->carp_bad_line('Not 4 semi-colon separated fields');
13080             $_ = "";
13081             return;
13082         }
13083         reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
13084         $rational = $fields[3];
13085
13086         $_ = join '; ', @fields[ 0, 3 ];
13087     }
13088     else {
13089
13090         # Here, is an older Unicode file, which has decimal numbers instead of
13091         # rationals in it.  Use the fraction to calculate the denominator and
13092         # convert to rational.
13093
13094         if (@fields != 2 && @fields != 3) {
13095             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
13096             $_ = "";
13097             return;
13098         }
13099
13100         my $codepoints = $fields[0];
13101         my $decimal = $fields[1];
13102         if ($decimal =~ s/\.0+$//) {
13103
13104             # Anything ending with a decimal followed by nothing but 0's is an
13105             # integer
13106             $_ = "$codepoints; $decimal";
13107             $rational = $decimal;
13108         }
13109         else {
13110
13111             my $denominator;
13112             if ($decimal =~ /\.50*$/) {
13113                 $denominator = 2;
13114             }
13115
13116             # Here have the hardcoded repeating decimals in the fraction, and
13117             # the denominator they imply.  There were only a few denominators
13118             # in the older Unicode versions of this file which this code
13119             # handles, so it is easy to convert them.
13120
13121             # The 4 is because of a round-off error in the Unicode 3.2 files
13122             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
13123                 $denominator = 3;
13124             }
13125             elsif ($decimal =~ /\.[27]50*$/) {
13126                 $denominator = 4;
13127             }
13128             elsif ($decimal =~ /\.[2468]0*$/) {
13129                 $denominator = 5;
13130             }
13131             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
13132                 $denominator = 6;
13133             }
13134             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
13135                 $denominator = 8;
13136             }
13137             if ($denominator) {
13138                 my $sign = ($decimal < 0) ? "-" : "";
13139                 my $numerator = int((abs($decimal) * $denominator) + .5);
13140                 $rational = "$sign$numerator/$denominator";
13141                 $_ = "$codepoints; $rational";
13142             }
13143             else {
13144                 $file->carp_bad_line("Can't cope with number '$decimal'.");
13145                 $_ = "";
13146                 return;
13147             }
13148         }
13149     }
13150
13151     register_fraction($rational) if $rational =~ qr{/};
13152     return;
13153 }
13154
13155 { # Closure
13156     my %unihan_properties;
13157
13158     sub construct_unihan {
13159
13160         my $file_object = shift;
13161
13162         return unless file_exists($file_object->file);
13163
13164         if ($v_version lt v4.0.0) {
13165             push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
13166             push @cjk_property_values, split "\n", <<'END';
13167 # @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
13168 END
13169         }
13170
13171         if ($v_version ge v3.0.0) {
13172             push @cjk_properties, split "\n", <<'END';
13173 cjkIRG_GSource; kIRG_GSource
13174 cjkIRG_JSource; kIRG_JSource
13175 cjkIRG_KSource; kIRG_KSource
13176 cjkIRG_TSource; kIRG_TSource
13177 cjkIRG_VSource; kIRG_VSource
13178 END
13179         push @cjk_property_values, split "\n", <<'END';
13180 # @missing: 0000..10FFFF; cjkIRG_GSource; <none>
13181 # @missing: 0000..10FFFF; cjkIRG_JSource; <none>
13182 # @missing: 0000..10FFFF; cjkIRG_KSource; <none>
13183 # @missing: 0000..10FFFF; cjkIRG_TSource; <none>
13184 # @missing: 0000..10FFFF; cjkIRG_VSource; <none>
13185 END
13186         }
13187         if ($v_version ge v3.1.0) {
13188             push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
13189             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
13190         }
13191         if ($v_version ge v3.1.1) {
13192             push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
13193             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
13194         }
13195         if ($v_version ge v3.2.0) {
13196             push @cjk_properties, split "\n", <<'END';
13197 cjkAccountingNumeric; kAccountingNumeric
13198 cjkCompatibilityVariant; kCompatibilityVariant
13199 cjkOtherNumeric; kOtherNumeric
13200 cjkPrimaryNumeric; kPrimaryNumeric
13201 END
13202             push @cjk_property_values, split "\n", <<'END';
13203 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
13204 # @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
13205 # @missing: 0000..10FFFF; cjkOtherNumeric; NaN
13206 # @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
13207 END
13208         }
13209         if ($v_version gt v4.0.0) {
13210             push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
13211             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
13212         }
13213
13214         if ($v_version ge v4.1.0) {
13215             push @cjk_properties, 'cjkIICore ; kIICore';
13216             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
13217         }
13218     }
13219
13220     sub setup_unihan {
13221         # Do any special setup for Unihan properties.
13222
13223         # This property gives the wrong computed type, so override.
13224         my $usource = property_ref('kIRG_USource');
13225         $usource->set_type($STRING) if defined $usource;
13226
13227         # This property is to be considered binary (it says so in
13228         # http://www.unicode.org/reports/tr38/)
13229         my $iicore = property_ref('kIICore');
13230         if (defined $iicore) {
13231             $iicore->set_type($FORCED_BINARY);
13232             $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
13233
13234             # Unicode doesn't include the maps for this property, so don't
13235             # warn that they are missing.
13236             $iicore->set_pre_declared_maps(0);
13237             $iicore->add_comment(join_lines( <<END
13238 This property contains string values, but any non-empty ones are considered to
13239 be 'core', so Perl creates tables for both: 1) its string values, plus 2)
13240 tables so that \\p{kIICore} matches any code point which has a non-empty
13241 value for this property.
13242 END
13243             ));
13244         }
13245
13246         return;
13247     }
13248
13249     sub filter_unihan_line {
13250         # Change unihan db lines to look like the others in the db.  Here is
13251         # an input sample:
13252         #   U+341C        kCangjie        IEKN
13253
13254         # Tabs are used instead of semi-colons to separate fields; therefore
13255         # they may have semi-colons embedded in them.  Change these to periods
13256         # so won't screw up the rest of the code.
13257         s/;/./g;
13258
13259         # Remove lines that don't look like ones we accept.
13260         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
13261             $_ = "";
13262             return;
13263         }
13264
13265         # Extract the property, and save a reference to its object.
13266         my $property = $1;
13267         if (! exists $unihan_properties{$property}) {
13268             $unihan_properties{$property} = property_ref($property);
13269         }
13270
13271         # Don't do anything unless the property is one we're handling, which
13272         # we determine by seeing if there is an object defined for it or not
13273         if (! defined $unihan_properties{$property}) {
13274             $_ = "";
13275             return;
13276         }
13277
13278         # Convert the tab separators to our standard semi-colons, and convert
13279         # the U+HHHH notation to the rest of the standard's HHHH
13280         s/\t/;/g;
13281         s/\b U \+ (?= $code_point_re )//xg;
13282
13283         #local $to_trace = 1 if main::DEBUG;
13284         trace $_ if main::DEBUG && $to_trace;
13285
13286         return;
13287     }
13288 }
13289
13290 sub filter_blocks_lines {
13291     # In the Blocks.txt file, the names of the blocks don't quite match the
13292     # names given in PropertyValueAliases.txt, so this changes them so they
13293     # do match:  Blanks and hyphens are changed into underscores.  Also makes
13294     # early release versions look like later ones
13295     #
13296     # $_ is transformed to the correct value.
13297
13298     my $file = shift;
13299         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13300
13301     if ($v_version lt v3.2.0) {
13302         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
13303             $_ = "";
13304             return;
13305         }
13306
13307         # Old versions used a different syntax to mark the range.
13308         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
13309     }
13310
13311     my @fields = split /\s*;\s*/, $_, -1;
13312     if (@fields != 2) {
13313         $file->carp_bad_line("Expecting exactly two fields");
13314         $_ = "";
13315         return;
13316     }
13317
13318     # Change hyphens and blanks in the block name field only
13319     $fields[1] =~ s/[ -]/_/g;
13320     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
13321
13322     $_ = join("; ", @fields);
13323     return;
13324 }
13325
13326 { # Closure
13327     my $current_property;
13328
13329     sub filter_old_style_proplist {
13330         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
13331         # was in a completely different syntax.  Ken Whistler of Unicode says
13332         # that it was something he used as an aid for his own purposes, but
13333         # was never an official part of the standard.  Many of the properties
13334         # in it were incorporated into the later PropList.txt, but some were
13335         # not.  This program uses this early file to generate property tables
13336         # that are otherwise not accessible in the early UCD's.  It does this
13337         # for the ones that eventually became official, and don't appear to be
13338         # too different in their contents from the later official version, and
13339         # throws away the rest.  It could be argued that the ones it generates
13340         # were probably not really official at that time, so should be
13341         # ignored.  You can easily modify things to skip all of them by
13342         # changing this function to just set $_ to "", and return; and to skip
13343         # certain of them by by simply removing their declarations from
13344         # get_old_property_aliases().
13345         #
13346         # Here is a list of all the ones that are thrown away:
13347         #   Alphabetic                   The definitions for this are very
13348         #                                defective, so better to not mislead
13349         #                                people into thinking it works.
13350         #                                Instead the Perl extension of the
13351         #                                same name is constructed from first
13352         #                                principles.
13353         #   Bidi=*                       duplicates UnicodeData.txt
13354         #   Combining                    never made into official property;
13355         #                                is \P{ccc=0}
13356         #   Composite                    never made into official property.
13357         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
13358         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
13359         #   Delimiter                    never made into official property;
13360         #                                removed in 3.0.1
13361         #   Format Control               never made into official property;
13362         #                                similar to gc=cf
13363         #   High Surrogate               duplicates Blocks.txt
13364         #   Ignorable Control            never made into official property;
13365         #                                similar to di=y
13366         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
13367         #   Left of Pair                 never made into official property;
13368         #   Line Separator               duplicates UnicodeData.txt: gc=zl
13369         #   Low Surrogate                duplicates Blocks.txt
13370         #   Non-break                    was actually listed as a property
13371         #                                in 3.2, but without any code
13372         #                                points.  Unicode denies that this
13373         #                                was ever an official property
13374         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
13375         #   Numeric                      duplicates UnicodeData.txt: gc=cc
13376         #   Paired Punctuation           never made into official property;
13377         #                                appears to be gc=ps + gc=pe
13378         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
13379         #   Private Use                  duplicates UnicodeData.txt: gc=co
13380         #   Private Use High Surrogate   duplicates Blocks.txt
13381         #   Punctuation                  duplicates UnicodeData.txt: gc=p
13382         #   Space                        different definition than eventual
13383         #                                one.
13384         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
13385         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
13386         #   Zero-width                   never made into official property;
13387         #                                subset of gc=cf
13388         # Most of the properties have the same names in this file as in later
13389         # versions, but a couple do not.
13390         #
13391         # This subroutine filters $_, converting it from the old style into
13392         # the new style.  Here's a sample of the old-style
13393         #
13394         #   *******************************************
13395         #
13396         #   Property dump for: 0x100000A0 (Join Control)
13397         #
13398         #   200C..200D  (2 chars)
13399         #
13400         # In the example, the property is "Join Control".  It is kept in this
13401         # closure between calls to the subroutine.  The numbers beginning with
13402         # 0x were internal to Ken's program that generated this file.
13403
13404         # If this line contains the property name, extract it.
13405         if (/^Property dump for: [^(]*\((.*)\)/) {
13406             $_ = $1;
13407
13408             # Convert white space to underscores.
13409             s/ /_/g;
13410
13411             # Convert the few properties that don't have the same name as
13412             # their modern counterparts
13413             s/Identifier_Part/ID_Continue/
13414             or s/Not_a_Character/NChar/;
13415
13416             # If the name matches an existing property, use it.
13417             if (defined property_ref($_)) {
13418                 trace "new property=", $_ if main::DEBUG && $to_trace;
13419                 $current_property = $_;
13420             }
13421             else {        # Otherwise discard it
13422                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
13423                 undef $current_property;
13424             }
13425             $_ = "";    # The property is saved for the next lines of the
13426                         # file, but this defining line is of no further use,
13427                         # so clear it so that the caller won't process it
13428                         # further.
13429         }
13430         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13431
13432             # Here, the input line isn't a header defining a property for the
13433             # following section, and either we aren't in such a section, or
13434             # the line doesn't look like one that defines the code points in
13435             # such a section.  Ignore this line.
13436             $_ = "";
13437         }
13438         else {
13439
13440             # Here, we have a line defining the code points for the current
13441             # stashed property.  Anything starting with the first blank is
13442             # extraneous.  Otherwise, it should look like a normal range to
13443             # the caller.  Append the property name so that it looks just like
13444             # a modern PropList entry.
13445
13446             $_ =~ s/\s.*//;
13447             $_ .= "; $current_property";
13448         }
13449         trace $_ if main::DEBUG && $to_trace;
13450         return;
13451     }
13452 } # End closure for old style proplist
13453
13454 sub filter_old_style_normalization_lines {
13455     # For early releases of Unicode, the lines were like:
13456     #        74..2A76    ; NFKD_NO
13457     # For later releases this became:
13458     #        74..2A76    ; NFKD_QC; N
13459     # Filter $_ to look like those in later releases.
13460     # Similarly for MAYBEs
13461
13462     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13463
13464     # Also, the property FC_NFKC was abbreviated to FNC
13465     s/FNC/FC_NFKC/;
13466     return;
13467 }
13468
13469 sub setup_script_extensions {
13470     # The Script_Extensions property starts out with a clone of the Script
13471     # property.
13472
13473     $scx = property_ref("Script_Extensions");
13474     return unless defined $scx;
13475
13476     $scx->_set_format($STRING_WHITE_SPACE_LIST);
13477     $scx->initialize($script);
13478     $scx->set_default_map($script->default_map);
13479     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13480     $scx->add_comment(join_lines( <<END
13481 The values for code points that appear in one script are just the same as for
13482 the 'Script' property.  Likewise the values for those that appear in many
13483 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13484 values of code points that appear in a few scripts are a space separated list
13485 of those scripts.
13486 END
13487     ));
13488
13489     # Initialize scx's tables and the aliases for them to be the same as sc's
13490     foreach my $table ($script->tables) {
13491         my $scx_table = $scx->add_match_table($table->name,
13492                                 Full_Name => $table->full_name);
13493         foreach my $alias ($table->aliases) {
13494             $scx_table->add_alias($alias->name);
13495         }
13496     }
13497 }
13498
13499 sub  filter_script_extensions_line {
13500     # The Scripts file comes with the full name for the scripts; the
13501     # ScriptExtensions, with the short name.  The final mapping file is a
13502     # combination of these, and without adjustment, would have inconsistent
13503     # entries.  This filters the latter file to convert to full names.
13504     # Entries look like this:
13505     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13506
13507     my @fields = split /\s*;\s*/;
13508
13509     # This script was erroneously omitted in this Unicode version.
13510     $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13511
13512     my @full_names;
13513     foreach my $short_name (split " ", $fields[1]) {
13514         push @full_names, $script->table($short_name)->full_name;
13515     }
13516     $fields[1] = join " ", @full_names;
13517     $_ = join "; ", @fields;
13518
13519     return;
13520 }
13521
13522 sub setup_emojidata {
13523     my $prop_ref = Property->new('XPG',
13524                                  Full_Name => 'Extended_Pictographic',
13525     );
13526     $prop_ref->set_fate($PLACEHOLDER,
13527                         "Not part of the Unicode Character Database");
13528 }
13529
13530 sub filter_emojidata_line {
13531     # We only are interested in this single property from this non-UCD data
13532     # file, and we turn it into a Perl property, so that it isn't accessible
13533     # to the users
13534
13535     $_ = "" unless /\bExtended_Pictographic\b/;
13536
13537     return;
13538 }
13539
13540 sub generate_hst {
13541
13542     # Populates the Hangul Syllable Type property from first principles
13543
13544     my $file= shift;
13545     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13546
13547     # These few ranges are hard-coded in.
13548     $file->insert_lines(split /\n/, <<'END'
13549 1100..1159    ; L
13550 115F          ; L
13551 1160..11A2    ; V
13552 11A8..11F9    ; T
13553 END
13554 );
13555
13556     # The Hangul syllables in version 1 are at different code points than
13557     # those that came along starting in version 2, and have different names;
13558     # they comprise about 60% of the code points of the later version.
13559     # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13560     # initial set is a subset of the later version, with different English
13561     # transliterations.  I did not see an easy mapping between them.  The
13562     # later set includes essentially all possibilities, even ones that aren't
13563     # in modern use (if they ever were), and over 96% of the new ones are type
13564     # LVT.  Mathematically, the early set must also contain a preponderance of
13565     # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13566     # expect that this will be right most of the time, which is better than
13567     # not being right at all.
13568     if ($v_version lt v2.0.0) {
13569         my $property = property_ref($file->property);
13570         $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13571                                     $FIRST_REMOVED_HANGUL_SYLLABLE,
13572                                     $FINAL_REMOVED_HANGUL_SYLLABLE));
13573         push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13574         return;
13575     }
13576
13577     # The algorithmically derived syllables are almost all LVT ones, so
13578     # initialize the whole range with that.
13579     $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13580                         $SBase, $SBase + $SCount -1);
13581
13582     # Those ones that aren't LVT are LV, and they occur at intervals of
13583     # $TCount code points, starting with the first code point, at $SBase.
13584     for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13585         $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13586     }
13587
13588     return;
13589 }
13590
13591 sub generate_GCB {
13592
13593     # Populates the Grapheme Cluster Break property from first principles
13594
13595     my $file= shift;
13596     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13597
13598     # All these definitions are from
13599     # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13600     # from http://www.unicode.org/reports/tr29/tr29-4.html
13601
13602     foreach my $range ($gc->ranges) {
13603
13604         # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13605         # and gc=Cf
13606         if ($range->value =~ / ^ M [en] $ /x) {
13607             $file->insert_lines(sprintf "%04X..%04X; Extend",
13608                                 $range->start,  $range->end);
13609         }
13610         elsif ($range->value =~ / ^ C [cf] $ /x) {
13611             $file->insert_lines(sprintf "%04X..%04X; Control",
13612                                 $range->start,  $range->end);
13613         }
13614     }
13615     $file->insert_lines("2028; Control"); # Line Separator
13616     $file->insert_lines("2029; Control"); # Paragraph Separator
13617
13618     $file->insert_lines("000D; CR");
13619     $file->insert_lines("000A; LF");
13620
13621     # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13622     foreach my $code_point ( qw{
13623                                 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13624                                 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13625                                 }
13626     ) {
13627         my $category = $gc->value_of(hex $code_point);
13628         next if ! defined $category || $category eq 'Cn'; # But not if
13629                                                           # unassigned in this
13630                                                           # release
13631         $file->insert_lines("$code_point; Extend");
13632     }
13633
13634     my $hst = property_ref('Hangul_Syllable_Type');
13635     if ($hst->count > 0) {
13636         foreach my $range ($hst->ranges) {
13637             $file->insert_lines(sprintf "%04X..%04X; %s",
13638                                     $range->start, $range->end, $range->value);
13639         }
13640     }
13641     else {
13642         generate_hst($file);
13643     }
13644
13645     main::process_generic_property_file($file);
13646 }
13647
13648
13649 sub fixup_early_perl_name_alias {
13650
13651     # Different versions of Unicode have varying support for the name synonyms
13652     # below.  Just include everything.  As of 6.1, all these are correct in
13653     # the Unicode-supplied file.
13654
13655     my $file= shift;
13656     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13657
13658
13659     # ALERT did not come along until 6.0, at which point it became preferred
13660     # over BELL.  By inserting it last in early releases, BELL is preferred
13661     # over it; and vice-vers in 6.0
13662     my $type_for_bell = ($v_version lt v6.0.0)
13663                ? 'correction'
13664                : 'alternate';
13665     $file->insert_lines(split /\n/, <<END
13666 0007;BELL; $type_for_bell
13667 000A;LINE FEED (LF);alternate
13668 000C;FORM FEED (FF);alternate
13669 000D;CARRIAGE RETURN (CR);alternate
13670 0085;NEXT LINE (NEL);alternate
13671 END
13672
13673     );
13674
13675     # One might think that the the 'Unicode_1_Name' field, could work for most
13676     # of the above names, but sadly that field varies depending on the
13677     # release.  Version 1.1.5 had no names for any of the controls; Version
13678     # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13679     # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13680     #   changed to parenthesized versions like "NEXT LINE" to
13681     #       "NEXT LINE (NEL)";
13682     #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13683     #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13684     #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13685     #
13686     # All these are present in the 6.1 NameAliases.txt
13687
13688     return;
13689 }
13690
13691 sub filter_later_version_name_alias_line {
13692
13693     # This file has an extra entry per line for the alias type.  This is
13694     # handled by creating a compound entry: "$alias: $type";  First, split
13695     # the line into components.
13696     my ($range, $alias, $type, @remainder)
13697         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13698
13699     # This file contains multiple entries for some components, so tell the
13700     # downstream code to allow this in our internal tables; the
13701     # $MULTIPLE_AFTER preserves the input ordering.
13702     $_ = join ";", $range, $CMD_DELIM
13703                            . $REPLACE_CMD
13704                            . '='
13705                            . $MULTIPLE_AFTER
13706                            . $CMD_DELIM
13707                            . "$alias: $type",
13708                    @remainder;
13709     return;
13710 }
13711
13712 sub filter_early_version_name_alias_line {
13713
13714     # Early versions did not have the trailing alias type field; implicitly it
13715     # was 'correction'.
13716     $_ .= "; correction";
13717
13718     filter_later_version_name_alias_line;
13719     return;
13720 }
13721
13722 sub filter_all_caps_script_names {
13723
13724     # Some early Unicode releases had the script names in all CAPS.  This
13725     # converts them to just the first letter of each word being capital.
13726
13727     my ($range, $script, @remainder)
13728         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13729     my @words = split /[_-]/, $script;
13730     for my $word (@words) {
13731         $word =
13732             ucfirst(lc($word)) if $word ne 'CJK';
13733     }
13734     $script = join "_", @words;
13735     $_ = join ";", $range, $script, @remainder;
13736 }
13737
13738 sub finish_Unicode() {
13739     # This routine should be called after all the Unicode files have been read
13740     # in.  It:
13741     # 1) Creates properties that are missing from the version of Unicode being
13742     #    compiled, and which, for whatever reason, are needed for the Perl
13743     #    core to function properly.  These are minimally populated as
13744     #    necessary.
13745     # 2) Adds the mappings for code points missing from the files which have
13746     #    defaults specified for them.
13747     # 3) At this this point all mappings are known, so it computes the type of
13748     #    each property whose type hasn't been determined yet.
13749     # 4) Calculates all the regular expression match tables based on the
13750     #    mappings.
13751     # 5) Calculates and adds the tables which are defined by Unicode, but
13752     #    which aren't derived by them, and certain derived tables that Perl
13753     #    uses.
13754
13755     # Folding information was introduced later into Unicode data.  To get
13756     # Perl's case ignore (/i) to work at all in releases that don't have
13757     # folding, use the best available alternative, which is lower casing.
13758     my $fold = property_ref('Case_Folding');
13759     if ($fold->is_empty) {
13760         $fold->initialize(property_ref('Lowercase_Mapping'));
13761         $fold->add_note(join_lines(<<END
13762 WARNING: This table uses lower case as a substitute for missing fold
13763 information
13764 END
13765         ));
13766     }
13767
13768     # Multiple-character mapping was introduced later into Unicode data, so it
13769     # is by default the simple version.  If to output the simple versions and
13770     # not present, just use the regular (which in these Unicode versions is
13771     # the simple as well).
13772     foreach my $map (qw {   Uppercase_Mapping
13773                             Lowercase_Mapping
13774                             Titlecase_Mapping
13775                             Case_Folding
13776                         } )
13777     {
13778         my $comment = <<END;
13779
13780 Note that although the Perl core uses this file, it has the standard values
13781 for code points from U+0000 to U+00FF compiled in, so changing this table will
13782 not change the core's behavior with respect to these code points.  Use
13783 Unicode::Casing to override this table.
13784 END
13785         if ($map eq 'Case_Folding') {
13786             $comment .= <<END;
13787 (/i regex matching is not overridable except by using a custom regex engine)
13788 END
13789         }
13790         property_ref($map)->add_comment(join_lines($comment));
13791         my $simple = property_ref("Simple_$map");
13792         next if ! $simple->is_empty;
13793         if ($simple->to_output_map) {
13794             $simple->initialize(property_ref($map));
13795         }
13796         else {
13797             property_ref($map)->set_proxy_for($simple->name);
13798         }
13799     }
13800
13801     # For each property, fill in any missing mappings, and calculate the re
13802     # match tables.  If a property has more than one missing mapping, the
13803     # default is a reference to a data structure, and may require data from
13804     # other properties to resolve.  The sort is used to cause these to be
13805     # processed last, after all the other properties have been calculated.
13806     # (Fortunately, the missing properties so far don't depend on each other.)
13807     foreach my $property
13808         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13809         property_ref('*'))
13810     {
13811         # $perl has been defined, but isn't one of the Unicode properties that
13812         # need to be finished up.
13813         next if $property == $perl;
13814
13815         # Nor do we need to do anything with properties that aren't going to
13816         # be output.
13817         next if $property->fate == $SUPPRESSED;
13818
13819         # Handle the properties that have more than one possible default
13820         if (ref $property->default_map) {
13821             my $default_map = $property->default_map;
13822
13823             # These properties have stored in the default_map:
13824             # One or more of:
13825             #   1)  A default map which applies to all code points in a
13826             #       certain class
13827             #   2)  an expression which will evaluate to the list of code
13828             #       points in that class
13829             # And
13830             #   3) the default map which applies to every other missing code
13831             #      point.
13832             #
13833             # Go through each list.
13834             while (my ($default, $eval) = $default_map->get_next_defaults) {
13835
13836                 # Get the class list, and intersect it with all the so-far
13837                 # unspecified code points yielding all the code points
13838                 # in the class that haven't been specified.
13839                 my $list = eval $eval;
13840                 if ($@) {
13841                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13842                     last;
13843                 }
13844
13845                 # Narrow down the list to just those code points we don't have
13846                 # maps for yet.
13847                 $list = $list & $property->inverse_list;
13848
13849                 # Add mappings to the property for each code point in the list
13850                 foreach my $range ($list->ranges) {
13851                     $property->add_map($range->start, $range->end, $default,
13852                     Replace => $CROAK);
13853                 }
13854             }
13855
13856             # All remaining code points have the other mapping.  Set that up
13857             # so the normal single-default mapping code will work on them
13858             $property->set_default_map($default_map->other_default);
13859
13860             # And fall through to do that
13861         }
13862
13863         # We should have enough data now to compute the type of the property.
13864         my $property_name = $property->name;
13865         $property->compute_type;
13866         my $property_type = $property->type;
13867
13868         next if ! $property->to_create_match_tables;
13869
13870         # Here want to create match tables for this property
13871
13872         # The Unicode db always (so far, and they claim into the future) have
13873         # the default for missing entries in binary properties be 'N' (unless
13874         # there is a '@missing' line that specifies otherwise)
13875         if (! defined $property->default_map) {
13876             if ($property_type == $BINARY) {
13877                 $property->set_default_map('N');
13878             }
13879             elsif ($property_type == $ENUM) {
13880                 Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13881                 $property->set_default_map('XXX This makes sure there is a default map');
13882             }
13883         }
13884
13885         # Add any remaining code points to the mapping, using the default for
13886         # missing code points.
13887         my $default_table;
13888         my $default_map = $property->default_map;
13889         if ($property_type == $FORCED_BINARY) {
13890
13891             # A forced binary property creates a 'Y' table that matches all
13892             # non-default values.  The actual string values are also written out
13893             # as a map table.  (The default value will almost certainly be the
13894             # empty string, so the pod glosses over the distinction, and just
13895             # talks about empty vs non-empty.)
13896             my $yes = $property->table("Y");
13897             foreach my $range ($property->ranges) {
13898                 next if $range->value eq $default_map;
13899                 $yes->add_range($range->start, $range->end);
13900             }
13901             $property->table("N")->set_complement($yes);
13902         }
13903         else {
13904             if (defined $default_map) {
13905
13906                 # Make sure there is a match table for the default
13907                 if (! defined ($default_table = $property->table($default_map)))
13908                 {
13909                     $default_table = $property->add_match_table($default_map);
13910                 }
13911
13912                 # And, if the property is binary, the default table will just
13913                 # be the complement of the other table.
13914                 if ($property_type == $BINARY) {
13915                     my $non_default_table;
13916
13917                     # Find the non-default table.
13918                     for my $table ($property->tables) {
13919                         if ($table == $default_table) {
13920                             if ($v_version le v5.0.0) {
13921                                 $table->add_alias($_) for qw(N No F False);
13922                             }
13923                             next;
13924                         } elsif ($v_version le v5.0.0) {
13925                             $table->add_alias($_) for qw(Y Yes T True);
13926                         }
13927                         $non_default_table = $table;
13928                     }
13929                     $default_table->set_complement($non_default_table);
13930                 }
13931                 else {
13932
13933                     # This fills in any missing values with the default.  It's
13934                     # not necessary to do this with binary properties, as the
13935                     # default is defined completely in terms of the Y table.
13936                     $property->add_map(0, $MAX_WORKING_CODEPOINT,
13937                                     $default_map, Replace => $NO);
13938                 }
13939             }
13940
13941             # Have all we need to populate the match tables.
13942             my $maps_should_be_defined = $property->pre_declared_maps;
13943             foreach my $range ($property->ranges) {
13944                 my $map = $range->value;
13945                 my $table = $property->table($map);
13946                 if (! defined $table) {
13947
13948                     # Integral and rational property values are not
13949                     # necessarily defined in PropValueAliases, but whether all
13950                     # the other ones should be depends on the property.
13951                     if ($maps_should_be_defined
13952                         && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13953                     {
13954                         Carp::my_carp("Table '$property_name=$map' should "
13955                                     . "have been defined.  Defining it now.")
13956                     }
13957                     $table = $property->add_match_table($map);
13958                 }
13959
13960                 next if $table->complement != 0; # Don't need to populate these
13961                 $table->add_range($range->start, $range->end);
13962             }
13963         }
13964
13965         # For Perl 5.6 compatibility, all properties matchable in regexes can
13966         # have an optional 'Is_' prefix.  This is now done in Unicode::UCD.
13967         # But warn if this creates a conflict with a (new) Unicode property
13968         # name, although it appears that Unicode has made a decision never to
13969         # begin a property name with 'Is_', so this shouldn't happen.
13970         foreach my $alias ($property->aliases) {
13971             my $Is_name = 'Is_' . $alias->name;
13972             if (defined (my $pre_existing = property_ref($Is_name))) {
13973                 Carp::my_carp(<<END
13974 There is already an alias named $Is_name (from " . $pre_existing . "), so
13975 creating one for $property won't work.  This is bad news.  If it is not too
13976 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13977 from the git blame log for this area of the code that suppressed individual
13978 aliases that conflict with the new Unicode names.  Proceeding anyway.
13979 END
13980                 );
13981             }
13982         } # End of loop through aliases for this property
13983     } # End of loop through all Unicode properties.
13984
13985     # Fill in the mappings that Unicode doesn't completely furnish.  First the
13986     # single letter major general categories.  If Unicode were to start
13987     # delivering the values, this would be redundant, but better that than to
13988     # try to figure out if should skip and not get it right.  Ths could happen
13989     # if a new major category were to be introduced, and the hard-coded test
13990     # wouldn't know about it.
13991     # This routine depends on the standard names for the general categories
13992     # being what it thinks they are, like 'Cn'.  The major categories are the
13993     # union of all the general category tables which have the same first
13994     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13995     foreach my $minor_table ($gc->tables) {
13996         my $minor_name = $minor_table->name;
13997         next if length $minor_name == 1;
13998         if (length $minor_name != 2) {
13999             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
14000             next;
14001         }
14002
14003         my $major_name = uc(substr($minor_name, 0, 1));
14004         my $major_table = $gc->table($major_name);
14005         $major_table += $minor_table;
14006     }
14007
14008     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
14009     # defines it as LC)
14010     my $LC = $gc->table('LC');
14011     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
14012     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
14013
14014
14015     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
14016                          # deliver the correct values in it
14017         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
14018
14019         # Lt not in release 1.
14020         if (defined $gc->table('Lt')) {
14021             $LC += $gc->table('Lt');
14022             $gc->table('Lt')->set_caseless_equivalent($LC);
14023         }
14024     }
14025     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
14026
14027     $gc->table('Ll')->set_caseless_equivalent($LC);
14028     $gc->table('Lu')->set_caseless_equivalent($LC);
14029
14030     # Create digit and case fold tables with the original file names for
14031     # backwards compatibility with applications that read them directly.
14032     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
14033                               Default_Map => "",
14034                               File => 'Digit',    # Trad. location
14035                               Directory => $map_directory,
14036                               Type => $STRING,
14037                               Replacement_Property => "Perl_Decimal_Digit",
14038                               Initialize => property_ref('Perl_Decimal_Digit'),
14039                             );
14040     $Digit->add_comment(join_lines(<<END
14041 This file gives the mapping of all code points which represent a single
14042 decimal digit [0-9] to their respective digits.  For example, the code point
14043 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
14044 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
14045 numerals.
14046 END
14047     ));
14048
14049     # Make sure this assumption in perl core code is valid in this Unicode
14050     # release, with known exceptions
14051     foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
14052         next if $range->end - $range->start == 9;
14053         next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
14054         next if $range->end == 0x19DA && $v_version eq v5.2.0;
14055         next if $range->end - $range->start < 9 && $v_version le 4.0.0;
14056         Carp::my_carp("Range $range unexpectedly doesn't contain 10"
14057                     . " decimal digits.  Code in regcomp.c assumes it does,"
14058                     . " and will have to be fixed.  Proceeding anyway.");
14059     }
14060
14061     Property->new('Legacy_Case_Folding',
14062                     File => "Fold",
14063                     Directory => $map_directory,
14064                     Default_Map => $CODE_POINT,
14065                     Type => $STRING,
14066                     Replacement_Property => "Case_Folding",
14067                     Format => $HEX_FORMAT,
14068                     Initialize => property_ref('cf'),
14069     );
14070
14071     # The Script_Extensions property started out as a clone of the Script
14072     # property.  But processing its data file caused some elements to be
14073     # replaced with different data.  (These elements were for the Common and
14074     # Inherited properties.)  This data is a qw() list of all the scripts that
14075     # the code points in the given range are in.  An example line is:
14076     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
14077     #
14078     # The code above has created a new match table named "Arab Syrc Thaa"
14079     # which contains 060C.  (The cloned table started out with this code point
14080     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
14081     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
14082     # match table.  This is repeated for all these tables and ranges.  The map
14083     # data is retained in the map table for reference, but the spurious match
14084     # tables are deleted.
14085
14086     if (defined $scx) {
14087         foreach my $table ($scx->tables) {
14088             next unless $table->name =~ /\s/;   # All the new and only the new
14089                                                 # tables have a space in their
14090                                                 # names
14091             my @scripts = split /\s+/, $table->name;
14092             foreach my $script (@scripts) {
14093                 my $script_table = $scx->table($script);
14094                 $script_table += $table;
14095             }
14096             $scx->delete_match_table($table);
14097         }
14098
14099         # Mark the scx table as the parent of the corresponding sc table for
14100         # those which are identical.  This causes the pod for the script table
14101         # to refer to the corresponding scx one.
14102         #
14103         # This has to be in a separate loop from above, so as to wait until
14104         # the tables are stabilized before checking for equivalency.
14105         if (defined $pod_directory) {
14106             foreach my $table ($scx->tables) {
14107                 my $plain_sc_equiv = $script->table($table->name);
14108                 if ($table->matches_identically_to($plain_sc_equiv)) {
14109                     $plain_sc_equiv->set_equivalent_to($table, Related => 1);
14110                 }
14111             }
14112         }
14113     }
14114
14115     return;
14116 }
14117
14118 sub pre_3_dot_1_Nl () {
14119
14120     # Return a range list for gc=nl for Unicode versions prior to 3.1, which
14121     # is when Unicode's became fully usable.  These code points were
14122     # determined by inspection and experimentation.  gc=nl is important for
14123     # certain Perl-extension properties that should be available in all
14124     # releases.
14125
14126     my $Nl = Range_List->new();
14127     if (defined (my $official = $gc->table('Nl'))) {
14128         $Nl += $official;
14129     }
14130     else {
14131         $Nl->add_range(0x2160, 0x2182);
14132         $Nl->add_range(0x3007, 0x3007);
14133         $Nl->add_range(0x3021, 0x3029);
14134     }
14135     $Nl->add_range(0xFE20, 0xFE23);
14136     $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
14137                                                             # these were added
14138     return $Nl;
14139 }
14140
14141 sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
14142                             # called before the Cn's are completely filled.
14143                             # Works on Unicodes earlier than ones that
14144                             # explicitly specify Cn.
14145     return if defined $Assigned;
14146
14147     if (! defined $gc || $gc->is_empty()) {
14148         Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
14149     }
14150
14151     $Assigned = $perl->add_match_table('Assigned',
14152                                 Description  => "All assigned code points",
14153                                 );
14154     while (defined (my $range = $gc->each_range())) {
14155         my $standard_value = standardize($range->value);
14156         next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
14157         $Assigned->add_range($range->start, $range->end);
14158     }
14159 }
14160
14161 sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
14162                         # Default_Ignorable_Code_Point property.  Works on
14163                         # Unicodes earlier than ones that explicitly specify
14164                         # DI.
14165     return if defined $DI;
14166
14167     if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
14168         $DI = $di->table('Y');
14169     }
14170     else {
14171         $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
14172                                               0x2060 .. 0x206F,
14173                                               0xFE00 .. 0xFE0F,
14174                                               0xFFF0 .. 0xFFFB,
14175                                             ]);
14176         if ($v_version ge v2.0) {
14177             $DI += $gc->table('Cf')
14178                 +  $gc->table('Cs');
14179
14180             # These are above the Unicode version 1 max
14181             $DI->add_range(0xE0000, 0xE0FFF);
14182         }
14183         $DI += $gc->table('Cc')
14184              - ord("\t")
14185              - utf8::unicode_to_native(0x0A)  # LINE FEED
14186              - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14187              - ord("\f")
14188              - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14189              - utf8::unicode_to_native(0x85); # NEL
14190     }
14191 }
14192
14193 sub calculate_NChar() {  # Create a Perl extension match table which is the
14194                          # same as the Noncharacter_Code_Point property, and
14195                          # set $NChar to point to it.  Works on Unicodes
14196                          # earlier than ones that explicitly specify NChar
14197     return if defined $NChar;
14198
14199     $NChar = $perl->add_match_table('_Perl_Nchar',
14200                                     Perl_Extension => 1,
14201                                     Fate => $INTERNAL_ONLY);
14202     if (defined (my $off_nchar = property_ref('NChar'))) {
14203         $NChar->initialize($off_nchar->table('Y'));
14204     }
14205     else {
14206         $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
14207         if ($v_version ge v2.0) {   # First release with these nchars
14208             for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
14209                 $NChar += [ $i .. $i+1 ];
14210             }
14211         }
14212     }
14213 }
14214
14215 sub handle_compare_versions () {
14216     # This fixes things up for the $compare_versions capability, where we
14217     # compare Unicode version X with version Y (with Y > X), and we are
14218     # running it on the Unicode Data for version Y.
14219     #
14220     # It works by calculating the code points whose meaning has been specified
14221     # after release X, by using the Age property.  The complement of this set
14222     # is the set of code points whose meaning is unchanged between the
14223     # releases.  This is the set the program restricts itself to.  It includes
14224     # everything whose meaning has been specified by the time version X came
14225     # along, plus those still unassigned by the time of version Y.  (We will
14226     # continue to use the word 'assigned' to mean 'meaning has been
14227     # specified', as it's shorter and is accurate in all cases except the
14228     # Noncharacter code points.)
14229     #
14230     # This function is run after all the properties specified by Unicode have
14231     # been calculated for release Y.  This makes sure we get all the nuances
14232     # of Y's rules.  (It is done before the Perl extensions are calculated, as
14233     # those are based entirely on the Unicode ones.)  But doing it after the
14234     # Unicode table calculations means we have to fix up the Unicode tables.
14235     # We do this by subtracting the code points that have been assigned since
14236     # X (which is actually done by ANDing each table of assigned code points
14237     # with the set of unchanged code points).  Most Unicode properties are of
14238     # the form such that all unassigned code points have a default, grab-bag,
14239     # property value which is changed when the code point gets assigned.  For
14240     # these, we just remove the changed code points from the table for the
14241     # latter property value, and add them back in to the grab-bag one.  A few
14242     # other properties are not entirely of this form and have values for some
14243     # or all unassigned code points that are not the grab-bag one.  These have
14244     # to be handled specially, and are hard-coded in to this routine based on
14245     # manual inspection of the Unicode character database.  A list of the
14246     # outlier code points is made for each of these properties, and those
14247     # outliers are excluded from adding and removing from tables.
14248     #
14249     # Note that there are glitches when comparing against Unicode 1.1, as some
14250     # Hangul syllables in it were later ripped out and eventually replaced
14251     # with other things.
14252
14253     print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
14254
14255     my $after_first_version = "All matching code points were added after "
14256                             . "Unicode $string_compare_versions";
14257
14258     # Calculate the delta as those code points that have been newly assigned
14259     # since the first compare version.
14260     my $delta = Range_List->new();
14261     foreach my $table ($age->tables) {
14262         use version;
14263         next if $table == $age->table('Unassigned');
14264         next if version->parse($table->name)
14265              le version->parse($string_compare_versions);
14266         $delta += $table;
14267     }
14268     if ($delta->is_empty) {
14269         die ("No changes; perhaps you need a 'DAge.txt' file?");
14270     }
14271
14272     my $unchanged = ~ $delta;
14273
14274     calculate_Assigned() if ! defined $Assigned;
14275     $Assigned &= $unchanged;
14276
14277     # $Assigned now contains the code points that were assigned as of Unicode
14278     # version X.
14279
14280     # A block is all or nothing.  If nothing is assigned in it, it all goes
14281     # back to the No_Block pool; but if even one code point is assigned, the
14282     # block is retained.
14283     my $no_block = $block->table('No_Block');
14284     foreach my $this_block ($block->tables) {
14285         next if     $this_block == $no_block
14286                 ||  ! ($this_block & $Assigned)->is_empty;
14287         $this_block->set_fate($SUPPRESSED, $after_first_version);
14288         foreach my $range ($this_block->ranges) {
14289             $block->replace_map($range->start, $range->end, 'No_Block')
14290         }
14291         $no_block += $this_block;
14292     }
14293
14294     my @special_delta_properties;   # List of properties that have to be
14295                                     # handled specially.
14296     my %restricted_delta;           # Keys are the entries in
14297                                     # @special_delta_properties;  values
14298                                     # are the range list of the code points
14299                                     # that behave normally when they get
14300                                     # assigned.
14301
14302     # In the next three properties, the Default Ignorable code points are
14303     # outliers.
14304     calculate_DI();
14305     $DI &= $unchanged;
14306
14307     push @special_delta_properties, property_ref('_Perl_GCB');
14308     $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14309
14310     if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
14311     {
14312         push @special_delta_properties, $cwnfkcc;
14313         $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14314     }
14315
14316     calculate_NChar();      # Non-character code points
14317     $NChar &= $unchanged;
14318
14319     # This may have to be updated from time-to-time to get the most accurate
14320     # results.
14321     my $default_BC_non_LtoR = Range_List->new(Initialize =>
14322                         # These came from the comments in v8.0 DBidiClass.txt
14323                                                         [ # AL
14324                                                             0x0600 .. 0x07BF,
14325                                                             0x08A0 .. 0x08FF,
14326                                                             0xFB50 .. 0xFDCF,
14327                                                             0xFDF0 .. 0xFDFF,
14328                                                             0xFE70 .. 0xFEFF,
14329                                                             0x1EE00 .. 0x1EEFF,
14330                                                            # R
14331                                                             0x0590 .. 0x05FF,
14332                                                             0x07C0 .. 0x089F,
14333                                                             0xFB1D .. 0xFB4F,
14334                                                             0x10800 .. 0x10FFF,
14335                                                             0x1E800 .. 0x1EDFF,
14336                                                             0x1EF00 .. 0x1EFFF,
14337                                                            # ET
14338                                                             0x20A0 .. 0x20CF,
14339                                                          ]
14340                                           );
14341     $default_BC_non_LtoR += $DI + $NChar;
14342     push @special_delta_properties, property_ref('BidiClass');
14343     $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
14344
14345     if (defined (my $eaw = property_ref('East_Asian_Width'))) {
14346
14347         my $default_EA_width_W = Range_List->new(Initialize =>
14348                                     # From comments in v8.0 EastAsianWidth.txt
14349                                                 [
14350                                                     0x3400 .. 0x4DBF,
14351                                                     0x4E00 .. 0x9FFF,
14352                                                     0xF900 .. 0xFAFF,
14353                                                     0x20000 .. 0x2A6DF,
14354                                                     0x2A700 .. 0x2B73F,
14355                                                     0x2B740 .. 0x2B81F,
14356                                                     0x2B820 .. 0x2CEAF,
14357                                                     0x2F800 .. 0x2FA1F,
14358                                                     0x20000 .. 0x2FFFD,
14359                                                     0x30000 .. 0x3FFFD,
14360                                                 ]
14361                                              );
14362         push @special_delta_properties, $eaw;
14363         $restricted_delta{$special_delta_properties[-1]}
14364                                                        = ~ $default_EA_width_W;
14365
14366         # Line break came along in the same release as East_Asian_Width, and
14367         # the non-grab-bag default set is a superset of the EAW one.
14368         if (defined (my $lb = property_ref('Line_Break'))) {
14369             my $default_LB_non_XX = Range_List->new(Initialize =>
14370                                         # From comments in v8.0 LineBreak.txt
14371                                                         [ 0x20A0 .. 0x20CF ]);
14372             $default_LB_non_XX += $default_EA_width_W;
14373             push @special_delta_properties, $lb;
14374             $restricted_delta{$special_delta_properties[-1]}
14375                                                         = ~ $default_LB_non_XX;
14376         }
14377     }
14378
14379     # Go through every property, skipping those we've already worked on, those
14380     # that are immutable, and the perl ones that will be calculated after this
14381     # routine has done its fixup.
14382     foreach my $property (property_ref('*')) {
14383         next if    $property == $perl     # Done later in the program
14384                 || $property == $block    # Done just above
14385                 || $property == $DI       # Done just above
14386                 || $property == $NChar    # Done just above
14387
14388                    # The next two are invariant across Unicode versions
14389                 || $property == property_ref('Pattern_Syntax')
14390                 || $property == property_ref('Pattern_White_Space');
14391
14392         #  Find the grab-bag value.
14393         my $default_map = $property->default_map;
14394
14395         if (! $property->to_create_match_tables) {
14396
14397             # Here there aren't any match tables.  So far, all such properties
14398             # have a default map, and don't require special handling.  Just
14399             # change each newly assigned code point back to the default map,
14400             # as if they were unassigned.
14401             foreach my $range ($delta->ranges) {
14402                 $property->add_map($range->start,
14403                                 $range->end,
14404                                 $default_map,
14405                                 Replace => $UNCONDITIONALLY);
14406             }
14407         }
14408         else {  # Here there are match tables.  Find the one (if any) for the
14409                 # grab-bag value that unassigned code points go to.
14410             my $default_table;
14411             if (defined $default_map) {
14412                 $default_table = $property->table($default_map);
14413             }
14414
14415             # If some code points don't go back to the the grab-bag when they
14416             # are considered unassigned, exclude them from the list that does
14417             # that.
14418             my $this_delta = $delta;
14419             my $this_unchanged = $unchanged;
14420             if (grep { $_ == $property } @special_delta_properties) {
14421                 $this_delta = $delta & $restricted_delta{$property};
14422                 $this_unchanged = ~ $this_delta;
14423             }
14424
14425             # Fix up each match table for this property.
14426             foreach my $table ($property->tables) {
14427                 if (defined $default_table && $table == $default_table) {
14428
14429                     # The code points assigned after release X (the ones we
14430                     # are excluding in this routine) go back on to the default
14431                     # (grab-bag) table.  However, some of these tables don't
14432                     # actually exist, but are specified solely by the other
14433                     # tables.  (In a binary property, we don't need to
14434                     # actually have an 'N' table, as it's just the complement
14435                     # of the 'Y' table.)  Such tables will be locked, so just
14436                     # skip those.
14437                     $table += $this_delta unless $table->locked;
14438                 }
14439                 else {
14440
14441                     # Here the table is not for the default value.  We need to
14442                     # subtract the code points we are ignoring for this
14443                     # comparison (the deltas) from it.  But if the table
14444                     # started out with nothing, no need to exclude anything,
14445                     # and want to skip it here anyway, so it gets listed
14446                     # properly in the pod.
14447                     next if $table->is_empty;
14448
14449                     # Save the deltas for later, before we do the subtraction
14450                     my $deltas = $table & $this_delta;
14451
14452                     $table &= $this_unchanged;
14453
14454                     # Suppress the table if the subtraction left it with
14455                     # nothing in it
14456                     if ($table->is_empty) {
14457                         if ($property->type == $BINARY) {
14458                             push @tables_that_may_be_empty, $table->complete_name;
14459                         }
14460                         else {
14461                             $table->set_fate($SUPPRESSED, $after_first_version);
14462                         }
14463                     }
14464
14465                     # Now we add the removed code points to the property's
14466                     # map, as they should now map to the grab-bag default
14467                     # property (which they did in the first comparison
14468                     # version).  But we don't have to do this if the map is
14469                     # only for internal use.
14470                     if (defined $default_map && $property->to_output_map) {
14471
14472                         # The gc property has pseudo property values whose names
14473                         # have length 1.  These are the union of all the
14474                         # property values whose name is longer than 1 and
14475                         # whose first letter is all the same.  The replacement
14476                         # is done once for the longer-named tables.
14477                         next if $property == $gc && length $table->name == 1;
14478
14479                         foreach my $range ($deltas->ranges) {
14480                             $property->add_map($range->start,
14481                                             $range->end,
14482                                             $default_map,
14483                                             Replace => $UNCONDITIONALLY);
14484                         }
14485                     }
14486                 }
14487             }
14488         }
14489     }
14490
14491     # The above code doesn't work on 'gc=C', as it is a superset of the default
14492     # ('Cn') table.  It's easiest to just special case it here.
14493     my $C = $gc->table('C');
14494     $C += $gc->table('Cn');
14495
14496     return;
14497 }
14498
14499 sub compile_perl() {
14500     # Create perl-defined tables.  Almost all are part of the pseudo-property
14501     # named 'perl' internally to this program.  Many of these are recommended
14502     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14503     # on those found there.
14504     # Almost all of these are equivalent to some Unicode property.
14505     # A number of these properties have equivalents restricted to the ASCII
14506     # range, with their names prefaced by 'Posix', to signify that these match
14507     # what the Posix standard says they should match.  A couple are
14508     # effectively this, but the name doesn't have 'Posix' in it because there
14509     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14510     # to the full Unicode range, by our guesses as to what is appropriate.
14511
14512     # 'All' is all code points.  As an error check, instead of just setting it
14513     # to be that, construct it to be the union of all the major categories
14514     $All = $perl->add_match_table('All',
14515       Description
14516         => "All code points, including those above Unicode.  Same as qr/./s",
14517       Matches_All => 1);
14518
14519     foreach my $major_table ($gc->tables) {
14520
14521         # Major categories are the ones with single letter names.
14522         next if length($major_table->name) != 1;
14523
14524         $All += $major_table;
14525     }
14526
14527     if ($All->max != $MAX_WORKING_CODEPOINT) {
14528         Carp::my_carp_bug("Generated highest code point ("
14529            . sprintf("%X", $All->max)
14530            . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14531     }
14532     if ($All->range_count != 1 || $All->min != 0) {
14533      Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14534     }
14535
14536     my $Any = $perl->add_match_table('Any',
14537                                     Description  => "All Unicode code points");
14538     $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14539     $Any->add_alias('Unicode');
14540
14541     calculate_Assigned();
14542
14543     my $ASCII = $perl->add_match_table('ASCII');
14544     if (defined $block) {   # This is equivalent to the block if have it.
14545         my $Unicode_ASCII = $block->table('Basic_Latin');
14546         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14547             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14548         }
14549     }
14550
14551     # Very early releases didn't have blocks, so initialize ASCII ourselves if
14552     # necessary
14553     if ($ASCII->is_empty) {
14554         if (! NON_ASCII_PLATFORM) {
14555             $ASCII->add_range(0, 127);
14556         }
14557         else {
14558             for my $i (0 .. 127) {
14559                 $ASCII->add_range(utf8::unicode_to_native($i),
14560                                   utf8::unicode_to_native($i));
14561             }
14562         }
14563     }
14564
14565     # Get the best available case definitions.  Early Unicode versions didn't
14566     # have Uppercase and Lowercase defined, so use the general category
14567     # instead for them, modified by hard-coding in the code points each is
14568     # missing.
14569     my $Lower = $perl->add_match_table('XPosixLower');
14570     my $Unicode_Lower = property_ref('Lowercase');
14571     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14572         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14573
14574     }
14575     else {
14576         $Lower += $gc->table('Lowercase_Letter');
14577
14578         # There are quite a few code points in Lower, that aren't in gc=lc,
14579         # and not all are in all releases.
14580         my $temp = Range_List->new(Initialize => [
14581                                                 utf8::unicode_to_native(0xAA),
14582                                                 utf8::unicode_to_native(0xBA),
14583                                                 0x02B0 .. 0x02B8,
14584                                                 0x02C0 .. 0x02C1,
14585                                                 0x02E0 .. 0x02E4,
14586                                                 0x0345,
14587                                                 0x037A,
14588                                                 0x1D2C .. 0x1D6A,
14589                                                 0x1D78,
14590                                                 0x1D9B .. 0x1DBF,
14591                                                 0x2071,
14592                                                 0x207F,
14593                                                 0x2090 .. 0x209C,
14594                                                 0x2170 .. 0x217F,
14595                                                 0x24D0 .. 0x24E9,
14596                                                 0x2C7C .. 0x2C7D,
14597                                                 0xA770,
14598                                                 0xA7F8 .. 0xA7F9,
14599                                 ]);
14600         $Lower += $temp & $Assigned;
14601     }
14602     my $Posix_Lower = $perl->add_match_table("PosixLower",
14603                             Initialize => $Lower & $ASCII,
14604                             );
14605
14606     my $Upper = $perl->add_match_table("XPosixUpper");
14607     my $Unicode_Upper = property_ref('Uppercase');
14608     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14609         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14610     }
14611     else {
14612
14613         # Unlike Lower, there are only two ranges in Upper that aren't in
14614         # gc=Lu, and all code points were assigned in all releases.
14615         $Upper += $gc->table('Uppercase_Letter');
14616         $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14617         $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14618     }
14619     my $Posix_Upper = $perl->add_match_table("PosixUpper",
14620                             Initialize => $Upper & $ASCII,
14621                             );
14622
14623     # Earliest releases didn't have title case.  Initialize it to empty if not
14624     # otherwise present
14625     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14626                                        Description => '(= \p{Gc=Lt})');
14627     my $lt = $gc->table('Lt');
14628
14629     # Earlier versions of mktables had this related to $lt since they have
14630     # identical code points, but their caseless equivalents are not the same,
14631     # one being 'Cased' and the other being 'LC', and so now must be kept as
14632     # separate entities.
14633     if (defined $lt) {
14634         $Title += $lt;
14635     }
14636     else {
14637         push @tables_that_may_be_empty, $Title->complete_name;
14638     }
14639
14640     my $Unicode_Cased = property_ref('Cased');
14641     if (defined $Unicode_Cased) {
14642         my $yes = $Unicode_Cased->table('Y');
14643         my $no = $Unicode_Cased->table('N');
14644         $Title->set_caseless_equivalent($yes);
14645         if (defined $Unicode_Upper) {
14646             $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14647             $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14648         }
14649         $Upper->set_caseless_equivalent($yes);
14650         if (defined $Unicode_Lower) {
14651             $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14652             $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14653         }
14654         $Lower->set_caseless_equivalent($yes);
14655     }
14656     else {
14657         # If this Unicode version doesn't have Cased, set up the Perl
14658         # extension from first principles.  From Unicode 5.1: Definition D120:
14659         # A character C is defined to be cased if and only if C has the
14660         # Lowercase or Uppercase property or has a General_Category value of
14661         # Titlecase_Letter.
14662         my $cased = $perl->add_match_table('Cased',
14663                         Initialize => $Lower + $Upper + $Title,
14664                         Description => 'Uppercase or Lowercase or Titlecase',
14665                         );
14666         # $notcased is purely for the caseless equivalents below
14667         my $notcased = $perl->add_match_table('_Not_Cased',
14668                                 Initialize => ~ $cased,
14669                                 Fate => $INTERNAL_ONLY,
14670                                 Description => 'All not-cased code points');
14671         $Title->set_caseless_equivalent($cased);
14672         if (defined $Unicode_Upper) {
14673             $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14674             $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14675         }
14676         $Upper->set_caseless_equivalent($cased);
14677         if (defined $Unicode_Lower) {
14678             $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14679             $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14680         }
14681         $Lower->set_caseless_equivalent($cased);
14682     }
14683
14684     # The remaining perl defined tables are mostly based on Unicode TR 18,
14685     # "Annex C: Compatibility Properties".  All of these have two versions,
14686     # one whose name generally begins with Posix that is posix-compliant, and
14687     # one that matches Unicode characters beyond the Posix, ASCII range
14688
14689     my $Alpha = $perl->add_match_table('XPosixAlpha');
14690
14691     # Alphabetic was not present in early releases
14692     my $Alphabetic = property_ref('Alphabetic');
14693     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14694         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14695     }
14696     else {
14697
14698         # The Alphabetic property doesn't exist for early releases, so
14699         # generate it.  The actual definition, in 5.2 terms is:
14700         #
14701         # gc=L + gc=Nl + Other_Alphabetic
14702         #
14703         # Other_Alphabetic is also not defined in these early releases, but it
14704         # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14705         # those last two as well, then subtract the relatively few of them that
14706         # shouldn't have been added.  (The gc=So range is the circled capital
14707         # Latin characters.  Early releases mistakenly didn't also include the
14708         # lower-case versions of these characters, and so we don't either, to
14709         # maintain consistency with those releases that first had this
14710         # property.
14711         $Alpha->initialize($gc->table('Letter')
14712                            + pre_3_dot_1_Nl()
14713                            + $gc->table('Mn')
14714                            + $gc->table('Mc')
14715                         );
14716         $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14717         foreach my $range (     [ 0x0300, 0x0344 ],
14718                                 [ 0x0346, 0x034E ],
14719                                 [ 0x0360, 0x0362 ],
14720                                 [ 0x0483, 0x0486 ],
14721                                 [ 0x0591, 0x05AF ],
14722                                 [ 0x06DF, 0x06E0 ],
14723                                 [ 0x06EA, 0x06EC ],
14724                                 [ 0x0740, 0x074A ],
14725                                 0x093C,
14726                                 0x094D,
14727                                 [ 0x0951, 0x0954 ],
14728                                 0x09BC,
14729                                 0x09CD,
14730                                 0x0A3C,
14731                                 0x0A4D,
14732                                 0x0ABC,
14733                                 0x0ACD,
14734                                 0x0B3C,
14735                                 0x0B4D,
14736                                 0x0BCD,
14737                                 0x0C4D,
14738                                 0x0CCD,
14739                                 0x0D4D,
14740                                 0x0DCA,
14741                                 [ 0x0E47, 0x0E4C ],
14742                                 0x0E4E,
14743                                 [ 0x0EC8, 0x0ECC ],
14744                                 [ 0x0F18, 0x0F19 ],
14745                                 0x0F35,
14746                                 0x0F37,
14747                                 0x0F39,
14748                                 [ 0x0F3E, 0x0F3F ],
14749                                 [ 0x0F82, 0x0F84 ],
14750                                 [ 0x0F86, 0x0F87 ],
14751                                 0x0FC6,
14752                                 0x1037,
14753                                 0x1039,
14754                                 [ 0x17C9, 0x17D3 ],
14755                                 [ 0x20D0, 0x20DC ],
14756                                 0x20E1,
14757                                 [ 0x302A, 0x302F ],
14758                                 [ 0x3099, 0x309A ],
14759                                 [ 0xFE20, 0xFE23 ],
14760                                 [ 0x1D165, 0x1D169 ],
14761                                 [ 0x1D16D, 0x1D172 ],
14762                                 [ 0x1D17B, 0x1D182 ],
14763                                 [ 0x1D185, 0x1D18B ],
14764                                 [ 0x1D1AA, 0x1D1AD ],
14765         ) {
14766             if (ref $range) {
14767                 $Alpha->delete_range($range->[0], $range->[1]);
14768             }
14769             else {
14770                 $Alpha->delete_range($range, $range);
14771             }
14772         }
14773         $Alpha->add_description('Alphabetic');
14774         $Alpha->add_alias('Alphabetic');
14775     }
14776     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14777                             Initialize => $Alpha & $ASCII,
14778                             );
14779     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14780     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14781
14782     my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14783                         Description => 'Alphabetic and (decimal) Numeric',
14784                         Initialize => $Alpha + $gc->table('Decimal_Number'),
14785                         );
14786     $perl->add_match_table("PosixAlnum",
14787                             Initialize => $Alnum & $ASCII,
14788                             );
14789
14790     my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14791                                 Description => '\w, including beyond ASCII;'
14792                                             . ' = \p{Alnum} + \pM + \p{Pc}'
14793                                             . ' + \p{Join_Control}',
14794                                 Initialize => $Alnum + $gc->table('Mark'),
14795                                 );
14796     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14797     if (defined $Pc) {
14798         $Word += $Pc;
14799     }
14800     else {
14801         $Word += ord('_');  # Make sure this is a $Word
14802     }
14803     my $JC = property_ref('Join_Control');  # Wasn't in release 1
14804     if (defined $JC) {
14805         $Word += $JC->table('Y');
14806     }
14807     else {
14808         $Word += 0x200C + 0x200D;
14809     }
14810
14811     # This is a Perl extension, so the name doesn't begin with Posix.
14812     my $PerlWord = $perl->add_match_table('PosixWord',
14813                     Description => '\w, restricted to ASCII',
14814                     Initialize => $Word & $ASCII,
14815                     );
14816     $PerlWord->add_alias('PerlWord');
14817
14818     my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14819                                 Description => '\h, Horizontal white space',
14820
14821                                 # 200B is Zero Width Space which is for line
14822                                 # break control, and was listed as
14823                                 # Space_Separator in early releases
14824                                 Initialize => $gc->table('Space_Separator')
14825                                             +   ord("\t")
14826                                             -   0x200B, # ZWSP
14827                                 );
14828     $Blank->add_alias('HorizSpace');        # Another name for it.
14829     $perl->add_match_table("PosixBlank",
14830                             Initialize => $Blank & $ASCII,
14831                             );
14832
14833     my $VertSpace = $perl->add_match_table('VertSpace',
14834                             Description => '\v',
14835                             Initialize =>
14836                                $gc->table('Line_Separator')
14837                              + $gc->table('Paragraph_Separator')
14838                              + utf8::unicode_to_native(0x0A)  # LINE FEED
14839                              + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14840                              + ord("\f")
14841                              + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14842                              + utf8::unicode_to_native(0x85)  # NEL
14843                     );
14844     # No Posix equivalent for vertical space
14845
14846     my $Space = $perl->add_match_table('XPosixSpace',
14847                 Description => '\s including beyond ASCII and vertical tab',
14848                 Initialize => $Blank + $VertSpace,
14849     );
14850     $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14851     $Space->add_alias('SpacePerl');
14852     $Space->add_alias('Space') if $v_version lt v4.1.0;
14853
14854     my $Posix_space = $perl->add_match_table("PosixSpace",
14855                             Initialize => $Space & $ASCII,
14856                             );
14857     $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14858
14859     my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14860                                         Description => 'Control characters');
14861     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14862     $perl->add_match_table("PosixCntrl",
14863                             Description => "ASCII control characters",
14864                             Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14865                                          . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14866                                          . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14867                                          . " HT, LF, NAK, NUL, RS, SI, SO,"
14868                                          . " SOH, STX, SUB, SYN, US, VT",
14869                             Initialize => $Cntrl & $ASCII,
14870                             );
14871
14872     my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14873     my $Cs = $gc->table('Cs');
14874     if (defined $Cs && ! $Cs->is_empty) {
14875         $perl_surrogate += $Cs;
14876     }
14877     else {
14878         push @tables_that_may_be_empty, '_Perl_Surrogate';
14879     }
14880
14881     # $controls is a temporary used to construct Graph.
14882     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14883                                                 + $gc->table('Control')
14884                                                 + $perl_surrogate);
14885
14886     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14887     my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14888                         Description => 'Characters that are graphical',
14889                         Initialize => ~ ($Space + $controls),
14890                         );
14891     $perl->add_match_table("PosixGraph",
14892                             Initialize => $Graph & $ASCII,
14893                             );
14894
14895     $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14896                         Description => 'Characters that are graphical plus space characters (but no controls)',
14897                         Initialize => $Blank + $Graph - $gc->table('Control'),
14898                         );
14899     $perl->add_match_table("PosixPrint",
14900                             Initialize => $print & $ASCII,
14901                             );
14902
14903     my $Punct = $perl->add_match_table('Punct');
14904     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14905
14906     # \p{punct} doesn't include the symbols, which posix does
14907     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14908                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
14909                     Initialize => $gc->table('Punctuation')
14910                                 + ($ASCII & $gc->table('Symbol')),
14911                                 Perl_Extension => 1
14912         );
14913     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14914         Initialize => $ASCII & $XPosixPunct,
14915         );
14916
14917     my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14918                             Description => '[0-9] + all other decimal digits');
14919     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14920     my $PosixDigit = $perl->add_match_table("PosixDigit",
14921                                             Initialize => $Digit & $ASCII,
14922                                             );
14923
14924     # Hex_Digit was not present in first release
14925     my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14926     my $Hex = property_ref('Hex_Digit');
14927     if (defined $Hex && ! $Hex->is_empty) {
14928         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14929     }
14930     else {
14931         $Xdigit->initialize([ ord('0') .. ord('9'),
14932                               ord('A') .. ord('F'),
14933                               ord('a') .. ord('f'),
14934                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14935     }
14936
14937     # AHex was not present in early releases
14938     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14939     my $AHex = property_ref('ASCII_Hex_Digit');
14940     if (defined $AHex && ! $AHex->is_empty) {
14941         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14942     }
14943     else {
14944         $PosixXDigit->initialize($Xdigit & $ASCII);
14945         $PosixXDigit->add_alias('AHex');
14946         $PosixXDigit->add_alias('Ascii_Hex_Digit');
14947     }
14948
14949     my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14950                     Description => "Code points that particpate in some fold",
14951                     );
14952     my $loc_problem_folds = $perl->add_match_table(
14953                "_Perl_Problematic_Locale_Folds",
14954                Description =>
14955                    "Code points that are in some way problematic under locale",
14956     );
14957
14958     # This allows regexec.c to skip some work when appropriate.  Some of the
14959     # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14960     my $loc_problem_folds_start = $perl->add_match_table(
14961                "_Perl_Problematic_Locale_Foldeds_Start",
14962                Description =>
14963                    "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14964     );
14965
14966     my $cf = property_ref('Case_Folding');
14967
14968     # Every character 0-255 is problematic because what each folds to depends
14969     # on the current locale
14970     $loc_problem_folds->add_range(0, 255);
14971     $loc_problem_folds->add_range(0x130, 0x131);    # These are problematic in
14972                                                     # Turkic locales
14973     $loc_problem_folds_start += $loc_problem_folds;
14974
14975     # Also problematic are anything these fold to outside the range.  Likely
14976     # forever the only thing folded to by these outside the 0-255 range is the
14977     # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14978     # completely general, which should catch any unexpected changes or errors.
14979     # We look at each code point 0-255, and add its fold (including each part
14980     # of a multi-char fold) to the list.  See commit message
14981     # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14982     # of the MU issue.
14983     foreach my $range ($loc_problem_folds->ranges) {
14984         foreach my $code_point ($range->start .. $range->end) {
14985             my $fold_range = $cf->containing_range($code_point);
14986             next unless defined $fold_range;
14987
14988             # Skip if folds to itself
14989             next if $fold_range->value eq $CODE_POINT;
14990
14991             my @hex_folds = split " ", $fold_range->value;
14992             my $start_cp = $hex_folds[0];
14993             next if $start_cp eq $CODE_POINT;
14994             $start_cp = hex $start_cp;
14995             foreach my $i (0 .. @hex_folds - 1) {
14996                 my $cp = $hex_folds[$i];
14997                 next if $cp eq $CODE_POINT;
14998                 $cp = hex $cp;
14999                 next unless $cp > 255;    # Already have the < 256 ones
15000
15001                 $loc_problem_folds->add_range($cp, $cp);
15002                 $loc_problem_folds_start->add_range($start_cp, $start_cp);
15003             }
15004         }
15005     }
15006
15007     my $folds_to_multi_char = $perl->add_match_table(
15008          "_Perl_Folds_To_Multi_Char",
15009          Description =>
15010               "Code points whose fold is a string of more than one character",
15011     );
15012     my $in_multi_fold = $perl->add_match_table(
15013                "_Perl_Is_In_Multi_Char_Fold",
15014                Description =>
15015                    "Code points that are in some multiple character fold",
15016     );
15017     if ($v_version lt v3.0.1) {
15018         push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
15019                                         '_Perl_Is_In_Multi_Char_Fold',
15020                                         '_Perl_Non_Final_Folds';
15021     }
15022
15023     # Look through all the known folds to populate these tables.
15024     foreach my $range ($cf->ranges) {
15025         next if $range->value eq $CODE_POINT;
15026         my $start = $range->start;
15027         my $end = $range->end;
15028         $any_folds->add_range($start, $end);
15029
15030         my @hex_folds = split " ", $range->value;
15031         if (@hex_folds > 1) {   # Is multi-char fold
15032             $folds_to_multi_char->add_range($start, $end);
15033         }
15034
15035         my $found_locale_problematic = 0;
15036
15037         my $folded_count = @hex_folds;
15038         if ($folded_count > 3) {
15039             die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's  $folded_count for U+" . sprintf "%04X", $range->start);
15040         }
15041
15042         # Look at each of the folded-to characters...
15043         foreach my $i (1 .. $folded_count) {
15044             my $cp = hex $hex_folds[$i-1];
15045             $any_folds->add_range($cp, $cp);
15046
15047             # The fold is problematic if any of the folded-to characters is
15048             # already considered problematic.
15049             if ($loc_problem_folds->contains($cp)) {
15050                 $loc_problem_folds->add_range($start, $end);
15051                 $found_locale_problematic = 1;
15052             }
15053
15054             if ($folded_count > 1) {
15055                 $in_multi_fold->add_range($cp, $cp);
15056             }
15057         }
15058
15059         # If this is a problematic fold, add to the start chars the
15060         # folding-from characters and first folded-to character.
15061         if ($found_locale_problematic) {
15062             $loc_problem_folds_start->add_range($start, $end);
15063             my $cp = hex $hex_folds[0];
15064             $loc_problem_folds_start->add_range($cp, $cp);
15065         }
15066     }
15067
15068     my $dt = property_ref('Decomposition_Type');
15069     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
15070         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
15071         Perl_Extension => 1,
15072         Note => 'Union of all non-canonical decompositions',
15073         );
15074
15075     # For backward compatibility, Perl has its own definition for IDStart.
15076     # It is regular XID_Start plus the underscore, but all characters must be
15077     # Word characters as well
15078     my $XID_Start = property_ref('XID_Start');
15079     my $perl_xids = $perl->add_match_table('_Perl_IDStart',
15080                                             Perl_Extension => 1,
15081                                             Fate => $INTERNAL_ONLY,
15082                                             Initialize => ord('_')
15083                                             );
15084     if (defined $XID_Start
15085         || defined ($XID_Start = property_ref('ID_Start')))
15086     {
15087         $perl_xids += $XID_Start->table('Y');
15088     }
15089     else {
15090         # For Unicode versions that don't have the property, construct our own
15091         # from first principles.  The actual definition is:
15092         #     Letters
15093         #   + letter numbers (Nl)
15094         #   - Pattern_Syntax
15095         #   - Pattern_White_Space
15096         #   + stability extensions
15097         #   - NKFC modifications
15098         #
15099         # What we do in the code below is to include the identical code points
15100         # that are in the first release that had Unicode's version of this
15101         # property, essentially extrapolating backwards.  There were no
15102         # stability extensions until v4.1, so none are included; likewise in
15103         # no Unicode version so far do subtracting PatSyn and PatWS make any
15104         # difference, so those also are ignored.
15105         $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
15106
15107         # We do subtract the NFKC modifications that are in the first version
15108         # that had this property.  We don't bother to test if they are in the
15109         # version in question, because if they aren't, the operation is a
15110         # no-op.  The NKFC modifications are discussed in
15111         # http://www.unicode.org/reports/tr31/#NFKC_Modifications
15112         foreach my $range ( 0x037A,
15113                             0x0E33,
15114                             0x0EB3,
15115                             [ 0xFC5E, 0xFC63 ],
15116                             [ 0xFDFA, 0xFE70 ],
15117                             [ 0xFE72, 0xFE76 ],
15118                             0xFE78,
15119                             0xFE7A,
15120                             0xFE7C,
15121                             0xFE7E,
15122                             [ 0xFF9E, 0xFF9F ],
15123         ) {
15124             if (ref $range) {
15125                 $perl_xids->delete_range($range->[0], $range->[1]);
15126             }
15127             else {
15128                 $perl_xids->delete_range($range, $range);
15129             }
15130         }
15131     }
15132
15133     $perl_xids &= $Word;
15134
15135     my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
15136                                         Perl_Extension => 1,
15137                                         Fate => $INTERNAL_ONLY);
15138     my $XIDC = property_ref('XID_Continue');
15139     if (defined $XIDC
15140         || defined ($XIDC = property_ref('ID_Continue')))
15141     {
15142         $perl_xidc += $XIDC->table('Y');
15143     }
15144     else {
15145         # Similarly, we construct our own XIDC if necessary for early Unicode
15146         # versions.  The definition is:
15147         #     everything in XIDS
15148         #   + Gc=Mn
15149         #   + Gc=Mc
15150         #   + Gc=Nd
15151         #   + Gc=Pc
15152         #   - Pattern_Syntax
15153         #   - Pattern_White_Space
15154         #   + stability extensions
15155         #   - NFKC modifications
15156         #
15157         # The same thing applies to this as with XIDS for the PatSyn, PatWS,
15158         # and stability extensions.  There is a somewhat different set of NFKC
15159         # mods to remove (and add in this case).  The ones below make this
15160         # have identical code points as in the first release that defined it.
15161         $perl_xidc += $perl_xids
15162                     + $gc->table('L')
15163                     + $gc->table('Mn')
15164                     + $gc->table('Mc')
15165                     + $gc->table('Nd')
15166                     + utf8::unicode_to_native(0xB7)
15167                     ;
15168         if (defined (my $pc = $gc->table('Pc'))) {
15169             $perl_xidc += $pc;
15170         }
15171         else {  # 1.1.5 didn't have Pc, but these should have been in it
15172             $perl_xidc += 0xFF3F;
15173             $perl_xidc->add_range(0x203F, 0x2040);
15174             $perl_xidc->add_range(0xFE33, 0xFE34);
15175             $perl_xidc->add_range(0xFE4D, 0xFE4F);
15176         }
15177
15178         # Subtract the NFKC mods
15179         foreach my $range ( 0x037A,
15180                             [ 0xFC5E, 0xFC63 ],
15181                             [ 0xFDFA, 0xFE1F ],
15182                             0xFE70,
15183                             [ 0xFE72, 0xFE76 ],
15184                             0xFE78,
15185                             0xFE7A,
15186                             0xFE7C,
15187                             0xFE7E,
15188         ) {
15189             if (ref $range) {
15190                 $perl_xidc->delete_range($range->[0], $range->[1]);
15191             }
15192             else {
15193                 $perl_xidc->delete_range($range, $range);
15194             }
15195         }
15196     }
15197
15198     $perl_xidc &= $Word;
15199
15200     my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
15201                     Perl_Extension => 1,
15202                     Fate => $INTERNAL_ONLY,
15203                     Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
15204                     );
15205
15206     my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
15207                         Perl_Extension => 1,
15208                         Fate => $INTERNAL_ONLY,
15209                         Initialize => $perl_xidc
15210                                     + ord(" ")
15211                                     + ord("(")
15212                                     + ord(")")
15213                                     + ord("-")
15214                         );
15215
15216     my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
15217
15218     if (@named_sequences) {
15219         push @composition, 'Named_Sequence';
15220         foreach my $sequence (@named_sequences) {
15221             $perl_charname->add_anomalous_entry($sequence);
15222         }
15223     }
15224
15225     my $alias_sentence = "";
15226     my %abbreviations;
15227     my $alias = property_ref('_Perl_Name_Alias');
15228     $perl_charname->set_proxy_for('_Perl_Name_Alias');
15229
15230     # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
15231     # with respect to any existing entry depends on the entry type.
15232     # Corrections go before said entry, as they should be returned in
15233     # preference over the existing entry.  (A correction to a correction
15234     # should be later in the _Perl_Name_Alias table, so it will correctly
15235     # precede the erroneous correction in Perl_Charnames.)
15236     #
15237     # Abbreviations go after everything else, so they are saved temporarily in
15238     # a hash for later.
15239     #
15240     # Everything else is added added afterwards, which preserves the input
15241     # ordering
15242
15243     foreach my $range ($alias->ranges) {
15244         next if $range->value eq "";
15245         my $code_point = $range->start;
15246         if ($code_point != $range->end) {
15247             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
15248         }
15249         my ($value, $type) = split ': ', $range->value;
15250         my $replace_type;
15251         if ($type eq 'correction') {
15252             $replace_type = $MULTIPLE_BEFORE;
15253         }
15254         elsif ($type eq 'abbreviation') {
15255
15256             # Save for later
15257             $abbreviations{$value} = $code_point;
15258             next;
15259         }
15260         else {
15261             $replace_type = $MULTIPLE_AFTER;
15262         }
15263
15264         # Actually add; before or after current entry(ies) as determined
15265         # above.
15266
15267         $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
15268     }
15269     $alias_sentence = <<END;
15270 The _Perl_Name_Alias property adds duplicate code point entries that are
15271 alternatives to the original name.  If an addition is a corrected
15272 name, it will be physically first in the table.  The original (less correct,
15273 but still valid) name will be next; then any alternatives, in no particular
15274 order; and finally any abbreviations, again in no particular order.
15275 END
15276
15277     # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
15278     # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
15279     # so should be first in the file; the other names have precedence starting
15280     # in 6.1,
15281     my $before_or_after = ($v_version lt v6.1.0)
15282                           ? $MULTIPLE_BEFORE
15283                           : $MULTIPLE_AFTER;
15284
15285     foreach my $range (property_ref('Unicode_1_Name')->ranges) {
15286         my $code_point = $range->start;
15287         my $unicode_1_value = $range->value;
15288         next if $unicode_1_value eq "";     # Skip if name doesn't exist.
15289
15290         if ($code_point != $range->end) {
15291             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
15292         }
15293
15294         # To handle EBCDIC, we don't hard code in the code points of the
15295         # controls; instead realizing that all of them are below 256.
15296         last if $code_point > 255;
15297
15298         # We only add in the controls.
15299         next if $gc->value_of($code_point) ne 'Cc';
15300
15301         # We reject this Unicode1 name for later Perls, as it is used for
15302         # another code point
15303         next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
15304
15305         # This won't add an exact duplicate.
15306         $perl_charname->add_duplicate($code_point, $unicode_1_value,
15307                                         Replace => $before_or_after);
15308     }
15309
15310     # Now that have everything added, add in abbreviations after
15311     # everything else.  Sort so results don't change between runs of this
15312     # program
15313     foreach my $value (sort keys %abbreviations) {
15314         $perl_charname->add_duplicate($abbreviations{$value}, $value,
15315                                         Replace => $MULTIPLE_AFTER);
15316     }
15317
15318     my $comment;
15319     if (@composition <= 2) { # Always at least 2
15320         $comment = join " and ", @composition;
15321     }
15322     else {
15323         $comment = join ", ", @composition[0 .. scalar @composition - 2];
15324         $comment .= ", and $composition[-1]";
15325     }
15326
15327     $perl_charname->add_comment(join_lines( <<END
15328 This file is for charnames.pm.  It is the union of the $comment properties.
15329 Unicode_1_Name entries are used only for nameless code points in the Name
15330 property.
15331 $alias_sentence
15332 This file doesn't include the algorithmically determinable names.  For those,
15333 use 'unicore/Name.pm'
15334 END
15335     ));
15336     property_ref('Name')->add_comment(join_lines( <<END
15337 This file doesn't include the algorithmically determinable names.  For those,
15338 use 'unicore/Name.pm'
15339 END
15340     ));
15341
15342     # Construct the Present_In property from the Age property.
15343     if (-e 'DAge.txt' && defined $age) {
15344         my $default_map = $age->default_map;
15345         my $in = Property->new('In',
15346                                 Default_Map => $default_map,
15347                                 Full_Name => "Present_In",
15348                                 Perl_Extension => 1,
15349                                 Type => $ENUM,
15350                                 Initialize => $age,
15351                                 );
15352         $in->add_comment(join_lines(<<END
15353 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
15354 same as for $age, and not for what $in really means.  This is because anything
15355 defined in a given release should have multiple values: that release and all
15356 higher ones.  But only one value per code point can be represented in a table
15357 like this.
15358 END
15359         ));
15360
15361         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
15362         # lowest numbered (earliest) come first, with the non-numeric one
15363         # last.
15364         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15365                                             ? 1
15366                                             : ($b->name !~ /^[\d.]*$/)
15367                                                 ? -1
15368                                                 : $a->name <=> $b->name
15369                                             } $age->tables;
15370
15371         # The Present_In property is the cumulative age properties.  The first
15372         # one hence is identical to the first age one.
15373         my $previous_in = $in->add_match_table($first_age->name);
15374         $previous_in->set_equivalent_to($first_age, Related => 1);
15375
15376         my $description_start = "Code point's usage introduced in version ";
15377         $first_age->add_description($description_start . $first_age->name);
15378
15379         # To construct the accumulated values, for each of the age tables
15380         # starting with the 2nd earliest, merge the earliest with it, to get
15381         # all those code points existing in the 2nd earliest.  Repeat merging
15382         # the new 2nd earliest with the 3rd earliest to get all those existing
15383         # in the 3rd earliest, and so on.
15384         foreach my $current_age (@rest_ages) {
15385             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
15386
15387             my $current_in = $in->add_match_table(
15388                                     $current_age->name,
15389                                     Initialize => $current_age + $previous_in,
15390                                     Description => $description_start
15391                                                     . $current_age->name
15392                                                     . ' or earlier',
15393                                     );
15394             foreach my $alias ($current_age->aliases) {
15395                 $current_in->add_alias($alias->name);
15396             }
15397             $previous_in = $current_in;
15398
15399             # Add clarifying material for the corresponding age file.  This is
15400             # in part because of the confusing and contradictory information
15401             # given in the Standard's documentation itself, as of 5.2.
15402             $current_age->add_description(
15403                             "Code point's usage was introduced in version "
15404                             . $current_age->name);
15405             $current_age->add_note("See also $in");
15406
15407         }
15408
15409         # And finally the code points whose usages have yet to be decided are
15410         # the same in both properties.  Note that permanently unassigned code
15411         # points actually have their usage assigned (as being permanently
15412         # unassigned), so that these tables are not the same as gc=cn.
15413         my $unassigned = $in->add_match_table($default_map);
15414         my $age_default = $age->table($default_map);
15415         $age_default->add_description(<<END
15416 Code point's usage has not been assigned in any Unicode release thus far.
15417 END
15418         );
15419         $unassigned->set_equivalent_to($age_default, Related => 1);
15420     }
15421
15422     my $patws = $perl->add_match_table('_Perl_PatWS',
15423                                        Perl_Extension => 1,
15424                                        Fate => $INTERNAL_ONLY);
15425     if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15426         $patws->initialize($off_patws->table('Y'));
15427     }
15428     else {
15429         $patws->initialize([ ord("\t"),
15430                              ord("\n"),
15431                              utf8::unicode_to_native(0x0B), # VT
15432                              ord("\f"),
15433                              ord("\r"),
15434                              ord(" "),
15435                              utf8::unicode_to_native(0x85), # NEL
15436                              0x200E..0x200F,             # Left, Right marks
15437                              0x2028..0x2029              # Line, Paragraph seps
15438                            ] );
15439     }
15440
15441     # See L<perlfunc/quotemeta>
15442     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15443                                            Perl_Extension => 1,
15444                                            Fate => $INTERNAL_ONLY,
15445
15446                                            # Initialize to what's common in
15447                                            # all Unicode releases.
15448                                            Initialize =>
15449                                                   $gc->table('Control')
15450                                                 + $Space
15451                                                 + $patws
15452                                                 + ((~ $Word) & $ASCII)
15453                            );
15454
15455     if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15456         $quotemeta += $patsyn->table('Y');
15457     }
15458     else {
15459         $quotemeta += ((~ $Word) & Range->new(0, 255))
15460                     - utf8::unicode_to_native(0xA8)
15461                     - utf8::unicode_to_native(0xAF)
15462                     - utf8::unicode_to_native(0xB2)
15463                     - utf8::unicode_to_native(0xB3)
15464                     - utf8::unicode_to_native(0xB4)
15465                     - utf8::unicode_to_native(0xB7)
15466                     - utf8::unicode_to_native(0xB8)
15467                     - utf8::unicode_to_native(0xB9)
15468                     - utf8::unicode_to_native(0xBC)
15469                     - utf8::unicode_to_native(0xBD)
15470                     - utf8::unicode_to_native(0xBE);
15471         $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15472                         # same in all releases
15473                         0x2010 .. 0x2027,
15474                         0x2030 .. 0x203E,
15475                         0x2041 .. 0x2053,
15476                         0x2055 .. 0x205E,
15477                         0x2190 .. 0x245F,
15478                         0x2500 .. 0x2775,
15479                         0x2794 .. 0x2BFF,
15480                         0x2E00 .. 0x2E7F,
15481                         0x3001 .. 0x3003,
15482                         0x3008 .. 0x3020,
15483                         0x3030 .. 0x3030,
15484                         0xFD3E .. 0xFD3F,
15485                         0xFE45 .. 0xFE46
15486                       ];
15487     }
15488
15489     if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15490         $quotemeta += $di->table('Y')
15491     }
15492     else {
15493         if ($v_version ge v2.0) {
15494             $quotemeta += $gc->table('Cf')
15495                        +  $gc->table('Cs');
15496
15497             # These are above the Unicode version 1 max
15498             $quotemeta->add_range(0xE0000, 0xE0FFF);
15499         }
15500         $quotemeta += $gc->table('Cc')
15501                     - $Space;
15502         my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15503                                                    0x2060 .. 0x206F,
15504                                                    0xFE00 .. 0xFE0F,
15505                                                    0xFFF0 .. 0xFFFB,
15506                                                   ]);
15507         $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15508         $quotemeta += $temp;
15509     }
15510     calculate_DI();
15511     $quotemeta += $DI;
15512
15513     calculate_NChar();
15514
15515     # Finished creating all the perl properties.  All non-internal non-string
15516     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15517     # an underscore.)  These do not get a separate entry in the pod file
15518     foreach my $table ($perl->tables) {
15519         foreach my $alias ($table->aliases) {
15520             next if $alias->name =~ /^_/;
15521             $table->add_alias('Is_' . $alias->name,
15522                                Re_Pod_Entry => 0,
15523                                UCD => 0,
15524                                Status => $alias->status,
15525                                OK_as_Filename => 0);
15526         }
15527     }
15528
15529     # Perl tailors the WordBreak property so that \b{wb} doesn't split
15530     # adjacent spaces into separate words.  Unicode 11.0 moved in that
15531     # direction, but left TAB,  FIGURE SPACE (U+2007), and (ironically) NO
15532     # BREAK SPACE as breaking, so we retained the original Perl customization.
15533     # To do this, in the Perl copy of WB, simply replace the mappings of
15534     # horizontal space characters that otherwise would map to the default or
15535     # the 11.0 'WSegSpace' to instead map to our tailoring.
15536     my $perl_wb = property_ref('_Perl_WB');
15537     my $default = $perl_wb->default_map;
15538     for my $range ($Blank->ranges) {
15539         for my $i ($range->start .. $range->end) {
15540             my $value = $perl_wb->value_of($i);
15541
15542             next unless $value eq $default || $value eq 'WSegSpace';
15543             $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15544                               Replace => $UNCONDITIONALLY);
15545         }
15546     }
15547
15548     # Also starting in Unicode 11.0, rules for some of the boundary types are
15549     # based on a non-UCD property (which we have read in if it exists).
15550     # Recall that these boundary properties partition the code points into
15551     # equivalence classes (represented as enums).
15552     #
15553     # The loop below goes through each code point that matches the non-UCD
15554     # property, and for each current equivalence class containing such a code
15555     # point, splits it so that those that are in both are now in a newly
15556     # created equivalence class whose name is a combination of the property
15557     # and the old class name, leaving unchanged everything that doesn't match
15558     # the non-UCD property.
15559     my $pictographic_emoji = property_ref('XPG');
15560     if (defined $pictographic_emoji) {
15561         foreach my $base_property (property_ref('GCB'),
15562                                    property_ref('WB'))
15563         {
15564             my $property = property_ref('_Perl_' . $base_property->name);
15565             foreach my $range ($pictographic_emoji->table('Y')->ranges) {
15566                 foreach my $i ($range->start .. $range->end) {
15567                     my $current = $property->value_of($i);
15568                     $current = $property->table($current)->short_name;
15569                     $property->add_map($i, $i, 'XPG_' . $current,
15570                                        Replace => $UNCONDITIONALLY);
15571                 }
15572             }
15573         }
15574     }
15575
15576     # Create a version of the LineBreak property with the mappings that are
15577     # omitted in the default algorithm remapped to what
15578     # http://www.unicode.org/reports/tr14 says they should be.
15579     #
15580     # Original     Resolved  General_Category
15581     # AI, SG, XX      AL      Any
15582     # SA              CM      Only Mn or Mc
15583     # SA              AL      Any except Mn and Mc
15584     # CJ              NS      Any
15585     #
15586     # All property values are also written out in their long form, as
15587     # regen/mk_invlist.pl expects that.  This also fixes occurrences of the
15588     # typo in early Unicode versions: 'inseperable'.
15589     my $perl_lb = property_ref('_Perl_LB');
15590     if (! defined $perl_lb) {
15591         $perl_lb = Property->new('_Perl_LB',
15592                                  Fate => $INTERNAL_ONLY,
15593                                  Perl_Extension => 1,
15594                                  Directory => $map_directory,
15595                                  Type => $STRING);
15596         my $lb = property_ref('Line_Break');
15597
15598         # Populate from $lb, but use full name and fix typo.
15599         foreach my $range ($lb->ranges) {
15600             my $full_name = $lb->table($range->value)->full_name;
15601             $full_name = 'Inseparable'
15602                                 if standardize($full_name) eq 'inseperable';
15603             $perl_lb->add_map($range->start, $range->end, $full_name);
15604         }
15605     }
15606
15607     $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15608
15609     for my $range ($perl_lb->ranges) {
15610         my $value = standardize($range->value);
15611         if (   $value eq standardize('Unknown')
15612             || $value eq standardize('Ambiguous')
15613             || $value eq standardize('Surrogate'))
15614         {
15615             $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15616                               Replace => $UNCONDITIONALLY);
15617         }
15618         elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15619             $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15620                               Replace => $UNCONDITIONALLY);
15621         }
15622         elsif ($value eq standardize('Complex_Context')) {
15623             for my $i ($range->start .. $range->end) {
15624                 my $gc_val = $gc->value_of($i);
15625                 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15626                     $perl_lb->add_map($i, $i, 'Combining_Mark',
15627                                       Replace => $UNCONDITIONALLY);
15628                 }
15629                 else {
15630                     $perl_lb->add_map($i, $i, 'Alphabetic',
15631                                       Replace => $UNCONDITIONALLY);
15632                 }
15633             }
15634         }
15635     }
15636
15637     # This property is a modification of the scx property
15638     my $perl_scx = Property->new('_Perl_SCX',
15639                                  Fate => $INTERNAL_ONLY,
15640                                  Perl_Extension => 1,
15641                                  Directory => $map_directory,
15642                                  Type => $ENUM);
15643     my $source;
15644
15645     # Use scx if available; otherwise sc;  if neither is there (a very old
15646     # Unicode version, just say that everything is 'Common'
15647     if (defined $scx) {
15648         $source = $scx;
15649         $perl_scx->set_default_map('Unknown');
15650     }
15651     elsif (defined $script) {
15652         $source = $script;
15653
15654         # Early versions of 'sc', had everything be 'Common'
15655         if (defined $script->table('Unknown')) {
15656             $perl_scx->set_default_map('Unknown');
15657         }
15658         else {
15659             $perl_scx->set_default_map('Common');
15660         }
15661     } else {
15662         $perl_scx->add_match_table('Common');
15663         $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15664
15665         $perl_scx->add_match_table('Unknown');
15666         $perl_scx->set_default_map('Unknown');
15667     }
15668
15669     $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15670     $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15671
15672     if (defined $source) {
15673         $perl_scx->initialize($source);
15674
15675         # UTS 39 says that the scx property should be modified for these
15676         # countries where certain mixed scripts are commonly used.
15677         for my $range ($perl_scx->ranges) {
15678             my $value = $range->value;
15679             my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15680              $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15681              $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15682              $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15683                                      {$1 Katakana Hiragana Jpan}xi;
15684              $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15685              $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15686
15687             if ($changed) {
15688                 $value = join " ", uniques split " ", $value;
15689                 $range->set_value($value)
15690             }
15691         }
15692
15693         foreach my $table ($source->tables) {
15694             my $scx_table = $perl_scx->add_match_table($table->name,
15695                                     Full_Name => $table->full_name);
15696             foreach my $alias ($table->aliases) {
15697                 $scx_table->add_alias($alias->name);
15698             }
15699         }
15700     }
15701
15702     # Here done with all the basic stuff.  Ready to populate the information
15703     # about each character if annotating them.
15704     if ($annotate) {
15705
15706         # See comments at its declaration
15707         $annotate_ranges = Range_Map->new;
15708
15709         # This separates out the non-characters from the other unassigneds, so
15710         # can give different annotations for each.
15711         $unassigned_sans_noncharacters = Range_List->new(
15712                                     Initialize => $gc->table('Unassigned'));
15713         $unassigned_sans_noncharacters &= (~ $NChar);
15714
15715         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15716             $i = populate_char_info($i);    # Note sets $i so may cause skips
15717
15718         }
15719     }
15720
15721     return;
15722 }
15723
15724 sub add_perl_synonyms() {
15725     # A number of Unicode tables have Perl synonyms that are expressed in
15726     # the single-form, \p{name}.  These are:
15727     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15728     #       \p{Is_Name} as synonyms
15729     #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15730     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15731     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15732     #       conflict, \p{Value} and \p{Is_Value} as well
15733     #
15734     # This routine generates these synonyms, warning of any unexpected
15735     # conflicts.
15736
15737     # Construct the list of tables to get synonyms for.  Start with all the
15738     # binary and the General_Category ones.
15739     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15740                                                             property_ref('*');
15741     push @tables, $gc->tables;
15742
15743     # If the version of Unicode includes the Script Extensions (preferably),
15744     # or Script property, add its tables
15745     if (defined $scx) {
15746         push @tables, $scx->tables;
15747     }
15748     else {
15749         push @tables, $script->tables if defined $script;
15750     }
15751
15752     # The Block tables are kept separate because they are treated differently.
15753     # And the earliest versions of Unicode didn't include them, so add only if
15754     # there are some.
15755     my @blocks;
15756     push @blocks, $block->tables if defined $block;
15757
15758     # Here, have the lists of tables constructed.  Process blocks last so that
15759     # if there are name collisions with them, blocks have lowest priority.
15760     # Should there ever be other collisions, manual intervention would be
15761     # required.  See the comments at the beginning of the program for a
15762     # possible way to handle those semi-automatically.
15763     foreach my $table (@tables,  @blocks) {
15764
15765         # For non-binary properties, the synonym is just the name of the
15766         # table, like Greek, but for binary properties the synonym is the name
15767         # of the property, and means the code points in its 'Y' table.
15768         my $nominal = $table;
15769         my $nominal_property = $nominal->property;
15770         my $actual;
15771         if (! $nominal->isa('Property')) {
15772             $actual = $table;
15773         }
15774         else {
15775
15776             # Here is a binary property.  Use the 'Y' table.  Verify that is
15777             # there
15778             my $yes = $nominal->table('Y');
15779             unless (defined $yes) {  # Must be defined, but is permissible to
15780                                      # be empty.
15781                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15782                 next;
15783             }
15784             $actual = $yes;
15785         }
15786
15787         foreach my $alias ($nominal->aliases) {
15788
15789             # Attempt to create a table in the perl directory for the
15790             # candidate table, using whatever aliases in it that don't
15791             # conflict.  Also add non-conflicting aliases for all these
15792             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15793             PREFIX:
15794             foreach my $prefix ("", 'Is_', 'In_') {
15795
15796                 # Only Block properties can have added 'In_' aliases.
15797                 next if $prefix eq 'In_' and $nominal_property != $block;
15798
15799                 my $proposed_name = $prefix . $alias->name;
15800
15801                 # No Is_Is, In_In, nor combinations thereof
15802                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15803                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15804
15805                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15806
15807                 # Get a reference to any existing table in the perl
15808                 # directory with the desired name.
15809                 my $pre_existing = $perl->table($proposed_name);
15810
15811                 if (! defined $pre_existing) {
15812
15813                     # No name collision, so OK to add the perl synonym.
15814
15815                     my $make_re_pod_entry;
15816                     my $ok_as_filename;
15817                     my $status = $alias->status;
15818                     if ($nominal_property == $block) {
15819
15820                         # For block properties, only the compound form is
15821                         # preferred for external use; the others are
15822                         # discouraged.  The pod file contains wild cards for
15823                         # the 'In' and 'Is' forms so no entries for those; and
15824                         # we don't want people using the name without any
15825                         # prefix, so discourage that.
15826                         if ($prefix eq "") {
15827                             $make_re_pod_entry = 1;
15828                             $status = $status || $DISCOURAGED;
15829                             $ok_as_filename = 0;
15830                         }
15831                         elsif ($prefix eq 'In_') {
15832                             $make_re_pod_entry = 0;
15833                             $status = $status || $DISCOURAGED;
15834                             $ok_as_filename = 1;
15835                         }
15836                         else {
15837                             $make_re_pod_entry = 0;
15838                             $status = $status || $DISCOURAGED;
15839                             $ok_as_filename = 0;
15840                         }
15841                     }
15842                     elsif ($prefix ne "") {
15843
15844                         # The 'Is' prefix is handled in the pod by a wild
15845                         # card, and we won't use it for an external name
15846                         $make_re_pod_entry = 0;
15847                         $status = $status || $NORMAL;
15848                         $ok_as_filename = 0;
15849                     }
15850                     else {
15851
15852                         # Here, is an empty prefix, non block.  This gets its
15853                         # own pod entry and can be used for an external name.
15854                         $make_re_pod_entry = 1;
15855                         $status = $status || $NORMAL;
15856                         $ok_as_filename = 1;
15857                     }
15858
15859                     # Here, there isn't a perl pre-existing table with the
15860                     # name.  Look through the list of equivalents of this
15861                     # table to see if one is a perl table.
15862                     foreach my $equivalent ($actual->leader->equivalents) {
15863                         next if $equivalent->property != $perl;
15864
15865                         # Here, have found a table for $perl.  Add this alias
15866                         # to it, and are done with this prefix.
15867                         $equivalent->add_alias($proposed_name,
15868                                         Re_Pod_Entry => $make_re_pod_entry,
15869
15870                                         # Currently don't output these in the
15871                                         # ucd pod, as are strongly discouraged
15872                                         # from being used
15873                                         UCD => 0,
15874
15875                                         Status => $status,
15876                                         OK_as_Filename => $ok_as_filename);
15877                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15878                         next PREFIX;
15879                     }
15880
15881                     # Here, $perl doesn't already have a table that is a
15882                     # synonym for this property, add one.
15883                     my $added_table = $perl->add_match_table($proposed_name,
15884                                             Re_Pod_Entry => $make_re_pod_entry,
15885
15886                                             # See UCD comment just above
15887                                             UCD => 0,
15888
15889                                             Status => $status,
15890                                             OK_as_Filename => $ok_as_filename);
15891                     # And it will be related to the actual table, since it is
15892                     # based on it.
15893                     $added_table->set_equivalent_to($actual, Related => 1);
15894                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15895                     next;
15896                 } # End of no pre-existing.
15897
15898                 # Here, there is a pre-existing table that has the proposed
15899                 # name.  We could be in trouble, but not if this is just a
15900                 # synonym for another table that we have already made a child
15901                 # of the pre-existing one.
15902                 if ($pre_existing->is_set_equivalent_to($actual)) {
15903                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15904                     $pre_existing->add_alias($proposed_name);
15905                     next;
15906                 }
15907
15908                 # Here, there is a name collision, but it still could be OK if
15909                 # the tables match the identical set of code points, in which
15910                 # case, we can combine the names.  Compare each table's code
15911                 # point list to see if they are identical.
15912                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15913                 if ($pre_existing->matches_identically_to($actual)) {
15914
15915                     # Here, they do match identically.  Not a real conflict.
15916                     # Make the perl version a child of the Unicode one, except
15917                     # in the non-obvious case of where the perl name is
15918                     # already a synonym of another Unicode property.  (This is
15919                     # excluded by the test for it being its own parent.)  The
15920                     # reason for this exclusion is that then the two Unicode
15921                     # properties become related; and we don't really know if
15922                     # they are or not.  We generate documentation based on
15923                     # relatedness, and this would be misleading.  Code
15924                     # later executed in the process will cause the tables to
15925                     # be represented by a single file anyway, without making
15926                     # it look in the pod like they are necessarily related.
15927                     if ($pre_existing->parent == $pre_existing
15928                         && ($pre_existing->property == $perl
15929                             || $actual->property == $perl))
15930                     {
15931                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15932                         $pre_existing->set_equivalent_to($actual, Related => 1);
15933                     }
15934                     elsif (main::DEBUG && $to_trace) {
15935                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15936                         trace $pre_existing->parent;
15937                     }
15938                     next PREFIX;
15939                 }
15940
15941                 # Here they didn't match identically, there is a real conflict
15942                 # between our new name and a pre-existing property.
15943                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15944                 $pre_existing->add_conflicting($nominal->full_name,
15945                                                'p',
15946                                                $actual);
15947
15948                 # Don't output a warning for aliases for the block
15949                 # properties (unless they start with 'In_') as it is
15950                 # expected that there will be conflicts and the block
15951                 # form loses.
15952                 if ($verbosity >= $NORMAL_VERBOSITY
15953                     && ($actual->property != $block || $prefix eq 'In_'))
15954                 {
15955                     print simple_fold(join_lines(<<END
15956 There is already an alias named $proposed_name (from $pre_existing),
15957 so not creating this alias for $actual
15958 END
15959                     ), "", 4);
15960                 }
15961
15962                 # Keep track for documentation purposes.
15963                 $has_In_conflicts++ if $prefix eq 'In_';
15964                 $has_Is_conflicts++ if $prefix eq 'Is_';
15965             }
15966         }
15967     }
15968
15969     # There are some properties which have No and Yes (and N and Y) as
15970     # property values, but aren't binary, and could possibly be confused with
15971     # binary ones.  So create caveats for them.  There are tables that are
15972     # named 'No', and tables that are named 'N', but confusion is not likely
15973     # unless they are the same table.  For example, N meaning Number or
15974     # Neutral is not likely to cause confusion, so don't add caveats to things
15975     # like them.
15976     foreach my $property (grep { $_->type != $BINARY
15977                                  && $_->type != $FORCED_BINARY }
15978                                                             property_ref('*'))
15979     {
15980         my $yes = $property->table('Yes');
15981         if (defined $yes) {
15982             my $y = $property->table('Y');
15983             if (defined $y && $yes == $y) {
15984                 foreach my $alias ($property->aliases) {
15985                     $yes->add_conflicting($alias->name);
15986                 }
15987             }
15988         }
15989         my $no = $property->table('No');
15990         if (defined $no) {
15991             my $n = $property->table('N');
15992             if (defined $n && $no == $n) {
15993                 foreach my $alias ($property->aliases) {
15994                     $no->add_conflicting($alias->name, 'P');
15995                 }
15996             }
15997         }
15998     }
15999
16000     return;
16001 }
16002
16003 sub register_file_for_name($$$) {
16004     # Given info about a table and a datafile that it should be associated
16005     # with, register that association
16006
16007     my $table = shift;
16008     my $directory_ref = shift;   # Array of the directory path for the file
16009     my $file = shift;            # The file name in the final directory.
16010     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16011
16012     trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
16013
16014     if ($table->isa('Property')) {
16015         $table->set_file_path(@$directory_ref, $file);
16016         push @map_properties, $table;
16017
16018         # No swash means don't do the rest of this.
16019         return if $table->fate != $ORDINARY
16020                   && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
16021
16022         # Get the path to the file
16023         my @path = $table->file_path;
16024
16025         # Use just the file name if no subdirectory.
16026         shift @path if $path[0] eq File::Spec->curdir();
16027
16028         my $file = join '/', @path;
16029
16030         # Create a hash entry for Unicode::UCD to get the file that stores this
16031         # property's map table
16032         foreach my $alias ($table->aliases) {
16033             my $name = $alias->name;
16034             if ($name =~ /^_/) {
16035                 $strict_property_to_file_of{lc $name} = $file;
16036             }
16037             else {
16038                 $loose_property_to_file_of{standardize($name)} = $file;
16039             }
16040         }
16041
16042         # And a way for Unicode::UCD to find the proper key in the SwashInfo
16043         # hash for this property.
16044         $file_to_swash_name{$file} = "To" . $table->swash_name;
16045         return;
16046     }
16047
16048     # Do all of the work for all equivalent tables when called with the leader
16049     # table, so skip if isn't the leader.
16050     return if $table->leader != $table;
16051
16052     # If this is a complement of another file, use that other file instead,
16053     # with a ! prepended to it.
16054     my $complement;
16055     if (($complement = $table->complement) != 0) {
16056         my @directories = $complement->file_path;
16057
16058         # This assumes that the 0th element is something like 'lib',
16059         # the 1th element the property name (in its own directory), like
16060         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
16061         # appended to it later.
16062         $directories[1] =~ s/^/!/;
16063         $file = pop @directories;
16064         $directory_ref =\@directories;
16065     }
16066
16067     # Join all the file path components together, using slashes.
16068     my $full_filename = join('/', @$directory_ref, $file);
16069
16070     # All go in the same subdirectory of unicore, or the special
16071     # pseudo-directory '#'
16072     if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
16073         Carp::my_carp("Unexpected directory in "
16074                 .  join('/', @{$directory_ref}, $file));
16075     }
16076
16077     # For this table and all its equivalents ...
16078     foreach my $table ($table, $table->equivalents) {
16079
16080         # Associate it with its file internally.  Don't include the
16081         # $matches_directory first component
16082         $table->set_file_path(@$directory_ref, $file);
16083
16084         # No swash means don't do the rest of this.
16085         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
16086
16087         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
16088
16089         my $property = $table->property;
16090         my $property_name = ($property == $perl)
16091                              ? ""  # 'perl' is never explicitly stated
16092                              : standardize($property->name) . '=';
16093
16094         my $is_default = 0; # Is this table the default one for the property?
16095
16096         # To calculate $is_default, we find if this table is the same as the
16097         # default one for the property.  But this is complicated by the
16098         # possibility that there is a master table for this one, and the
16099         # information is stored there instead of here.
16100         my $parent = $table->parent;
16101         my $leader_prop = $parent->property;
16102         my $default_map = $leader_prop->default_map;
16103         if (defined $default_map) {
16104             my $default_table = $leader_prop->table($default_map);
16105             $is_default = 1 if defined $default_table && $parent == $default_table;
16106         }
16107
16108         # Calculate the loose name for this table.  Mostly it's just its name,
16109         # standardized.  But in the case of Perl tables that are single-form
16110         # equivalents to Unicode properties, it is the latter's name.
16111         my $loose_table_name =
16112                         ($property != $perl || $leader_prop == $perl)
16113                         ? standardize($table->name)
16114                         : standardize($parent->name);
16115
16116         my $deprecated = ($table->status eq $DEPRECATED)
16117                          ? $table->status_info
16118                          : "";
16119         my $caseless_equivalent = $table->caseless_equivalent;
16120
16121         # And for each of the table's aliases...  This inner loop eventually
16122         # goes through all aliases in the UCD that we generate regex match
16123         # files for
16124         foreach my $alias ($table->aliases) {
16125             my $standard = UCD_name($table, $alias);
16126
16127             # Generate an entry in either the loose or strict hashes, which
16128             # will translate the property and alias names combination into the
16129             # file where the table for them is stored.
16130             if ($alias->loose_match) {
16131                 if (exists $loose_to_file_of{$standard}) {
16132                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
16133                 }
16134                 else {
16135                     $loose_to_file_of{$standard} = $sub_filename;
16136                 }
16137             }
16138             else {
16139                 if (exists $stricter_to_file_of{$standard}) {
16140                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
16141                 }
16142                 else {
16143                     $stricter_to_file_of{$standard} = $sub_filename;
16144
16145                     # Tightly coupled with how Unicode::UCD works, for a
16146                     # floating point number that is a whole number, get rid of
16147                     # the trailing decimal point and 0's, so that Unicode::UCD
16148                     # will work.  Also note that this assumes that such a
16149                     # number is matched strictly; so if that were to change,
16150                     # this would be wrong.
16151                     if ((my $integer_name = $alias->name)
16152                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
16153                     {
16154                         $stricter_to_file_of{$property_name . $integer_name}
16155                                                             = $sub_filename;
16156                     }
16157                 }
16158             }
16159
16160             # For Unicode::UCD, create a mapping of the prop=value to the
16161             # canonical =value for that property.
16162             if ($standard =~ /=/) {
16163
16164                 # This could happen if a strict name mapped into an existing
16165                 # loose name.  In that event, the strict names would have to
16166                 # be moved to a new hash.
16167                 if (exists($loose_to_standard_value{$standard})) {
16168                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
16169                 }
16170                 $loose_to_standard_value{$standard} = $loose_table_name;
16171             }
16172
16173             # Keep a list of the deprecated properties and their filenames
16174             if ($deprecated && $complement == 0) {
16175                 $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated;
16176             }
16177
16178             # And a substitute table, if any, for case-insensitive matching
16179             if ($caseless_equivalent != 0) {
16180                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
16181             }
16182
16183             # Add to defaults list if the table this alias belongs to is the
16184             # default one
16185             $loose_defaults{$standard} = 1 if $is_default;
16186         }
16187     }
16188
16189     return;
16190 }
16191
16192 {   # Closure
16193     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
16194                      # conflicts
16195     my %full_dir_name_of;   # Full length names of directories used.
16196
16197     sub construct_filename($$$) {
16198         # Return a file name for a table, based on the table name, but perhaps
16199         # changed to get rid of non-portable characters in it, and to make
16200         # sure that it is unique on a file system that allows the names before
16201         # any period to be at most 8 characters (DOS).  While we're at it
16202         # check and complain if there are any directory conflicts.
16203
16204         my $name = shift;       # The name to start with
16205         my $mutable = shift;    # Boolean: can it be changed?  If no, but
16206                                 # yet it must be to work properly, a warning
16207                                 # is given
16208         my $directories_ref = shift;  # A reference to an array containing the
16209                                 # path to the file, with each element one path
16210                                 # component.  This is used because the same
16211                                 # name can be used in different directories.
16212         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16213
16214         my $warn = ! defined wantarray;  # If true, then if the name is
16215                                 # changed, a warning is issued as well.
16216
16217         if (! defined $name) {
16218             Carp::my_carp("Undefined name in directory "
16219                           . File::Spec->join(@$directories_ref)
16220                           . ". '_' used");
16221             return '_';
16222         }
16223
16224         # Make sure that no directory names conflict with each other.  Look at
16225         # each directory in the input file's path.  If it is already in use,
16226         # assume it is correct, and is merely being re-used, but if we
16227         # truncate it to 8 characters, and find that there are two directories
16228         # that are the same for the first 8 characters, but differ after that,
16229         # then that is a problem.
16230         foreach my $directory (@$directories_ref) {
16231             my $short_dir = substr($directory, 0, 8);
16232             if (defined $full_dir_name_of{$short_dir}) {
16233                 next if $full_dir_name_of{$short_dir} eq $directory;
16234                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
16235             }
16236             else {
16237                 $full_dir_name_of{$short_dir} = $directory;
16238             }
16239         }
16240
16241         my $path = join '/', @$directories_ref;
16242         $path .= '/' if $path;
16243
16244         # Remove interior underscores.
16245         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
16246
16247         # Convert the dot in floating point numbers to an underscore
16248         $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
16249
16250         my $suffix = "";
16251
16252         # Extract any suffix, delete any non-word character, and truncate to 3
16253         # after the dot
16254         if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
16255             $filename = $1;
16256             $suffix = $2;
16257             $suffix =~ s/\W+//g;
16258             substr($suffix, 4) = "" if length($suffix) > 4;
16259         }
16260
16261         # Change any non-word character outside the suffix into an underscore,
16262         # and truncate to 8.
16263         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
16264         substr($filename, 8) = "" if length($filename) > 8;
16265
16266         # Make sure the basename doesn't conflict with something we
16267         # might have already written. If we have, say,
16268         #     InGreekExtended1
16269         #     InGreekExtended2
16270         # they become
16271         #     InGreekE
16272         #     InGreek2
16273         my $warned = 0;
16274         while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
16275             $num++; # so basenames with numbers start with '2', which
16276                     # just looks more natural.
16277
16278             # Want to append $num, but if it'll make the basename longer
16279             # than 8 characters, pre-truncate $filename so that the result
16280             # is acceptable.
16281             my $delta = length($filename) + length($num) - 8;
16282             if ($delta > 0) {
16283                 substr($filename, -$delta) = $num;
16284             }
16285             else {
16286                 $filename .= $num;
16287             }
16288             if ($warn && ! $warned) {
16289                 $warned = 1;
16290                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
16291             }
16292         }
16293
16294         return $filename if $mutable;
16295
16296         # If not changeable, must return the input name, but warn if needed to
16297         # change it beyond shortening it.
16298         if ($name ne $filename
16299             && substr($name, 0, length($filename)) ne $filename) {
16300             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
16301         }
16302         return $name;
16303     }
16304 }
16305
16306 # The pod file contains a very large table.  Many of the lines in that table
16307 # would exceed a typical output window's size, and so need to be wrapped with
16308 # a hanging indent to make them look good.  The pod language is really
16309 # insufficient here.  There is no general construct to do that in pod, so it
16310 # is done here by beginning each such line with a space to cause the result to
16311 # be output without formatting, and doing all the formatting here.  This leads
16312 # to the result that if the eventual display window is too narrow it won't
16313 # look good, and if the window is too wide, no advantage is taken of that
16314 # extra width.  A further complication is that the output may be indented by
16315 # the formatter so that there is less space than expected.  What I (khw) have
16316 # done is to assume that that indent is a particular number of spaces based on
16317 # what it is in my Linux system;  people can always resize their windows if
16318 # necessary, but this is obviously less than desirable, but the best that can
16319 # be expected.
16320 my $automatic_pod_indent = 8;
16321
16322 # Try to format so that uses fewest lines, but few long left column entries
16323 # slide into the right column.  An experiment on 5.1 data yielded the
16324 # following percentages that didn't cut into the other side along with the
16325 # associated first-column widths
16326 # 69% = 24
16327 # 80% not too bad except for a few blocks
16328 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16329 # 95% = 37;
16330 my $indent_info_column = 27;    # 75% of lines didn't have overlap
16331
16332 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
16333                     # The 3 is because of:
16334                     #   1   for the leading space to tell the pod formatter to
16335                     #       output as-is
16336                     #   1   for the flag
16337                     #   1   for the space between the flag and the main data
16338
16339 sub format_pod_line ($$$;$$) {
16340     # Take a pod line and return it, formatted properly
16341
16342     my $first_column_width = shift;
16343     my $entry = shift;  # Contents of left column
16344     my $info = shift;   # Contents of right column
16345
16346     my $status = shift || "";   # Any flag
16347
16348     my $loose_match = shift;    # Boolean.
16349     $loose_match = 1 unless defined $loose_match;
16350
16351     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16352
16353     my $flags = "";
16354     $flags .= $STRICTER if ! $loose_match;
16355
16356     $flags .= $status if $status;
16357
16358     # There is a blank in the left column to cause the pod formatter to
16359     # output the line as-is.
16360     return sprintf " %-*s%-*s %s\n",
16361                     # The first * in the format is replaced by this, the -1 is
16362                     # to account for the leading blank.  There isn't a
16363                     # hard-coded blank after this to separate the flags from
16364                     # the rest of the line, so that in the unlikely event that
16365                     # multiple flags are shown on the same line, they both
16366                     # will get displayed at the expense of that separation,
16367                     # but since they are left justified, a blank will be
16368                     # inserted in the normal case.
16369                     $FILLER - 1,
16370                     $flags,
16371
16372                     # The other * in the format is replaced by this number to
16373                     # cause the first main column to right fill with blanks.
16374                     # The -1 is for the guaranteed blank following it.
16375                     $first_column_width - $FILLER - 1,
16376                     $entry,
16377                     $info;
16378 }
16379
16380 my @zero_match_tables;  # List of tables that have no matches in this release
16381
16382 sub make_re_pod_entries($) {
16383     # This generates the entries for the pod file for a given table.
16384     # Also done at this time are any children tables.  The output looks like:
16385     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
16386
16387     my $input_table = shift;        # Table the entry is for
16388     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16389
16390     # Generate parent and all its children at the same time.
16391     return if $input_table->parent != $input_table;
16392
16393     my $property = $input_table->property;
16394     my $type = $property->type;
16395     my $full_name = $property->full_name;
16396
16397     my $count = $input_table->count;
16398     my $unicode_count;
16399     my $non_unicode_string;
16400     if ($count > $MAX_UNICODE_CODEPOINTS) {
16401         $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16402                                     - $MAX_UNICODE_CODEPOINT);
16403         $non_unicode_string = " plus all above-Unicode code points";
16404     }
16405     else {
16406         $unicode_count = $count;
16407         $non_unicode_string = "";
16408     }
16409
16410     my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16411
16412     my $definition = $input_table->calculate_table_definition;
16413     if ($definition) {
16414
16415         # Save the definition for later use.
16416         $input_table->set_definition($definition);
16417
16418         $definition = ": $definition";
16419     }
16420
16421     my $status = $input_table->status;
16422     my $status_info = $input_table->status_info;
16423     my $caseless_equivalent = $input_table->caseless_equivalent;
16424
16425     # Don't mention a placeholder equivalent as it isn't to be listed in the
16426     # pod
16427     $caseless_equivalent = 0 if $caseless_equivalent != 0
16428                                 && $caseless_equivalent->fate > $ORDINARY;
16429
16430     my $entry_for_first_table; # The entry for the first table output.
16431                            # Almost certainly, it is the parent.
16432
16433     # For each related table (including itself), we will generate a pod entry
16434     # for each name each table goes by
16435     foreach my $table ($input_table, $input_table->children) {
16436
16437         # Unicode::UCD cannot deal with null string property values, so skip
16438         # any tables that have no non-null names.
16439         next if ! grep { $_->name ne "" } $table->aliases;
16440
16441         # First, gather all the info that applies to this table as a whole.
16442
16443         push @zero_match_tables, $table if $count == 0
16444                                             # Don't mention special tables
16445                                             # as being zero length
16446                                            && $table->fate == $ORDINARY;
16447
16448         my $table_property = $table->property;
16449
16450         # The short name has all the underscores removed, while the full name
16451         # retains them.  Later, we decide whether to output a short synonym
16452         # for the full one, we need to compare apples to apples, so we use the
16453         # short name's length including underscores.
16454         my $table_property_short_name_length;
16455         my $table_property_short_name
16456             = $table_property->short_name(\$table_property_short_name_length);
16457         my $table_property_full_name = $table_property->full_name;
16458
16459         # Get how much savings there is in the short name over the full one
16460         # (delta will always be <= 0)
16461         my $table_property_short_delta = $table_property_short_name_length
16462                                          - length($table_property_full_name);
16463         my @table_description = $table->description;
16464         my @table_note = $table->note;
16465
16466         # Generate an entry for each alias in this table.
16467         my $entry_for_first_alias;  # saves the first one encountered.
16468         foreach my $alias ($table->aliases) {
16469
16470             # Skip if not to go in pod.
16471             next unless $alias->make_re_pod_entry;
16472
16473             # Start gathering all the components for the entry
16474             my $name = $alias->name;
16475
16476             # Skip if name is empty, as can't be accessed by regexes.
16477             next if $name eq "";
16478
16479             my $entry;      # Holds the left column, may include extras
16480             my $entry_ref;  # To refer to the left column's contents from
16481                             # another entry; has no extras
16482
16483             # First the left column of the pod entry.  Tables for the $perl
16484             # property always use the single form.
16485             if ($table_property == $perl) {
16486                 $entry = "\\p{$name}";
16487                 $entry .= " \\p$name" if length $name == 1; # Show non-braced
16488                                                             # form too
16489                 $entry_ref = "\\p{$name}";
16490             }
16491             else {    # Compound form.
16492
16493                 # Only generate one entry for all the aliases that mean true
16494                 # or false in binary properties.  Append a '*' to indicate
16495                 # some are missing.  (The heading comment notes this.)
16496                 my $rhs;
16497                 if ($type == $BINARY) {
16498                     next if $name ne 'N' && $name ne 'Y';
16499                     $rhs = "$name*";
16500                 }
16501                 elsif ($type != $FORCED_BINARY) {
16502                     $rhs = $name;
16503                 }
16504                 else {
16505
16506                     # Forced binary properties require special handling.  It
16507                     # has two sets of tables, one set is true/false; and the
16508                     # other set is everything else.  Entries are generated for
16509                     # each set.  Use the Bidi_Mirrored property (which appears
16510                     # in all Unicode versions) to get a list of the aliases
16511                     # for the true/false tables.  Of these, only output the N
16512                     # and Y ones, the same as, a regular binary property.  And
16513                     # output all the rest, same as a non-binary property.
16514                     my $bm = property_ref("Bidi_Mirrored");
16515                     if ($name eq 'N' || $name eq 'Y') {
16516                         $rhs = "$name*";
16517                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16518                                                         $bm->table("N")->aliases)
16519                     {
16520                         next;
16521                     }
16522                     else {
16523                         $rhs = $name;
16524                     }
16525                 }
16526
16527                 # Colon-space is used to give a little more space to be easier
16528                 # to read;
16529                 $entry = "\\p{"
16530                         . $table_property_full_name
16531                         . ": $rhs}";
16532
16533                 # But for the reference to this entry, which will go in the
16534                 # right column, where space is at a premium, use equals
16535                 # without a space
16536                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16537             }
16538
16539             # Then the right (info) column.  This is stored as components of
16540             # an array for the moment, then joined into a string later.  For
16541             # non-internal only properties, begin the info with the entry for
16542             # the first table we encountered (if any), as things are ordered
16543             # so that that one is the most descriptive.  This leads to the
16544             # info column of an entry being a more descriptive version of the
16545             # name column
16546             my @info;
16547             if ($name =~ /^_/) {
16548                 push @info,
16549                         '(For internal use by Perl, not necessarily stable)';
16550             }
16551             elsif ($entry_for_first_alias) {
16552                 push @info, $entry_for_first_alias;
16553             }
16554
16555             # If this entry is equivalent to another, add that to the info,
16556             # using the first such table we encountered
16557             if ($entry_for_first_table) {
16558                 if (@info) {
16559                     push @info, "(= $entry_for_first_table)";
16560                 }
16561                 else {
16562                     push @info, $entry_for_first_table;
16563                 }
16564             }
16565
16566             # If the name is a large integer, add an equivalent with an
16567             # exponent for better readability
16568             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16569                 push @info, sprintf "(= %.1e)", $name
16570             }
16571
16572             my $parenthesized = "";
16573             if (! $entry_for_first_alias) {
16574
16575                 # This is the first alias for the current table.  The alias
16576                 # array is ordered so that this is the fullest, most
16577                 # descriptive alias, so it gets the fullest info.  The other
16578                 # aliases are mostly merely pointers to this one, using the
16579                 # information already added above.
16580
16581                 # Display any status message, but only on the parent table
16582                 if ($status && ! $entry_for_first_table) {
16583                     push @info, $status_info;
16584                 }
16585
16586                 # Put out any descriptive info
16587                 if (@table_description || @table_note) {
16588                     push @info, join "; ", @table_description, @table_note;
16589                 }
16590
16591                 # Look to see if there is a shorter name we can point people
16592                 # at
16593                 my $standard_name = standardize($name);
16594                 my $short_name;
16595                 my $proposed_short = $table->short_name;
16596                 if (defined $proposed_short) {
16597                     my $standard_short = standardize($proposed_short);
16598
16599                     # If the short name is shorter than the standard one, or
16600                     # even if it's not, but the combination of it and its
16601                     # short property name (as in \p{prop=short} ($perl doesn't
16602                     # have this form)) saves at least two characters, then,
16603                     # cause it to be listed as a shorter synonym.
16604                     if (length $standard_short < length $standard_name
16605                         || ($table_property != $perl
16606                             && (length($standard_short)
16607                                 - length($standard_name)
16608                                 + $table_property_short_delta)  # (<= 0)
16609                                 < -2))
16610                     {
16611                         $short_name = $proposed_short;
16612                         if ($table_property != $perl) {
16613                             $short_name = $table_property_short_name
16614                                           . "=$short_name";
16615                         }
16616                         $short_name = "\\p{$short_name}";
16617                     }
16618                 }
16619
16620                 # And if this is a compound form name, see if there is a
16621                 # single form equivalent
16622                 my $single_form;
16623                 if ($table_property != $perl && $table_property != $block) {
16624
16625                     # Special case the binary N tables, so that will print
16626                     # \P{single}, but use the Y table values to populate
16627                     # 'single', as we haven't likewise populated the N table.
16628                     # For forced binary tables, we can't just look at the N
16629                     # table, but must see if this table is equivalent to the N
16630                     # one, as there are two equivalent beasts in these
16631                     # properties.
16632                     my $test_table;
16633                     my $p;
16634                     if (   ($type == $BINARY
16635                             && $input_table == $property->table('No'))
16636                         || ($type == $FORCED_BINARY
16637                             && $property->table('No')->
16638                                         is_set_equivalent_to($input_table)))
16639                     {
16640                         $test_table = $property->table('Yes');
16641                         $p = 'P';
16642                     }
16643                     else {
16644                         $test_table = $input_table;
16645                         $p = 'p';
16646                     }
16647
16648                     # Look for a single form amongst all the children.
16649                     foreach my $table ($test_table->children) {
16650                         next if $table->property != $perl;
16651                         my $proposed_name = $table->short_name;
16652                         next if ! defined $proposed_name;
16653
16654                         # Don't mention internal-only properties as a possible
16655                         # single form synonym
16656                         next if substr($proposed_name, 0, 1) eq '_';
16657
16658                         $proposed_name = "\\$p\{$proposed_name}";
16659                         if (! defined $single_form
16660                             || length($proposed_name) < length $single_form)
16661                         {
16662                             $single_form = $proposed_name;
16663
16664                             # The goal here is to find a single form; not the
16665                             # shortest possible one.  We've already found a
16666                             # short name.  So, stop at the first single form
16667                             # found, which is likely to be closer to the
16668                             # original.
16669                             last;
16670                         }
16671                     }
16672                 }
16673
16674                 # Output both short and single in the same parenthesized
16675                 # expression, but with only one of 'Single', 'Short' if there
16676                 # are both items.
16677                 if ($short_name || $single_form || $table->conflicting) {
16678                     $parenthesized .= "Short: $short_name" if $short_name;
16679                     if ($short_name && $single_form) {
16680                         $parenthesized .= ', ';
16681                     }
16682                     elsif ($single_form) {
16683                         $parenthesized .= 'Single: ';
16684                     }
16685                     $parenthesized .= $single_form if $single_form;
16686                 }
16687             }
16688
16689             if ($caseless_equivalent != 0) {
16690                 $parenthesized .=  '; ' if $parenthesized ne "";
16691                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16692             }
16693
16694
16695             # Warn if this property isn't the same as one that a
16696             # semi-casual user might expect.  The other components of this
16697             # parenthesized structure are calculated only for the first entry
16698             # for this table, but the conflicting is deemed important enough
16699             # to go on every entry.
16700             my $conflicting = join " NOR ", $table->conflicting;
16701             if ($conflicting) {
16702                 $parenthesized .=  '; ' if $parenthesized ne "";
16703                 $parenthesized .= "NOT $conflicting";
16704             }
16705
16706             push @info, "($parenthesized)" if $parenthesized;
16707
16708             if ($name =~ /_$/ && $alias->loose_match) {
16709                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16710             }
16711
16712             if ($table_property != $perl && $table->perl_extension) {
16713                 push @info, '(Perl extension)';
16714             }
16715             my $definition = $table->definition // "";
16716             $definition = "" if $entry_for_first_alias;
16717             $definition = ": $definition" if $definition;
16718             push @info, "($string_count$definition)";
16719
16720             # Now, we have both the entry and info so add them to the
16721             # list of all the properties.
16722             push @match_properties,
16723                 format_pod_line($indent_info_column,
16724                                 $entry,
16725                                 join( " ", @info),
16726                                 $alias->status,
16727                                 $alias->loose_match);
16728
16729             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16730         } # End of looping through the aliases for this table.
16731
16732         if (! $entry_for_first_table) {
16733             $entry_for_first_table = $entry_for_first_alias;
16734         }
16735     } # End of looping through all the related tables
16736     return;
16737 }
16738
16739 sub make_ucd_table_pod_entries {
16740     my $table = shift;
16741
16742     # Generate the entries for the UCD section of the pod for $table.  This
16743     # also calculates if names are ambiguous, so has to be called even if the
16744     # pod is not being output
16745
16746     my $short_name = $table->name;
16747     my $standard_short_name = standardize($short_name);
16748     my $full_name = $table->full_name;
16749     my $standard_full_name = standardize($full_name);
16750
16751     my $full_info = "";     # Text of info column for full-name entries
16752     my $other_info = "";    # Text of info column for short-name entries
16753     my $short_info = "";    # Text of info column for other entries
16754     my $meaning = "";       # Synonym of this table
16755
16756     my $property = ($table->isa('Property'))
16757                    ? $table
16758                    : $table->parent->property;
16759
16760     my $perl_extension = $table->perl_extension;
16761     my $is_perl_extension_match_table_but_not_dollar_perl
16762                                                         = $property != $perl
16763                                                        && $perl_extension
16764                                                        && $property != $table;
16765
16766     # Get the more official name for for perl extensions that aren't
16767     # stand-alone properties
16768     if ($is_perl_extension_match_table_but_not_dollar_perl) {
16769         if ($property->type == $BINARY) {
16770             $meaning = $property->full_name;
16771         }
16772         else {
16773             $meaning = $table->parent->complete_name;
16774         }
16775     }
16776
16777     # There are three types of info column.  One for the short name, one for
16778     # the full name, and one for everything else.  They mostly are the same,
16779     # so initialize in the same loop.
16780
16781     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16782         if ($info_ref != \$full_info) {
16783
16784             # The non-full name columns include the full name
16785             $$info_ref .= $full_name;
16786         }
16787
16788
16789         if ($is_perl_extension_match_table_but_not_dollar_perl) {
16790
16791             # Add the synonymous name for the non-full name entries; and to
16792             # the full-name entry if it adds extra information
16793             if (   standardize($meaning) ne $standard_full_name
16794                 || $info_ref == \$other_info
16795                 || $info_ref == \$short_info)
16796             {
16797                 my $parenthesized =  $info_ref != \$full_info;
16798                 $$info_ref .= " " if $$info_ref && $parenthesized;
16799                 $$info_ref .= "(=" if $parenthesized;
16800                 $$info_ref .= "$meaning";
16801                 $$info_ref .= ")" if $parenthesized;
16802                 $$info_ref .= ".";
16803             }
16804         }
16805
16806         # And the full-name entry includes the short name, if shorter
16807         if ($info_ref == \$full_info
16808             && length $standard_short_name < length $standard_full_name)
16809         {
16810             $full_info =~ s/\.\Z//;
16811             $full_info .= "  " if $full_info;
16812             $full_info .= "(Short: $short_name)";
16813         }
16814
16815         if ($table->perl_extension) {
16816             $$info_ref =~ s/\.\Z//;
16817             $$info_ref .= ".  " if $$info_ref;
16818             $$info_ref .= "(Perl extension)";
16819         }
16820     }
16821
16822     my $definition;
16823     my $definition_table;
16824     my $type = $table->property->type;
16825     if ($type == $BINARY || $type == $FORCED_BINARY) {
16826         $definition_table = $table->property->table('Y');
16827     }
16828     elsif ($table->isa('Match_Table')) {
16829         $definition_table = $table;
16830     }
16831
16832     $definition = $definition_table->calculate_table_definition
16833                                             if defined $definition_table
16834                                                     && $definition_table != 0;
16835
16836     # Add any extra annotations to the full name entry
16837     foreach my $more_info ($table->description,
16838                             $definition,
16839                             $table->note,
16840                             $table->status_info)
16841     {
16842         next unless $more_info;
16843         $full_info =~ s/\.\Z//;
16844         $full_info .= ".  " if $full_info;
16845         $full_info .= $more_info;
16846     }
16847     if ($table->property->type == $FORCED_BINARY) {
16848         if ($full_info) {
16849             $full_info =~ s/\.\Z//;
16850             $full_info .= ".  ";
16851         }
16852         $full_info .= "This is a combination property which has both:"
16853                     . " 1) a map to various string values; and"
16854                     . " 2) a map to boolean Y/N, where 'Y' means the"
16855                     . " string value is non-empty.  Add the prefix 'is'"
16856                     . " to the prop_invmap() call to get the latter";
16857     }
16858
16859     # These keep track if have created full and short name pod entries for the
16860     # property
16861     my $done_full = 0;
16862     my $done_short = 0;
16863
16864     # Every possible name is kept track of, even those that aren't going to be
16865     # output.  This way we can be sure to find the ambiguities.
16866     foreach my $alias ($table->aliases) {
16867         my $name = $alias->name;
16868         my $standard = standardize($name);
16869         my $info;
16870         my $output_this = $alias->ucd;
16871
16872         # If the full and short names are the same, we want to output the full
16873         # one's entry, so it has priority.
16874         if ($standard eq $standard_full_name) {
16875             next if $done_full;
16876             $done_full = 1;
16877             $info = $full_info;
16878         }
16879         elsif ($standard eq $standard_short_name) {
16880             next if $done_short;
16881             $done_short = 1;
16882             next if $standard_short_name eq $standard_full_name;
16883             $info = $short_info;
16884         }
16885         else {
16886             $info = $other_info;
16887         }
16888
16889         $combination_property{$standard} = 1
16890                                   if $table->property->type == $FORCED_BINARY;
16891
16892         # Here, we have set up the two columns for this entry.  But if an
16893         # entry already exists for this name, we have to decide which one
16894         # we're going to later output.
16895         if (exists $ucd_pod{$standard}) {
16896
16897             # If the two entries refer to the same property, it's not going to
16898             # be ambiguous.  (Likely it's because the names when standardized
16899             # are the same.)  But that means if they are different properties,
16900             # there is ambiguity.
16901             if ($ucd_pod{$standard}->{'property'} != $property) {
16902
16903                 # Here, we have an ambiguity.  This code assumes that one is
16904                 # scheduled to be output and one not and that one is a perl
16905                 # extension (which is not to be output) and the other isn't.
16906                 # If those assumptions are wrong, things have to be rethought.
16907                 if ($ucd_pod{$standard}{'output_this'} == $output_this
16908                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16909                     || $output_this == $perl_extension)
16910                 {
16911                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16912                 }
16913
16914                 # We modify the info column of the one being output to
16915                 # indicate the ambiguity.  Set $which to point to that one's
16916                 # info.
16917                 my $which;
16918                 if ($ucd_pod{$standard}{'output_this'}) {
16919                     $which = \$ucd_pod{$standard}->{'info'};
16920                 }
16921                 else {
16922                     $which = \$info;
16923                     $meaning = $ucd_pod{$standard}{'meaning'};
16924                 }
16925
16926                 chomp $$which;
16927                 $$which =~ s/\.\Z//;
16928                 $$which .= "; NOT '$standard' meaning '$meaning'";
16929
16930                 $ambiguous_names{$standard} = 1;
16931             }
16932
16933             # Use the non-perl-extension variant
16934             next unless $ucd_pod{$standard}{'perl_extension'};
16935         }
16936
16937         # Store enough information about this entry that we can later look for
16938         # ambiguities, and output it properly.
16939         $ucd_pod{$standard} = { 'name' => $name,
16940                                 'info' => $info,
16941                                 'meaning' => $meaning,
16942                                 'output_this' => $output_this,
16943                                 'perl_extension' => $perl_extension,
16944                                 'property' => $property,
16945                                 'status' => $alias->status,
16946         };
16947     } # End of looping through all this table's aliases
16948
16949     return;
16950 }
16951
16952 sub pod_alphanumeric_sort {
16953     # Sort pod entries alphanumerically.
16954
16955     # The first few character columns are filler, plus the '\p{'; and get rid
16956     # of all the trailing stuff, starting with the trailing '}', so as to sort
16957     # on just 'Name=Value'
16958     (my $a = lc $a) =~ s/^ .*? \{ //x;
16959     $a =~ s/}.*//;
16960     (my $b = lc $b) =~ s/^ .*? \{ //x;
16961     $b =~ s/}.*//;
16962
16963     # Determine if the two operands are both internal only or both not.
16964     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16965     # should be the underscore that begins internal only
16966     my $a_is_internal = (substr($a, 0, 1) eq '_');
16967     my $b_is_internal = (substr($b, 0, 1) eq '_');
16968
16969     # Sort so the internals come last in the table instead of first (which the
16970     # leading underscore would otherwise indicate).
16971     if ($a_is_internal != $b_is_internal) {
16972         return 1 if $a_is_internal;
16973         return -1
16974     }
16975
16976     # Determine if the two operands are compound or not, and if so if are
16977     # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16978     # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16979     # all of which this considers numeric, and for sorting, looks just at the
16980     # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16981     my $split_re = qr/
16982         ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16983                      # property name
16984         [:=] \s*     # The syntax for the compound form
16985         (?:          # followed by ...
16986             (        # $2 gets defined if what follows is a "numeric"
16987                      # expression, which is ...
16988               ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16989                                         # number, optionally signed
16990                | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16991                                          # of these go into $3
16992              | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16993                                          # number, into $4
16994             )
16995             | .* $    # If not "numeric", accept anything so that $1 gets
16996                       # defined if it is any compound form
16997         ) /ix;
16998     my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16999     my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
17000
17001     # Sort alphabeticlly on the whole property name if either operand isn't
17002     # compound, or they differ.
17003     return $a cmp $b if   ! defined $a_initial
17004                        || ! defined $b_initial
17005                        || $a_initial ne $b_initial;
17006
17007     if (! defined $a_numeric) {
17008
17009         # If neither is numeric, use alpha sort
17010         return $a cmp $b if ! defined $b_numeric;
17011         return 1;  # Sort numeric ahead of alpha
17012     }
17013
17014     # Here $a is numeric
17015     return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
17016
17017     # Here they are both numeric in the same property.
17018     # Convert version numbers into regular numbers
17019     if (defined $a_version) {
17020         ($a_number = $a_version) =~ s/^V//i;
17021         $a_number =~ s/_/./;
17022     }
17023     else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
17024         $a_number =~ s/ ^ [[:alpha:]]+ //x;
17025     }
17026     if (defined $b_version) {
17027         ($b_number = $b_version) =~ s/^V//i;
17028         $b_number =~ s/_/./;
17029     }
17030     else {
17031         $b_number =~ s/ ^ [[:alpha:]]+ //x;
17032     }
17033
17034     # Convert rationals to floating for the comparison.
17035     $a_number = eval $a_number if $a_number =~ qr{/};
17036     $b_number = eval $b_number if $b_number =~ qr{/};
17037
17038     return $a_number <=> $b_number || $a cmp $b;
17039 }
17040
17041 sub make_pod () {
17042     # Create the .pod file.  This generates the various subsections and then
17043     # combines them in one big HERE document.
17044
17045     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
17046
17047     return unless defined $pod_directory;
17048     print "Making pod file\n" if $verbosity >= $PROGRESS;
17049
17050     my $exception_message =
17051     '(Any exceptions are individually noted beginning with the word NOT.)';
17052     my @block_warning;
17053     if (-e 'Blocks.txt') {
17054
17055         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
17056         # if the global $has_In_conflicts indicates we have them.
17057         push @match_properties, format_pod_line($indent_info_column,
17058                                                 '\p{In_*}',
17059                                                 '\p{Block: *}'
17060                                                     . (($has_In_conflicts)
17061                                                       ? " $exception_message"
17062                                                       : ""),
17063                                                  $DISCOURAGED);
17064         @block_warning = << "END";
17065
17066 In particular, matches in the Block property have single forms
17067 defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
17068 all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
17069 C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
17070 C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
17071 come along that would force Perl to change the meaning of one or more of
17072 these, and your program would no longer be correct.  Currently there are no
17073 such conflicts with the form that begins C<"In_">, but there are many with the
17074 other two shortcuts, and Unicode continues to define new properties that begin
17075 with C<"In">, so it's quite possible that a conflict will occur in the future.
17076 The compound form is guaranteed to not become obsolete, and its meaning is
17077 clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
17078
17079 User-defined properties must begin with "In" or "Is".  These override any
17080 Unicode property of the same name.
17081 END
17082     }
17083     my $text = $Is_flags_text;
17084     $text = "$exception_message $text" if $has_Is_conflicts;
17085
17086     # And the 'Is_ line';
17087     push @match_properties, format_pod_line($indent_info_column,
17088                                             '\p{Is_*}',
17089                                             "\\p{*} $text");
17090
17091     # Sort the properties array for output.  It is sorted alphabetically
17092     # except numerically for numeric properties, and only output unique lines.
17093     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
17094
17095     my $formatted_properties = simple_fold(\@match_properties,
17096                                         "",
17097                                         # indent succeeding lines by two extra
17098                                         # which looks better
17099                                         $indent_info_column + 2,
17100
17101                                         # shorten the line length by how much
17102                                         # the formatter indents, so the folded
17103                                         # line will fit in the space
17104                                         # presumably available
17105                                         $automatic_pod_indent);
17106     # Add column headings, indented to be a little more centered, but not
17107     # exactly
17108     $formatted_properties =  format_pod_line($indent_info_column,
17109                                                     '    NAME',
17110                                                     '           INFO')
17111                                     . "\n"
17112                                     . $formatted_properties;
17113
17114     # Generate pod documentation lines for the tables that match nothing
17115     my $zero_matches = "";
17116     if (@zero_match_tables) {
17117         @zero_match_tables = uniques(@zero_match_tables);
17118         $zero_matches = join "\n\n",
17119                         map { $_ = '=item \p{' . $_->complete_name . "}" }
17120                             sort { $a->complete_name cmp $b->complete_name }
17121                             @zero_match_tables;
17122
17123         $zero_matches = <<END;
17124
17125 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
17126
17127 Unicode has some property-value pairs that currently don't match anything.
17128 This happens generally either because they are obsolete, or they exist for
17129 symmetry with other forms, but no language has yet been encoded that uses
17130 them.  In this version of Unicode, the following match zero code points:
17131
17132 =over 4
17133
17134 $zero_matches
17135
17136 =back
17137
17138 END
17139     }
17140
17141     # Generate list of properties that we don't accept, grouped by the reasons
17142     # why.  This is so only put out the 'why' once, and then list all the
17143     # properties that have that reason under it.
17144
17145     my %why_list;   # The keys are the reasons; the values are lists of
17146                     # properties that have the key as their reason
17147
17148     # For each property, add it to the list that are suppressed for its reason
17149     # The sort will cause the alphabetically first properties to be added to
17150     # each list first, so each list will be sorted.
17151     foreach my $property (sort keys %why_suppressed) {
17152         next unless $why_suppressed{$property};
17153         push @{$why_list{$why_suppressed{$property}}}, $property;
17154     }
17155
17156     # For each reason (sorted by the first property that has that reason)...
17157     my @bad_re_properties;
17158     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
17159                      keys %why_list)
17160     {
17161         # Add to the output, all the properties that have that reason.
17162         my $has_item = 0;   # Flag if actually output anything.
17163         foreach my $name (@{$why_list{$why}}) {
17164
17165             # Split compound names into $property and $table components
17166             my $property = $name;
17167             my $table;
17168             if ($property =~ / (.*) = (.*) /x) {
17169                 $property = $1;
17170                 $table = $2;
17171             }
17172
17173             # This release of Unicode may not have a property that is
17174             # suppressed, so don't reference a non-existent one.
17175             $property = property_ref($property);
17176             next if ! defined $property;
17177
17178             # And since this list is only for match tables, don't list the
17179             # ones that don't have match tables.
17180             next if ! $property->to_create_match_tables;
17181
17182             # Find any abbreviation, and turn it into a compound name if this
17183             # is a property=value pair.
17184             my $short_name = $property->name;
17185             $short_name .= '=' . $property->table($table)->name if $table;
17186
17187             # Start with an empty line.
17188             push @bad_re_properties, "\n\n" unless $has_item;
17189
17190             # And add the property as an item for the reason.
17191             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
17192             $has_item = 1;
17193         }
17194
17195         # And add the reason under the list of properties, if such a list
17196         # actually got generated.  Note that the header got added
17197         # unconditionally before.  But pod ignores extra blank lines, so no
17198         # harm.
17199         push @bad_re_properties, "\n$why\n" if $has_item;
17200
17201     } # End of looping through each reason.
17202
17203     if (! @bad_re_properties) {
17204         push @bad_re_properties,
17205                 "*** This installation accepts ALL non-Unihan properties ***";
17206     }
17207     else {
17208         # Add =over only if non-empty to avoid an empty =over/=back section,
17209         # which is considered bad form.
17210         unshift @bad_re_properties, "\n=over 4\n";
17211         push @bad_re_properties, "\n=back\n";
17212     }
17213
17214     # Similarly, generate a list of files that we don't use, grouped by the
17215     # reasons why (Don't output if the reason is empty).  First, create a hash
17216     # whose keys are the reasons, and whose values are anonymous arrays of all
17217     # the files that share that reason.
17218     my %grouped_by_reason;
17219     foreach my $file (keys %skipped_files) {
17220         next unless $skipped_files{$file};
17221         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
17222     }
17223
17224     # Then, sort each group.
17225     foreach my $group (keys %grouped_by_reason) {
17226         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
17227                                         @{$grouped_by_reason{$group}} ;
17228     }
17229
17230     # Finally, create the output text.  For each reason (sorted by the
17231     # alphabetically first file that has that reason)...
17232     my @unused_files;
17233     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
17234                                cmp lc $grouped_by_reason{$b}->[0]
17235                               }
17236                          keys %grouped_by_reason)
17237     {
17238         # Add all the files that have that reason to the output.  Start
17239         # with an empty line.
17240         push @unused_files, "\n\n";
17241         push @unused_files, map { "\n=item F<$_> \n" }
17242                             @{$grouped_by_reason{$reason}};
17243         # And add the reason under the list of files
17244         push @unused_files, "\n$reason\n";
17245     }
17246
17247     # Similarly, create the output text for the UCD section of the pod
17248     my @ucd_pod;
17249     foreach my $key (keys %ucd_pod) {
17250         next unless $ucd_pod{$key}->{'output_this'};
17251         push @ucd_pod, format_pod_line($indent_info_column,
17252                                        $ucd_pod{$key}->{'name'},
17253                                        $ucd_pod{$key}->{'info'},
17254                                        $ucd_pod{$key}->{'status'},
17255                                       );
17256     }
17257
17258     # Sort alphabetically, and fold for output
17259     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
17260     my $ucd_pod = simple_fold(\@ucd_pod,
17261                            ' ',
17262                            $indent_info_column,
17263                            $automatic_pod_indent);
17264     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
17265                 . "\n"
17266                 . $ucd_pod;
17267     my $space_hex = sprintf("%02x", ord " ");
17268     local $" = "";
17269
17270     # Everything is ready to assemble.
17271     my @OUT = << "END";
17272 =begin comment
17273
17274 $HEADER
17275
17276 To change this file, edit $0 instead.
17277
17278 =end comment
17279
17280 =head1 NAME
17281
17282 $pod_file - Index of Unicode Version $unicode_version character properties in Perl
17283
17284 =head1 DESCRIPTION
17285
17286 This document provides information about the portion of the Unicode database
17287 that deals with character properties, that is the portion that is defined on
17288 single code points.  (L</Other information in the Unicode data base>
17289 below briefly mentions other data that Unicode provides.)
17290
17291 Perl can provide access to all non-provisional Unicode character properties,
17292 though not all are enabled by default.  The omitted ones are the Unihan
17293 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
17294 deprecated or Unicode-internal properties.  (An installation may choose to
17295 recompile Perl's tables to change this.  See L</Unicode character
17296 properties that are NOT accepted by Perl>.)
17297
17298 For most purposes, access to Unicode properties from the Perl core is through
17299 regular expression matches, as described in the next section.
17300 For some special purposes, and to access the properties that are not suitable
17301 for regular expression matching, all the Unicode character properties that
17302 Perl handles are accessible via the standard L<Unicode::UCD> module, as
17303 described in the section L</Properties accessible through Unicode::UCD>.
17304
17305 Perl also provides some additional extensions and short-cut synonyms
17306 for Unicode properties.
17307
17308 This document merely lists all available properties and does not attempt to
17309 explain what each property really means.  There is a brief description of each
17310 Perl extension; see L<perlunicode/Other Properties> for more information on
17311 these.  There is some detail about Blocks, Scripts, General_Category,
17312 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
17313 official Unicode properties, refer to the Unicode standard.  A good starting
17314 place is L<$unicode_reference_url>.
17315
17316 Note that you can define your own properties; see
17317 L<perlunicode/"User-Defined Character Properties">.
17318
17319 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
17320
17321 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17322 most of the Unicode character properties.  The table below shows all these
17323 constructs, both single and compound forms.
17324
17325 B<Compound forms> consist of two components, separated by an equals sign or a
17326 colon.  The first component is the property name, and the second component is
17327 the particular value of the property to match against, for example,
17328 C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17329 to match characters whose Script_Extensions property value is Greek.
17330 (C<Script_Extensions> is an improved version of the C<Script> property.)
17331
17332 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17333 their equivalent compound forms.  The table shows these equivalences.  (In our
17334 example, C<\\p{Greek}> is a just a shortcut for
17335 C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
17336 forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
17337 These are also listed in the table.
17338
17339 In parsing these constructs, Perl always ignores Upper/lower case differences
17340 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
17341 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
17342 the left brace completely changes the meaning of the construct, from "match"
17343 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
17344 for improved legibility.
17345
17346 Also, white space, hyphens, and underscores are normally ignored
17347 everywhere between the {braces}, and hence can be freely added or removed
17348 even if the C</x> modifier hasn't been specified on the regular expression.
17349 But in the table below $a_bold_stricter at the beginning of an entry
17350 means that tighter (stricter) rules are used for that entry:
17351
17352 =over 4
17353
17354 =over 4
17355
17356 =item Single form (C<\\p{name}>) tighter rules:
17357
17358 White space, hyphens, and underscores ARE significant
17359 except for:
17360
17361 =over 4
17362
17363 =item * white space adjacent to a non-word character
17364
17365 =item * underscores separating digits in numbers
17366
17367 =back
17368
17369 That means, for example, that you can freely add or remove white space
17370 adjacent to (but within) the braces without affecting the meaning.
17371
17372 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17373
17374 The tighter rules given above for the single form apply to everything to the
17375 right of the colon or equals; the looser rules still apply to everything to
17376 the left.
17377
17378 That means, for example, that you can freely add or remove white space
17379 adjacent to (but within) the braces and the colon or equal sign.
17380
17381 =back
17382
17383 =back
17384
17385 Some properties are considered obsolete by Unicode, but still available.
17386 There are several varieties of obsolescence:
17387
17388 =over 4
17389
17390 =over 4
17391
17392 =item Stabilized
17393
17394 A property may be stabilized.  Such a determination does not indicate
17395 that the property should or should not be used; instead it is a declaration
17396 that the property will not be maintained nor extended for newly encoded
17397 characters.  Such properties are marked with $a_bold_stabilized in the
17398 table.
17399
17400 =item Deprecated
17401
17402 A property may be deprecated, perhaps because its original intent
17403 has been replaced by another property, or because its specification was
17404 somehow defective.  This means that its use is strongly
17405 discouraged, so much so that a warning will be issued if used, unless the
17406 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17407 statement.  $A_bold_deprecated flags each such entry in the table, and
17408 the entry there for the longest, most descriptive version of the property will
17409 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17410 warning, even for properties that aren't officially deprecated by Unicode,
17411 when there used to be characters or code points that were matched by them, but
17412 no longer.  This is to warn you that your program may not work like it did on
17413 earlier Unicode releases.
17414
17415 A deprecated property may be made unavailable in a future Perl version, so it
17416 is best to move away from them.
17417
17418 A deprecated property may also be stabilized, but this fact is not shown.
17419
17420 =item Obsolete
17421
17422 Properties marked with $a_bold_obsolete in the table are considered (plain)
17423 obsolete.  Generally this designation is given to properties that Unicode once
17424 used for internal purposes (but not any longer).
17425
17426 =item Discouraged
17427
17428 This is not actually a Unicode-specified obsolescence, but applies to certain
17429 Perl extensions that are present for backwards compatibility, but are
17430 discouraged from being used.  These are not obsolete, but their meanings are
17431 not stable.  Future Unicode versions could force any of these extensions to be
17432 removed without warning, replaced by another property with the same name that
17433 means something different.  $A_bold_discouraged flags each such entry in the
17434 table.  Use the equivalent shown instead.
17435
17436 @block_warning
17437
17438 =back
17439
17440 =back
17441
17442 The table below has two columns.  The left column contains the C<\\p{}>
17443 constructs to look up, possibly preceded by the flags mentioned above; and
17444 the right column contains information about them, like a description, or
17445 synonyms.  The table shows both the single and compound forms for each
17446 property that has them.  If the left column is a short name for a property,
17447 the right column will give its longer, more descriptive name; and if the left
17448 column is the longest name, the right column will show any equivalent shortest
17449 name, in both single and compound forms if applicable.
17450
17451 If braces are not needed to specify a property (e.g., C<\\pL>), the left
17452 column contains both forms, with and without braces.
17453
17454 The right column will also caution you if a property means something different
17455 than what might normally be expected.
17456
17457 All single forms are Perl extensions; a few compound forms are as well, and
17458 are noted as such.
17459
17460 Numbers in (parentheses) indicate the total number of Unicode code points
17461 matched by the property.  For the entries that give the longest, most
17462 descriptive version of the property, the count is followed by a list of some
17463 of the code points matched by it.  The list includes all the matched
17464 characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17465 a regular expression bracketed character class.  Following that, the next few
17466 higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17467 character is represented as C<\\x$space_hex>.
17468
17469 For emphasis, those properties that match no code points at all are listed as
17470 well in a separate section following the table.
17471
17472 Most properties match the same code points regardless of whether C<"/i">
17473 case-insensitive matching is specified or not.  But a few properties are
17474 affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17475 in the second column.  Under case-insensitive matching they match the
17476 same code pode points as the property I<other_property>.
17477
17478 There is no description given for most non-Perl defined properties (See
17479 L<$unicode_reference_url> for that).
17480
17481 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17482 combinations.  For example, entries like:
17483
17484  \\p{Gc: *}                                  \\p{General_Category: *}
17485
17486 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17487 for the latter is also valid for the former.  Similarly,
17488
17489  \\p{Is_*}                                   \\p{*}
17490
17491 means that if and only if, for example, C<\\p{Foo}> exists, then
17492 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17493 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17494 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17495 underscore.
17496
17497 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17498 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17499 'N*' to indicate this, and doesn't have separate entries for the other
17500 possibilities.  Note that not all properties which have values 'Yes' and 'No'
17501 are binary, and they have all their values spelled out without using this wild
17502 card, and a C<NOT> clause in their description that highlights their not being
17503 binary.  These also require the compound form to match them, whereas true
17504 binary properties have both single and compound forms available.
17505
17506 Note that all non-essential underscores are removed in the display of the
17507 short names below.
17508
17509 B<Legend summary:>
17510
17511 =over 4
17512
17513 =item Z<>B<*> is a wild-card
17514
17515 =item B<(\\d+)> in the info column gives the number of Unicode code points matched
17516 by this property.
17517
17518 =item B<$DEPRECATED> means this is deprecated.
17519
17520 =item B<$OBSOLETE> means this is obsolete.
17521
17522 =item B<$STABILIZED> means this is stabilized.
17523
17524 =item B<$STRICTER> means tighter (stricter) name matching applies.
17525
17526 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17527 stable.
17528
17529 =back
17530
17531 $formatted_properties
17532
17533 $zero_matches
17534
17535 =head1 Properties accessible through Unicode::UCD
17536
17537 The value of any Unicode (not including Perl extensions) character
17538 property mentioned above for any single code point is available through
17539 L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17540 values of all the Unicode properties for a given code point.
17541
17542 Besides these, all the Unicode character properties mentioned above
17543 (except for those marked as for internal use by Perl) are also
17544 accessible by L<Unicode::UCD/prop_invlist()>.
17545
17546 Due to their nature, not all Unicode character properties are suitable for
17547 regular expression matches, nor C<prop_invlist()>.  The remaining
17548 non-provisional, non-internal ones are accessible via
17549 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17550 hasn't included; see L<below for which those are|/Unicode character properties
17551 that are NOT accepted by Perl>).
17552
17553 For compatibility with other parts of Perl, all the single forms given in the
17554 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17555 are recognized.  BUT, there are some ambiguities between some Perl extensions
17556 and the Unicode properties, all of which are silently resolved in favor of the
17557 official Unicode property.  To avoid surprises, you should only use
17558 C<prop_invmap()> for forms listed in the table below, which omits the
17559 non-recommended ones.  The affected forms are the Perl single form equivalents
17560 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17561 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17562 whose short name is C<sc>.  The table indicates the current ambiguities in the
17563 INFO column, beginning with the word C<"NOT">.
17564
17565 The standard Unicode properties listed below are documented in
17566 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17567 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17568 L<perlunicode/Other Properties>;
17569
17570 The first column in the table is a name for the property; the second column is
17571 an alternative name, if any, plus possibly some annotations.  The alternative
17572 name is the property's full name, unless that would simply repeat the first
17573 column, in which case the second column indicates the property's short name
17574 (if different).  The annotations are given only in the entry for the full
17575 name.  The annotations for binary properties include a list of the first few
17576 ranges that the property matches.  To avoid any ambiguity, the SPACE character
17577 is represented as C<\\x$space_hex>.
17578
17579 If a property is obsolete, etc, the entry will be flagged with the same
17580 characters used in the table in the L<section above|/Properties accessible
17581 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17582
17583 $ucd_pod
17584
17585 =head1 Properties accessible through other means
17586
17587 Certain properties are accessible also via core function calls.  These are:
17588
17589  Lowercase_Mapping          lc() and lcfirst()
17590  Titlecase_Mapping          ucfirst()
17591  Uppercase_Mapping          uc()
17592
17593 Also, Case_Folding is accessible through the C</i> modifier in regular
17594 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17595 operator.
17596
17597 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
17598 interpolation in double-quoted strings and regular expressions; and functions
17599 C<charnames::viacode()>, C<charnames::vianame()>, and
17600 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
17601 specified.
17602
17603 Finally, most properties related to decomposition are accessible via
17604 L<Unicode::Normalize>.
17605
17606 =head1 Unicode character properties that are NOT accepted by Perl
17607
17608 Perl will generate an error for a few character properties in Unicode when
17609 used in a regular expression.  The non-Unihan ones are listed below, with the
17610 reasons they are not accepted, perhaps with work-arounds.  The short names for
17611 the properties are listed enclosed in (parentheses).
17612 As described after the list, an installation can change the defaults and choose
17613 to accept any of these.  The list is machine generated based on the
17614 choices made for the installation that generated this document.
17615
17616 @bad_re_properties
17617
17618 An installation can choose to allow any of these to be matched by downloading
17619 the Unicode database from L<http://www.unicode.org/Public/> to
17620 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17621 controlling lists contained in the program
17622 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17623 (C<\%Config> is available from the Config module).
17624
17625 Also, perl can be recompiled to operate on an earlier version of the Unicode
17626 standard.  Further information is at
17627 C<\$Config{privlib}>/F<unicore/README.perl>.
17628
17629 =head1 Other information in the Unicode data base
17630
17631 The Unicode data base is delivered in two different formats.  The XML version
17632 is valid for more modern Unicode releases.  The other version is a collection
17633 of files.  The two are intended to give equivalent information.  Perl uses the
17634 older form; this allows you to recompile Perl to use early Unicode releases.
17635
17636 The only non-character property that Perl currently supports is Named
17637 Sequences, in which a sequence of code points
17638 is given a name and generally treated as a single entity.  (Perl supports
17639 these via the C<\\N{...}> double-quotish construct,
17640 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17641
17642 Below is a list of the files in the Unicode data base that Perl doesn't
17643 currently use, along with very brief descriptions of their purposes.
17644 Some of the names of the files have been shortened from those that Unicode
17645 uses, in order to allow them to be distinguishable from similarly named files
17646 on file systems for which only the first 8 characters of a name are
17647 significant.
17648
17649 =over 4
17650
17651 @unused_files
17652
17653 =back
17654
17655 =head1 SEE ALSO
17656
17657 L<$unicode_reference_url>
17658
17659 L<perlrecharclass>
17660
17661 L<perlunicode>
17662
17663 END
17664
17665     # And write it.  The 0 means no utf8.
17666     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17667     return;
17668 }
17669
17670 sub make_Name_pm () {
17671     # Create and write Name.pm, which contains subroutines and data to use in
17672     # conjunction with Name.pl
17673
17674     # Maybe there's nothing to do.
17675     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17676
17677     my @name = <<END;
17678 $HEADER
17679 $INTERNAL_ONLY_HEADER
17680 END
17681
17682     # Convert these structures to output format.
17683     my $code_points_ending_in_code_point =
17684         main::simple_dumper(\@code_points_ending_in_code_point,
17685                             ' ' x 8);
17686     my $names = main::simple_dumper(\%names_ending_in_code_point,
17687                                     ' ' x 8);
17688     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17689                                     ' ' x 8);
17690
17691     # Do the same with the Hangul names,
17692     my $jamo;
17693     my $jamo_l;
17694     my $jamo_v;
17695     my $jamo_t;
17696     my $jamo_re;
17697     if ($has_hangul_syllables) {
17698
17699         # Construct a regular expression of all the possible
17700         # combinations of the Hangul syllables.
17701         my @L_re;   # Leading consonants
17702         for my $i ($LBase .. $LBase + $LCount - 1) {
17703             push @L_re, $Jamo{$i}
17704         }
17705         my @V_re;   # Middle vowels
17706         for my $i ($VBase .. $VBase + $VCount - 1) {
17707             push @V_re, $Jamo{$i}
17708         }
17709         my @T_re;   # Trailing consonants
17710         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17711             push @T_re, $Jamo{$i}
17712         }
17713
17714         # The whole re is made up of the L V T combination.
17715         $jamo_re = '('
17716                     . join ('|', sort @L_re)
17717                     . ')('
17718                     . join ('|', sort @V_re)
17719                     . ')('
17720                     . join ('|', sort @T_re)
17721                     . ')?';
17722
17723         # These hashes needed by the algorithm were generated
17724         # during reading of the Jamo.txt file
17725         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17726         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17727         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17728         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17729     }
17730
17731     push @name, <<END;
17732
17733 package charnames;
17734
17735 # This module contains machine-generated tables and code for the
17736 # algorithmically-determinable Unicode character names.  The following
17737 # routines can be used to translate between name and code point and vice versa
17738
17739 { # Closure
17740
17741     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17742     # two must be 10; if there are 5, the first must not be a 0.  Written this
17743     # way to decrease backtracking.  The first regex allows the code point to
17744     # be at the end of a word, but to work properly, the word shouldn't end
17745     # with a valid hex character.  The second one won't match a code point at
17746     # the end of a word, and doesn't have the run-on issue
17747     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17748     my \$code_point_re = qr/$code_point_re/;
17749
17750     # In the following hash, the keys are the bases of names which include
17751     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17752     # of each key is another hash which is used to get the low and high ends
17753     # for each range of code points that apply to the name.
17754     my %names_ending_in_code_point = (
17755 $names
17756     );
17757
17758     # The following hash is a copy of the previous one, except is for loose
17759     # matching, so each name has blanks and dashes squeezed out
17760     my %loose_names_ending_in_code_point = (
17761 $loose_names
17762     );
17763
17764     # And the following array gives the inverse mapping from code points to
17765     # names.  Lowest code points are first
17766     my \@code_points_ending_in_code_point = (
17767 $code_points_ending_in_code_point
17768     );
17769 END
17770     # Earlier releases didn't have Jamos.  No sense outputting
17771     # them unless will be used.
17772     if ($has_hangul_syllables) {
17773         push @name, <<END;
17774
17775     # Convert from code point to Jamo short name for use in composing Hangul
17776     # syllable names
17777     my %Jamo = (
17778 $jamo
17779     );
17780
17781     # Leading consonant (can be null)
17782     my %Jamo_L = (
17783 $jamo_l
17784     );
17785
17786     # Vowel
17787     my %Jamo_V = (
17788 $jamo_v
17789     );
17790
17791     # Optional trailing consonant
17792     my %Jamo_T = (
17793 $jamo_t
17794     );
17795
17796     # Computed re that splits up a Hangul name into LVT or LV syllables
17797     my \$syllable_re = qr/$jamo_re/;
17798
17799     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17800     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17801
17802     # These constants names and values were taken from the Unicode standard,
17803     # version 5.1, section 3.12.  They are used in conjunction with Hangul
17804     # syllables
17805     my \$SBase = $SBase_string;
17806     my \$LBase = $LBase_string;
17807     my \$VBase = $VBase_string;
17808     my \$TBase = $TBase_string;
17809     my \$SCount = $SCount;
17810     my \$LCount = $LCount;
17811     my \$VCount = $VCount;
17812     my \$TCount = $TCount;
17813     my \$NCount = \$VCount * \$TCount;
17814 END
17815     } # End of has Jamos
17816
17817     push @name, << 'END';
17818
17819     sub name_to_code_point_special {
17820         my ($name, $loose) = @_;
17821
17822         # Returns undef if not one of the specially handled names; otherwise
17823         # returns the code point equivalent to the input name
17824         # $loose is non-zero if to use loose matching, 'name' in that case
17825         # must be input as upper case with all blanks and dashes squeezed out.
17826 END
17827     if ($has_hangul_syllables) {
17828         push @name, << 'END';
17829
17830         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17831             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17832         {
17833             return if $name !~ qr/^$syllable_re$/;
17834             my $L = $Jamo_L{$1};
17835             my $V = $Jamo_V{$2};
17836             my $T = (defined $3) ? $Jamo_T{$3} : 0;
17837             return ($L * $VCount + $V) * $TCount + $T + $SBase;
17838         }
17839 END
17840     }
17841     push @name, << 'END';
17842
17843         # Name must end in 'code_point' for this to handle.
17844         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17845                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17846
17847         my $base = $1;
17848         my $code_point = CORE::hex $2;
17849         my $names_ref;
17850
17851         if ($loose) {
17852             $names_ref = \%loose_names_ending_in_code_point;
17853         }
17854         else {
17855             return if $base !~ s/-$//;
17856             $names_ref = \%names_ending_in_code_point;
17857         }
17858
17859         # Name must be one of the ones which has the code point in it.
17860         return if ! $names_ref->{$base};
17861
17862         # Look through the list of ranges that apply to this name to see if
17863         # the code point is in one of them.
17864         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17865             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17866             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17867
17868             # Here, the code point is in the range.
17869             return $code_point;
17870         }
17871
17872         # Here, looked like the name had a code point number in it, but
17873         # did not match one of the valid ones.
17874         return;
17875     }
17876
17877     sub code_point_to_name_special {
17878         my $code_point = shift;
17879
17880         # Returns the name of a code point if algorithmically determinable;
17881         # undef if not
17882 END
17883     if ($has_hangul_syllables) {
17884         push @name, << 'END';
17885
17886         # If in the Hangul range, calculate the name based on Unicode's
17887         # algorithm
17888         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17889             use integer;
17890             my $SIndex = $code_point - $SBase;
17891             my $L = $LBase + $SIndex / $NCount;
17892             my $V = $VBase + ($SIndex % $NCount) / $TCount;
17893             my $T = $TBase + $SIndex % $TCount;
17894             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17895             $name .= $Jamo{$T} if $T != $TBase;
17896             return $name;
17897         }
17898 END
17899     }
17900     push @name, << 'END';
17901
17902         # Look through list of these code points for one in range.
17903         foreach my $hash (@code_points_ending_in_code_point) {
17904             return if $code_point < $hash->{'low'};
17905             if ($code_point <= $hash->{'high'}) {
17906                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17907             }
17908         }
17909         return;            # None found
17910     }
17911 } # End closure
17912
17913 1;
17914 END
17915
17916     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
17917     return;
17918 }
17919
17920 sub make_UCD () {
17921     # Create and write UCD.pl, which passes info about the tables to
17922     # Unicode::UCD
17923
17924     # Stringify structures for output
17925     my $loose_property_name_of
17926                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
17927     chomp $loose_property_name_of;
17928
17929     my $strict_property_name_of
17930                            = simple_dumper(\%strict_property_name_of, ' ' x 4);
17931     chomp $strict_property_name_of;
17932
17933     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17934     chomp $stricter_to_file_of;
17935
17936     my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17937     chomp $inline_definitions;
17938
17939     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17940     chomp $loose_to_file_of;
17941
17942     my $nv_floating_to_rational
17943                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17944     chomp $nv_floating_to_rational;
17945
17946     my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
17947     chomp $why_deprecated;
17948
17949     # We set the key to the file when we associated files with tables, but we
17950     # couldn't do the same for the value then, as we might not have the file
17951     # for the alternate table figured out at that time.
17952     foreach my $cased (keys %caseless_equivalent_to) {
17953         my @path = $caseless_equivalent_to{$cased}->file_path;
17954         my $path;
17955         if ($path[0] eq "#") {  # Pseudo-directory '#'
17956             $path = join '/', @path;
17957         }
17958         else {  # Gets rid of lib/
17959             $path = join '/', @path[1, -1];
17960         }
17961         $caseless_equivalent_to{$cased} = $path;
17962     }
17963     my $caseless_equivalent_to
17964                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17965     chomp $caseless_equivalent_to;
17966
17967     my $loose_property_to_file_of
17968                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17969     chomp $loose_property_to_file_of;
17970
17971     my $strict_property_to_file_of
17972                         = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17973     chomp $strict_property_to_file_of;
17974
17975     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17976     chomp $file_to_swash_name;
17977
17978     # Create a mapping from each alias of Perl single-form extensions to all
17979     # its equivalent aliases, for quick look-up.
17980     my %perlprop_to_aliases;
17981     foreach my $table ($perl->tables) {
17982
17983         # First create the list of the aliases of each extension
17984         my @aliases_list;    # List of legal aliases for this extension
17985
17986         my $table_name = $table->name;
17987         my $standard_table_name = standardize($table_name);
17988         my $table_full_name = $table->full_name;
17989         my $standard_table_full_name = standardize($table_full_name);
17990
17991         # Make sure that the list has both the short and full names
17992         push @aliases_list, $table_name, $table_full_name;
17993
17994         my $found_ucd = 0;  # ? Did we actually get an alias that should be
17995                             # output for this table
17996
17997         # Go through all the aliases (including the two just added), and add
17998         # any new unique ones to the list
17999         foreach my $alias ($table->aliases) {
18000
18001             # Skip non-legal names
18002             next unless $alias->ok_as_filename;
18003             next unless $alias->ucd;
18004
18005             $found_ucd = 1;     # have at least one legal name
18006
18007             my $name = $alias->name;
18008             my $standard = standardize($name);
18009
18010             # Don't repeat a name that is equivalent to one already on the
18011             # list
18012             next if $standard eq $standard_table_name;
18013             next if $standard eq $standard_table_full_name;
18014
18015             push @aliases_list, $name;
18016         }
18017
18018         # If there were no legal names, don't output anything.
18019         next unless $found_ucd;
18020
18021         # To conserve memory in the program reading these in, omit full names
18022         # that are identical to the short name, when those are the only two
18023         # aliases for the property.
18024         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
18025             pop @aliases_list;
18026         }
18027
18028         # Here, @aliases_list is the list of all the aliases that this
18029         # extension legally has.  Now can create a map to it from each legal
18030         # standardized alias
18031         foreach my $alias ($table->aliases) {
18032             next unless $alias->ucd;
18033             next unless $alias->ok_as_filename;
18034             push @{$perlprop_to_aliases{standardize($alias->name)}},
18035                  uniques @aliases_list;
18036         }
18037     }
18038
18039     # Make a list of all combinations of properties/values that are suppressed.
18040     my @suppressed;
18041     if (! $debug_skip) {    # This tends to fail in this debug mode
18042         foreach my $property_name (keys %why_suppressed) {
18043
18044             # Just the value
18045             my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
18046
18047             # The hash may contain properties not in this release of Unicode
18048             next unless defined (my $property = property_ref($property_name));
18049
18050             # Find all combinations
18051             foreach my $prop_alias ($property->aliases) {
18052                 my $prop_alias_name = standardize($prop_alias->name);
18053
18054                 # If no =value, there's just one combination possible for this
18055                 if (! $value_name) {
18056
18057                     # The property may be suppressed, but there may be a proxy
18058                     # for it, so it shouldn't be listed as suppressed
18059                     next if $prop_alias->ucd;
18060                     push @suppressed, $prop_alias_name;
18061                 }
18062                 else {  # Otherwise
18063                     foreach my $value_alias
18064                                     ($property->table($value_name)->aliases)
18065                     {
18066                         next if $value_alias->ucd;
18067
18068                         push @suppressed, "$prop_alias_name="
18069                                         .  standardize($value_alias->name);
18070                     }
18071                 }
18072             }
18073         }
18074     }
18075     @suppressed = sort @suppressed; # So doesn't change between runs of this
18076                                     # program
18077
18078     # Convert the structure below (designed for Name.pm) to a form that UCD
18079     # wants, so it doesn't have to modify it at all; i.e. so that it includes
18080     # an element for the Hangul syllables in the appropriate place, and
18081     # otherwise changes the name to include the "-<code point>" suffix.
18082     my @algorithm_names;
18083     my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
18084                                              # along in this version
18085     # Copy it linearly.
18086     for my $i (0 .. @code_points_ending_in_code_point - 1) {
18087
18088         # Insert the hanguls in the correct place.
18089         if (! $done_hangul
18090             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
18091         {
18092             $done_hangul = 1;
18093             push @algorithm_names, { low => $SBase,
18094                                      high => $SBase + $SCount - 1,
18095                                      name => '<hangul syllable>',
18096                                     };
18097         }
18098
18099         # Copy the current entry, modified.
18100         push @algorithm_names, {
18101             low => $code_points_ending_in_code_point[$i]->{'low'},
18102             high => $code_points_ending_in_code_point[$i]->{'high'},
18103             name =>
18104                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
18105         };
18106     }
18107
18108     # Serialize these structures for output.
18109     my $loose_to_standard_value
18110                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
18111     chomp $loose_to_standard_value;
18112
18113     my $string_property_loose_to_name
18114                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
18115     chomp $string_property_loose_to_name;
18116
18117     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
18118     chomp $perlprop_to_aliases;
18119
18120     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
18121     chomp $prop_aliases;
18122
18123     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
18124     chomp $prop_value_aliases;
18125
18126     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
18127     chomp $suppressed;
18128
18129     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
18130     chomp $algorithm_names;
18131
18132     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
18133     chomp $ambiguous_names;
18134
18135     my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
18136     chomp $combination_property;
18137
18138     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
18139     chomp $loose_defaults;
18140
18141     my @ucd = <<END;
18142 $HEADER
18143 $INTERNAL_ONLY_HEADER
18144
18145 # This file is for the use of Unicode::UCD
18146
18147 # Highest legal Unicode code point
18148 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
18149
18150 # Hangul syllables
18151 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
18152 \$Unicode::UCD::HANGUL_COUNT = $SCount;
18153
18154 # Maps Unicode (not Perl single-form extensions) property names in loose
18155 # standard form to their corresponding standard names
18156 \%Unicode::UCD::loose_property_name_of = (
18157 $loose_property_name_of
18158 );
18159
18160 # Same, but strict names
18161 \%Unicode::UCD::strict_property_name_of = (
18162 $strict_property_name_of
18163 );
18164
18165 # Gives the definitions (in the form of inversion lists) for those properties
18166 # whose definitions aren't kept in files
18167 \@Unicode::UCD::inline_definitions = (
18168 $inline_definitions
18169 );
18170
18171 # Maps property, table to file for those using stricter matching.  For paths
18172 # whose directory is '#', the file is in the form of a numeric index into
18173 # \@inline_definitions
18174 \%Unicode::UCD::stricter_to_file_of = (
18175 $stricter_to_file_of
18176 );
18177
18178 # Maps property, table to file for those using loose matching.  For paths
18179 # whose directory is '#', the file is in the form of a numeric index into
18180 # \@inline_definitions
18181 \%Unicode::UCD::loose_to_file_of = (
18182 $loose_to_file_of
18183 );
18184
18185 # Maps floating point to fractional form
18186 \%Unicode::UCD::nv_floating_to_rational = (
18187 $nv_floating_to_rational
18188 );
18189
18190 # If a %e floating point number doesn't have this number of digits in it after
18191 # the decimal point to get this close to a fraction, it isn't considered to be
18192 # that fraction even if all the digits it does have match.
18193 \$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
18194
18195 # Deprecated tables to generate a warning for.  The key is the file containing
18196 # the table, so as to avoid duplication, as many property names can map to the
18197 # file, but we only need one entry for all of them.
18198 \%Unicode::UCD::why_deprecated = (
18199 $why_deprecated
18200 );
18201
18202 # A few properties have different behavior under /i matching.  This maps
18203 # those to substitute files to use under /i.
18204 \%Unicode::UCD::caseless_equivalent = (
18205 $caseless_equivalent_to
18206 );
18207
18208 # Property names to mapping files
18209 \%Unicode::UCD::loose_property_to_file_of = (
18210 $loose_property_to_file_of
18211 );
18212
18213 # Property names to mapping files
18214 \%Unicode::UCD::strict_property_to_file_of = (
18215 $strict_property_to_file_of
18216 );
18217
18218 # Files to the swash names within them.
18219 \%Unicode::UCD::file_to_swash_name = (
18220 $file_to_swash_name
18221 );
18222
18223 # Keys are all the possible "prop=value" combinations, in loose form; values
18224 # are the standard loose name for the 'value' part of the key
18225 \%Unicode::UCD::loose_to_standard_value = (
18226 $loose_to_standard_value
18227 );
18228
18229 # String property loose names to standard loose name
18230 \%Unicode::UCD::string_property_loose_to_name = (
18231 $string_property_loose_to_name
18232 );
18233
18234 # Keys are Perl extensions in loose form; values are each one's list of
18235 # aliases
18236 \%Unicode::UCD::loose_perlprop_to_name = (
18237 $perlprop_to_aliases
18238 );
18239
18240 # Keys are standard property name; values are each one's aliases
18241 \%Unicode::UCD::prop_aliases = (
18242 $prop_aliases
18243 );
18244
18245 # Keys of top level are standard property name; values are keys to another
18246 # hash,  Each one is one of the property's values, in standard form.  The
18247 # values are that prop-val's aliases.  If only one specified, the short and
18248 # long alias are identical.
18249 \%Unicode::UCD::prop_value_aliases = (
18250 $prop_value_aliases
18251 );
18252
18253 # Ordered (by code point ordinal) list of the ranges of code points whose
18254 # names are algorithmically determined.  Each range entry is an anonymous hash
18255 # of the start and end points and a template for the names within it.
18256 \@Unicode::UCD::algorithmic_named_code_points = (
18257 $algorithm_names
18258 );
18259
18260 # The properties that as-is have two meanings, and which must be disambiguated
18261 \%Unicode::UCD::ambiguous_names = (
18262 $ambiguous_names
18263 );
18264
18265 # Keys are the prop-val combinations which are the default values for the
18266 # given property, expressed in standard loose form
18267 \%Unicode::UCD::loose_defaults = (
18268 $loose_defaults
18269 );
18270
18271 # The properties that are combinations, in that they have both a map table and
18272 # a match table.  This is actually for UCD.t, so it knows how to test for
18273 # these.
18274 \%Unicode::UCD::combination_property = (
18275 $combination_property
18276 );
18277
18278 # All combinations of names that are suppressed.
18279 # This is actually for UCD.t, so it knows which properties shouldn't have
18280 # entries.  If it got any bigger, would probably want to put it in its own
18281 # file to use memory only when it was needed, in testing.
18282 \@Unicode::UCD::suppressed_properties = (
18283 $suppressed
18284 );
18285
18286 1;
18287 END
18288
18289     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
18290     return;
18291 }
18292
18293 sub write_all_tables() {
18294     # Write out all the tables generated by this program to files, as well as
18295     # the supporting data structures, pod file, and .t file.
18296
18297     my @writables;              # List of tables that actually get written
18298     my %match_tables_to_write;  # Used to collapse identical match tables
18299                                 # into one file.  Each key is a hash function
18300                                 # result to partition tables into buckets.
18301                                 # Each value is an array of the tables that
18302                                 # fit in the bucket.
18303
18304     # For each property ...
18305     # (sort so that if there is an immutable file name, it has precedence, so
18306     # some other property can't come in and take over its file name.  (We
18307     # don't care if both defined, as they had better be different anyway.)
18308     # The property named 'Perl' needs to be first (it doesn't have any
18309     # immutable file name) because empty properties are defined in terms of
18310     # its table named 'All' under the -annotate option.)   We also sort by
18311     # the property's name.  This is just for repeatability of the outputs
18312     # between runs of this program, but does not affect correctness.
18313     PROPERTY:
18314     foreach my $property ($perl,
18315                           sort { return -1 if defined $a->file;
18316                                  return 1 if defined $b->file;
18317                                  return $a->name cmp $b->name;
18318                                 } grep { $_ != $perl } property_ref('*'))
18319     {
18320         my $type = $property->type;
18321
18322         # And for each table for that property, starting with the mapping
18323         # table for it ...
18324         TABLE:
18325         foreach my $table($property,
18326
18327                         # and all the match tables for it (if any), sorted so
18328                         # the ones with the shortest associated file name come
18329                         # first.  The length sorting prevents problems of a
18330                         # longer file taking a name that might have to be used
18331                         # by a shorter one.  The alphabetic sorting prevents
18332                         # differences between releases
18333                         sort {  my $ext_a = $a->external_name;
18334                                 return 1 if ! defined $ext_a;
18335                                 my $ext_b = $b->external_name;
18336                                 return -1 if ! defined $ext_b;
18337
18338                                 # But return the non-complement table before
18339                                 # the complement one, as the latter is defined
18340                                 # in terms of the former, and needs to have
18341                                 # the information for the former available.
18342                                 return 1 if $a->complement != 0;
18343                                 return -1 if $b->complement != 0;
18344
18345                                 # Similarly, return a subservient table after
18346                                 # a leader
18347                                 return 1 if $a->leader != $a;
18348                                 return -1 if $b->leader != $b;
18349
18350                                 my $cmp = length $ext_a <=> length $ext_b;
18351
18352                                 # Return result if lengths not equal
18353                                 return $cmp if $cmp;
18354
18355                                 # Alphabetic if lengths equal
18356                                 return $ext_a cmp $ext_b
18357                         } $property->tables
18358                     )
18359         {
18360
18361             # Here we have a table associated with a property.  It could be
18362             # the map table (done first for each property), or one of the
18363             # other tables.  Determine which type.
18364             my $is_property = $table->isa('Property');
18365
18366             my $name = $table->name;
18367             my $complete_name = $table->complete_name;
18368
18369             # See if should suppress the table if is empty, but warn if it
18370             # contains something.
18371             my $suppress_if_empty_warn_if_not
18372                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18373
18374             # Calculate if this table should have any code points associated
18375             # with it or not.
18376             my $expected_empty =
18377
18378                 # $perl should be empty
18379                 ($is_property && ($table == $perl))
18380
18381                 # Match tables in properties we skipped populating should be
18382                 # empty
18383                 || (! $is_property && ! $property->to_create_match_tables)
18384
18385                 # Tables and properties that are expected to have no code
18386                 # points should be empty
18387                 || $suppress_if_empty_warn_if_not
18388             ;
18389
18390             # Set a boolean if this table is the complement of an empty binary
18391             # table
18392             my $is_complement_of_empty_binary =
18393                 $type == $BINARY &&
18394                 (($table == $property->table('Y')
18395                     && $property->table('N')->is_empty)
18396                 || ($table == $property->table('N')
18397                     && $property->table('Y')->is_empty));
18398
18399             if ($table->is_empty) {
18400
18401                 if ($suppress_if_empty_warn_if_not) {
18402                     $table->set_fate($SUPPRESSED,
18403                                      $suppress_if_empty_warn_if_not);
18404                 }
18405
18406                 # Suppress (by skipping them) expected empty tables.
18407                 next TABLE if $expected_empty;
18408
18409                 # And setup to later output a warning for those that aren't
18410                 # known to be allowed to be empty.  Don't do the warning if
18411                 # this table is a child of another one to avoid duplicating
18412                 # the warning that should come from the parent one.
18413                 if (($table == $property || $table->parent == $table)
18414                     && $table->fate != $SUPPRESSED
18415                     && $table->fate != $MAP_PROXIED
18416                     && ! grep { $complete_name =~ /^$_$/ }
18417                                                     @tables_that_may_be_empty)
18418                 {
18419                     push @unhandled_properties, "$table";
18420                 }
18421
18422                 # The old way of expressing an empty match list was to
18423                 # complement the list that matches everything.  The new way is
18424                 # to create an empty inversion list, but this doesn't work for
18425                 # annotating, so use the old way then.
18426                 $table->set_complement($All) if $annotate
18427                                                 && $table != $property;
18428             }
18429             elsif ($expected_empty) {
18430                 my $because = "";
18431                 if ($suppress_if_empty_warn_if_not) {
18432                     $because = " because $suppress_if_empty_warn_if_not";
18433                 }
18434
18435                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18436             }
18437
18438             # Some tables should match everything
18439             my $expected_full =
18440                 ($table->fate == $SUPPRESSED)
18441                 ? 0
18442                 : ($is_property)
18443                   ? # All these types of map tables will be full because
18444                     # they will have been populated with defaults
18445                     ($type == $ENUM)
18446
18447                   : # A match table should match everything if its method
18448                     # shows it should
18449                     ($table->matches_all
18450
18451                     # The complement of an empty binary table will match
18452                     # everything
18453                     || $is_complement_of_empty_binary
18454                     )
18455             ;
18456
18457             my $count = $table->count;
18458             if ($expected_full) {
18459                 if ($count != $MAX_WORKING_CODEPOINTS) {
18460                     Carp::my_carp("$table matches only "
18461                     . clarify_number($count)
18462                     . " Unicode code points but should match "
18463                     . clarify_number($MAX_WORKING_CODEPOINTS)
18464                     . " (off by "
18465                     .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18466                     . ").  Proceeding anyway.");
18467                 }
18468
18469                 # Here is expected to be full.  If it is because it is the
18470                 # complement of an (empty) binary table that is to be
18471                 # suppressed, then suppress this one as well.
18472                 if ($is_complement_of_empty_binary) {
18473                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18474                     my $opposing = $property->table($opposing_name);
18475                     my $opposing_status = $opposing->status;
18476                     if ($opposing_status) {
18477                         $table->set_status($opposing_status,
18478                                            $opposing->status_info);
18479                     }
18480                 }
18481             }
18482             elsif ($count == $MAX_UNICODE_CODEPOINTS
18483                    && $name ne "Any"
18484                    && ($table == $property || $table->leader == $table)
18485                    && $table->property->status ne $NORMAL)
18486             {
18487                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18488             }
18489
18490             if ($table->fate >= $SUPPRESSED) {
18491                 if (! $is_property) {
18492                     my @children = $table->children;
18493                     foreach my $child (@children) {
18494                         if ($child->fate < $SUPPRESSED) {
18495                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18496                         }
18497                     }
18498                 }
18499                 next TABLE;
18500
18501             }
18502
18503             if (! $is_property) {
18504
18505                 make_ucd_table_pod_entries($table) if $table->property == $perl;
18506
18507                 # Several things need to be done just once for each related
18508                 # group of match tables.  Do them on the parent.
18509                 if ($table->parent == $table) {
18510
18511                     # Add an entry in the pod file for the table; it also does
18512                     # the children.
18513                     make_re_pod_entries($table) if defined $pod_directory;
18514
18515                     # See if the table matches identical code points with
18516                     # something that has already been processed and is ready
18517                     # for output.  In that case, no need to have two files
18518                     # with the same code points in them.  We use the table's
18519                     # hash() method to store these in buckets, so that it is
18520                     # quite likely that if two tables are in the same bucket
18521                     # they will be identical, so don't have to compare tables
18522                     # frequently.  The tables have to have the same status to
18523                     # share a file, so add this to the bucket hash.  (The
18524                     # reason for this latter is that UCD.pm associates a
18525                     # status with a file.) We don't check tables that are
18526                     # inverses of others, as it would lead to some coding
18527                     # complications, and checking all the regular ones should
18528                     # find everything.
18529                     if ($table->complement == 0) {
18530                         my $hash = $table->hash . ';' . $table->status;
18531
18532                         # Look at each table that is in the same bucket as
18533                         # this one would be.
18534                         foreach my $comparison
18535                                             (@{$match_tables_to_write{$hash}})
18536                         {
18537                             # If the table doesn't point back to this one, we
18538                             # see if it matches identically
18539                             if (   $comparison->leader != $table
18540                                 && $table->matches_identically_to($comparison))
18541                             {
18542                                 $table->set_equivalent_to($comparison,
18543                                                                 Related => 0);
18544                                 next TABLE;
18545                             }
18546                         }
18547
18548                         # Here, not equivalent, add this table to the bucket.
18549                         push @{$match_tables_to_write{$hash}}, $table;
18550                     }
18551                 }
18552             }
18553             else {
18554
18555                 # Here is the property itself.
18556                 # Don't write out or make references to the $perl property
18557                 next if $table == $perl;
18558
18559                 make_ucd_table_pod_entries($table);
18560
18561                 # There is a mapping stored of the various synonyms to the
18562                 # standardized name of the property for Unicode::UCD.
18563                 # Also, the pod file contains entries of the form:
18564                 # \p{alias: *}         \p{full: *}
18565                 # rather than show every possible combination of things.
18566
18567                 my @property_aliases = $property->aliases;
18568
18569                 my $full_property_name = $property->full_name;
18570                 my $property_name = $property->name;
18571                 my $standard_property_name = standardize($property_name);
18572                 my $standard_property_full_name
18573                                         = standardize($full_property_name);
18574
18575                 # We also create for Unicode::UCD a list of aliases for
18576                 # the property.  The list starts with the property name;
18577                 # then its full name.  Legacy properties are not listed in
18578                 # Unicode::UCD.
18579                 my @property_list;
18580                 my @standard_list;
18581                 if ( $property->fate <= $MAP_PROXIED) {
18582                     @property_list = ($property_name, $full_property_name);
18583                     @standard_list = ($standard_property_name,
18584                                         $standard_property_full_name);
18585                 }
18586
18587                 # For each synonym ...
18588                 for my $i (0 .. @property_aliases - 1)  {
18589                     my $alias = $property_aliases[$i];
18590                     my $alias_name = $alias->name;
18591                     my $alias_standard = standardize($alias_name);
18592
18593
18594                     # Add other aliases to the list of property aliases
18595                     if ($property->fate <= $MAP_PROXIED
18596                         && ! grep { $alias_standard eq $_ } @standard_list)
18597                     {
18598                         push @property_list, $alias_name;
18599                         push @standard_list, $alias_standard;
18600                     }
18601
18602                     # For Unicode::UCD, set the mapping of the alias to the
18603                     # property
18604                     if ($type == $STRING) {
18605                         if ($property->fate <= $MAP_PROXIED) {
18606                             $string_property_loose_to_name{$alias_standard}
18607                                             = $standard_property_name;
18608                         }
18609                     }
18610                     else {
18611                         my $hash_ref = ($alias_standard =~ /^_/)
18612                                        ? \%strict_property_name_of
18613                                        : \%loose_property_name_of;
18614                         if (exists $hash_ref->{$alias_standard}) {
18615                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18616                         }
18617                         else {
18618                             $hash_ref->{$alias_standard}
18619                                                 = $standard_property_name;
18620                         }
18621
18622                         # Now for the re pod entry for this alias.  Skip if not
18623                         # outputting a pod; skip the first one, which is the
18624                         # full name so won't have an entry like: '\p{full: *}
18625                         # \p{full: *}', and skip if don't want an entry for
18626                         # this one.
18627                         next if $i == 0
18628                                 || ! defined $pod_directory
18629                                 || ! $alias->make_re_pod_entry;
18630
18631                         my $rhs = "\\p{$full_property_name: *}";
18632                         if ($property != $perl && $table->perl_extension) {
18633                             $rhs .= ' (Perl extension)';
18634                         }
18635                         push @match_properties,
18636                             format_pod_line($indent_info_column,
18637                                         '\p{' . $alias->name . ': *}',
18638                                         $rhs,
18639                                         $alias->status);
18640                     }
18641                 }
18642
18643                 # The list of all possible names is attached to each alias, so
18644                 # lookup is easy
18645                 if (@property_list) {
18646                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
18647                 }
18648
18649                 if ($property->fate <= $MAP_PROXIED) {
18650
18651                     # Similarly, we create for Unicode::UCD a list of
18652                     # property-value aliases.
18653
18654                     # Look at each table in the property...
18655                     foreach my $table ($property->tables) {
18656                         my @values_list;
18657                         my $table_full_name = $table->full_name;
18658                         my $standard_table_full_name
18659                                               = standardize($table_full_name);
18660                         my $table_name = $table->name;
18661                         my $standard_table_name = standardize($table_name);
18662
18663                         # The list starts with the table name and its full
18664                         # name.
18665                         push @values_list, $table_name, $table_full_name;
18666
18667                         # We add to the table each unique alias that isn't
18668                         # discouraged from use.
18669                         foreach my $alias ($table->aliases) {
18670                             next if $alias->status
18671                                  && $alias->status eq $DISCOURAGED;
18672                             my $name = $alias->name;
18673                             my $standard = standardize($name);
18674                             next if $standard eq $standard_table_name;
18675                             next if $standard eq $standard_table_full_name;
18676                             push @values_list, $name;
18677                         }
18678
18679                         # Here @values_list is a list of all the aliases for
18680                         # the table.  That is, all the property-values given
18681                         # by this table.  By agreement with Unicode::UCD,
18682                         # if the name and full name are identical, and there
18683                         # are no other names, drop the duplicate entry to save
18684                         # memory.
18685                         if (@values_list == 2
18686                             && $values_list[0] eq $values_list[1])
18687                         {
18688                             pop @values_list
18689                         }
18690
18691                         # To save memory, unlike the similar list for property
18692                         # aliases above, only the standard forms have the list.
18693                         # This forces an extra step of converting from input
18694                         # name to standard name, but the savings are
18695                         # considerable.  (There is only marginal savings if we
18696                         # did this with the property aliases.)
18697                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18698                     }
18699                 }
18700
18701                 # Don't write out a mapping file if not desired.
18702                 next if ! $property->to_output_map;
18703             }
18704
18705             # Here, we know we want to write out the table, but don't do it
18706             # yet because there may be other tables that come along and will
18707             # want to share the file, and the file's comments will change to
18708             # mention them.  So save for later.
18709             push @writables, $table;
18710
18711         } # End of looping through the property and all its tables.
18712     } # End of looping through all properties.
18713
18714     # Now have all the tables that will have files written for them.  Do it.
18715     foreach my $table (@writables) {
18716         my @directory;
18717         my $filename;
18718         my $property = $table->property;
18719         my $is_property = ($table == $property);
18720
18721         # For very short tables, instead of writing them out to actual files,
18722         # we in-line their inversion list definitions into UCD.pm.  The
18723         # definition replaces the file name, and the special pseudo-directory
18724         # '#' is used to signal this.  This significantly cuts down the number
18725         # of files written at little extra cost to the hashes in UCD.pm.
18726         # And it means, no run-time files to read to get the definitions.
18727         if (! $is_property
18728             && ! $annotate  # For annotation, we want to explicitly show
18729                             # everything, so keep in files
18730             && $table->ranges <= 3)
18731         {
18732             my @ranges = $table->ranges;
18733             my $count = @ranges;
18734             if ($count == 0) {  # 0th index reserved for 0-length lists
18735                 $filename = 0;
18736             }
18737             elsif ($table->leader != $table) {
18738
18739                 # Here, is a table that is equivalent to another; code
18740                 # in register_file_for_name() causes its leader's definition
18741                 # to be used
18742
18743                 next;
18744             }
18745             else {  # No equivalent table so far.
18746
18747                 # Build up its definition range-by-range.
18748                 my $definition = "";
18749                 while (defined (my $range = shift @ranges)) {
18750                     my $end = $range->end;
18751                     if ($end < $MAX_WORKING_CODEPOINT) {
18752                         $count++;
18753                         $end = "\n" . ($end + 1);
18754                     }
18755                     else {  # Extends to infinity, hence no 'end'
18756                         $end = "";
18757                     }
18758                     $definition .= "\n" . $range->start . $end;
18759                 }
18760                 $definition = "V$count" . $definition;
18761                 $filename = @inline_definitions;
18762                 push @inline_definitions, $definition;
18763             }
18764             @directory = "#";
18765             register_file_for_name($table, \@directory, $filename);
18766             next;
18767         }
18768
18769         if (! $is_property) {
18770             # Match tables for the property go in lib/$subdirectory, which is
18771             # the property's name.  Don't use the standard file name for this,
18772             # as may get an unfamiliar alias
18773             @directory = ($matches_directory, $property->external_name);
18774         }
18775         else {
18776
18777             @directory = $table->directory;
18778             $filename = $table->file;
18779         }
18780
18781         # Use specified filename if available, or default to property's
18782         # shortest name.  We need an 8.3 safe filename (which means "an 8
18783         # safe" filename, since after the dot is only 'pl', which is < 3)
18784         # The 2nd parameter is if the filename shouldn't be changed, and
18785         # it shouldn't iff there is a hard-coded name for this table.
18786         $filename = construct_filename(
18787                                 $filename || $table->external_name,
18788                                 ! $filename,    # mutable if no filename
18789                                 \@directory);
18790
18791         register_file_for_name($table, \@directory, $filename);
18792
18793         # Only need to write one file when shared by more than one
18794         # property
18795         next if ! $is_property
18796                 && ($table->leader != $table || $table->complement != 0);
18797
18798         # Construct a nice comment to add to the file
18799         $table->set_final_comment;
18800
18801         $table->write;
18802     }
18803
18804
18805     # Write out the pod file
18806     make_pod;
18807
18808     # And Name.pm, UCD.pl
18809     make_Name_pm;
18810     make_UCD;
18811
18812     make_property_test_script() if $make_test_script;
18813     make_normalization_test_script() if $make_norm_test_script;
18814     return;
18815 }
18816
18817 my @white_space_separators = ( # This used only for making the test script.
18818                             "",
18819                             ' ',
18820                             "\t",
18821                             '   '
18822                         );
18823
18824 sub generate_separator($) {
18825     # This used only for making the test script.  It generates the colon or
18826     # equal separator between the property and property value, with random
18827     # white space surrounding the separator
18828
18829     my $lhs = shift;
18830
18831     return "" if $lhs eq "";  # No separator if there's only one (the r) side
18832
18833     # Choose space before and after randomly
18834     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18835     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18836
18837     # And return the whole complex, half the time using a colon, half the
18838     # equals
18839     return $spaces_before
18840             . (rand() < 0.5) ? '=' : ':'
18841             . $spaces_after;
18842 }
18843
18844 sub generate_tests($$$$$) {
18845     # This used only for making the test script.  It generates test cases that
18846     # are expected to compile successfully in perl.  Note that the LHS and
18847     # RHS are assumed to already be as randomized as the caller wants.
18848
18849     my $lhs = shift;           # The property: what's to the left of the colon
18850                                #  or equals separator
18851     my $rhs = shift;           # The property value; what's to the right
18852     my $valid_code = shift;    # A code point that's known to be in the
18853                                # table given by LHS=RHS; undef if table is
18854                                # empty
18855     my $invalid_code = shift;  # A code point known to not be in the table;
18856                                # undef if the table is all code points
18857     my $warning = shift;
18858
18859     # Get the colon or equal
18860     my $separator = generate_separator($lhs);
18861
18862     # The whole 'property=value'
18863     my $name = "$lhs$separator$rhs";
18864
18865     my @output;
18866     # Create a complete set of tests, with complements.
18867     if (defined $valid_code) {
18868         push @output, <<"EOC"
18869 Expect(1, $valid_code, '\\p{$name}', $warning);
18870 Expect(0, $valid_code, '\\p{^$name}', $warning);
18871 Expect(0, $valid_code, '\\P{$name}', $warning);
18872 Expect(1, $valid_code, '\\P{^$name}', $warning);
18873 EOC
18874     }
18875     if (defined $invalid_code) {
18876         push @output, <<"EOC"
18877 Expect(0, $invalid_code, '\\p{$name}', $warning);
18878 Expect(1, $invalid_code, '\\p{^$name}', $warning);
18879 Expect(1, $invalid_code, '\\P{$name}', $warning);
18880 Expect(0, $invalid_code, '\\P{^$name}', $warning);
18881 EOC
18882     }
18883     return @output;
18884 }
18885
18886 sub generate_wildcard_tests($$$$$) {
18887     # This used only for making the test script.  It generates wildcardl
18888     # matching test cases that are expected to compile successfully in perl.
18889
18890     my $lhs = shift;           # The property: what's to the left of the
18891                                # or equals separator
18892     my $rhs = shift;           # The property value; what's to the right
18893     my $valid_code = shift;    # A code point that's known to be in the
18894                                # table given by LHS=RHS; undef if table is
18895                                # empty
18896     my $invalid_code = shift;  # A code point known to not be in the table;
18897                                # undef if the table is all code points
18898     my $warning = shift;
18899
18900     return if $lhs eq "";
18901     return if $lhs =~ / ^ Is_ /x;   # These are not currently supported
18902
18903     # Generate a standardized pattern, with colon being the delimitter
18904     my $wildcard = "$lhs=:\\A$rhs\\z:";
18905
18906     my @output;
18907     push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
18908                                                         if defined $valid_code;
18909     push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
18910                                                       if defined $invalid_code;
18911     return @output;
18912 }
18913
18914 sub generate_error($$$) {
18915     # This used only for making the test script.  It generates test cases that
18916     # are expected to not only not match, but to be syntax or similar errors
18917
18918     my $lhs = shift;                # The property: what's to the left of the
18919                                     # colon or equals separator
18920     my $rhs = shift;                # The property value; what's to the right
18921     my $already_in_error = shift;   # Boolean; if true it's known that the
18922                                 # unmodified LHS and RHS will cause an error.
18923                                 # This routine should not force another one
18924     # Get the colon or equal
18925     my $separator = generate_separator($lhs);
18926
18927     # Since this is an error only, don't bother to randomly decide whether to
18928     # put the error on the left or right side; and assume that the RHS is
18929     # loosely matched, again for convenience rather than rigor.
18930     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18931
18932     my $property = $lhs . $separator . $rhs;
18933
18934     return <<"EOC";
18935 Error('\\p{$property}');
18936 Error('\\P{$property}');
18937 EOC
18938 }
18939
18940 # These are used only for making the test script
18941 # XXX Maybe should also have a bad strict seps, which includes underscore.
18942
18943 my @good_loose_seps = (
18944             " ",
18945             "-",
18946             "\t",
18947             "",
18948             "_",
18949            );
18950 my @bad_loose_seps = (
18951            "/a/",
18952            ':=',
18953           );
18954
18955 sub randomize_stricter_name {
18956     # This used only for making the test script.  Take the input name and
18957     # return a randomized, but valid version of it under the stricter matching
18958     # rules.
18959
18960     my $name = shift;
18961     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18962
18963     # If the name looks like a number (integer, floating, or rational), do
18964     # some extra work
18965     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18966         my $sign = $1;
18967         my $number = $2;
18968         my $separator = $3;
18969
18970         # If there isn't a sign, part of the time add a plus
18971         # Note: Not testing having any denominator having a minus sign
18972         if (! $sign) {
18973             $sign = '+' if rand() <= .3;
18974         }
18975
18976         # And add 0 or more leading zeros.
18977         $name = $sign . ('0' x int rand(10)) . $number;
18978
18979         if (defined $separator) {
18980             my $extra_zeros = '0' x int rand(10);
18981
18982             if ($separator eq '.') {
18983
18984                 # Similarly, add 0 or more trailing zeros after a decimal
18985                 # point
18986                 $name .= $extra_zeros;
18987             }
18988             else {
18989
18990                 # Or, leading zeros before the denominator
18991                 $name =~ s,/,/$extra_zeros,;
18992             }
18993         }
18994     }
18995
18996     # For legibility of the test, only change the case of whole sections at a
18997     # time.  To do this, first split into sections.  The split returns the
18998     # delimiters
18999     my @sections;
19000     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
19001         trace $section if main::DEBUG && $to_trace;
19002
19003         if (length $section > 1 && $section !~ /\D/) {
19004
19005             # If the section is a sequence of digits, about half the time
19006             # randomly add underscores between some of them.
19007             if (rand() > .5) {
19008
19009                 # Figure out how many underscores to add.  max is 1 less than
19010                 # the number of digits.  (But add 1 at the end to make sure
19011                 # result isn't 0, and compensate earlier by subtracting 2
19012                 # instead of 1)
19013                 my $num_underscores = int rand(length($section) - 2) + 1;
19014
19015                 # And add them evenly throughout, for convenience, not rigor
19016                 use integer;
19017                 my $spacing = (length($section) - 1)/ $num_underscores;
19018                 my $temp = $section;
19019                 $section = "";
19020                 for my $i (1 .. $num_underscores) {
19021                     $section .= substr($temp, 0, $spacing, "") . '_';
19022                 }
19023                 $section .= $temp;
19024             }
19025             push @sections, $section;
19026         }
19027         else {
19028
19029             # Here not a sequence of digits.  Change the case of the section
19030             # randomly
19031             my $switch = int rand(4);
19032             if ($switch == 0) {
19033                 push @sections, uc $section;
19034             }
19035             elsif ($switch == 1) {
19036                 push @sections, lc $section;
19037             }
19038             elsif ($switch == 2) {
19039                 push @sections, ucfirst $section;
19040             }
19041             else {
19042                 push @sections, $section;
19043             }
19044         }
19045     }
19046     trace "returning", join "", @sections if main::DEBUG && $to_trace;
19047     return join "", @sections;
19048 }
19049
19050 sub randomize_loose_name($;$) {
19051     # This used only for making the test script
19052
19053     my $name = shift;
19054     my $want_error = shift;  # if true, make an error
19055     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
19056
19057     $name = randomize_stricter_name($name);
19058
19059     my @parts;
19060     push @parts, $good_loose_seps[rand(@good_loose_seps)];
19061
19062     # Preserve trailing ones for the sake of not stripping the underscore from
19063     # 'L_'
19064     for my $part (split /[-\s_]+ (?= . )/, $name) {
19065         if (@parts) {
19066             if ($want_error and rand() < 0.3) {
19067                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
19068                 $want_error = 0;
19069             }
19070             else {
19071                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
19072             }
19073         }
19074         push @parts, $part;
19075     }
19076     my $new = join("", @parts);
19077     trace "$name => $new" if main::DEBUG && $to_trace;
19078
19079     if ($want_error) {
19080         if (rand() >= 0.5) {
19081             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
19082         }
19083         else {
19084             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
19085         }
19086     }
19087     return $new;
19088 }
19089
19090 # Used to make sure don't generate duplicate test cases.
19091 my %test_generated;
19092
19093 sub make_property_test_script() {
19094     # This used only for making the test script
19095     # this written directly -- it's huge.
19096
19097     print "Making test script\n" if $verbosity >= $PROGRESS;
19098
19099     # This uses randomness to test different possibilities without testing all
19100     # possibilities.  To ensure repeatability, set the seed to 0.  But if
19101     # tests are added, it will perturb all later ones in the .t file
19102     srand 0;
19103
19104     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
19105
19106     # Create a list of what the %f representation is for each rational number.
19107     # This will be used below.
19108     my @valid_base_floats = '0.0';
19109     foreach my $e_representation (keys %nv_floating_to_rational) {
19110         push @valid_base_floats,
19111                             eval $nv_floating_to_rational{$e_representation};
19112     }
19113
19114     # It doesn't matter whether the elements of this array contain single lines
19115     # or multiple lines. main::write doesn't count the lines.
19116     my @output;
19117
19118     push @output, <<'EOF_CODE';
19119 Error('\p{Script=InGreek}');    # Bug #69018
19120 Test_GCB("1100 $nobreak 1161");  # Bug #70940
19121 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
19122 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
19123 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
19124 Error('\p{InKana}');    # 'Kana' is not a block so InKana shouldn't compile
19125
19126 # Make sure this gets tested; it was not part of the official test suite at
19127 # the time this was added.  Note that this is as it would appear in the
19128 # official suite, and gets modified to check for the perl tailoring by
19129 # Test_WB()
19130 Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
19131 Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
19132 EOF_CODE
19133
19134     # Sort these so get results in same order on different runs of this
19135     # program
19136     foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
19137                                     or
19138                                  lc $a->name cmp lc $b->name
19139                                } property_ref('*'))
19140     {
19141         # Non-binary properties should not match \p{};  Test all for that.
19142         if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
19143             my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
19144                                                             $property->aliases;
19145             foreach my $property_alias ($property->aliases) {
19146                 my $name = standardize($property_alias->name);
19147
19148                 # But some names are ambiguous, meaning a binary property with
19149                 # the same name when used in \p{}, and a different
19150                 # (non-binary) property in other contexts.
19151                 next if grep { $name eq $_ } keys %ambiguous_names;
19152
19153                 push @output, <<"EOF_CODE";
19154 Error('\\p{$name}');
19155 Error('\\P{$name}');
19156 EOF_CODE
19157             }
19158         }
19159         foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
19160                                     or
19161                                   lc $a->name cmp lc $b->name
19162                                 } $property->tables)
19163         {
19164
19165             # Find code points that match, and don't match this table.
19166             my $valid = $table->get_valid_code_point;
19167             my $invalid = $table->get_invalid_code_point;
19168             my $warning = ($table->status eq $DEPRECATED)
19169                             ? "'deprecated'"
19170                             : '""';
19171
19172             # Test each possible combination of the property's aliases with
19173             # the table's.  If this gets to be too many, could do what is done
19174             # in the set_final_comment() for Tables
19175             my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
19176             next unless @table_aliases;
19177             my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
19178             next unless @property_aliases;
19179
19180             # Every property can be optionally be prefixed by 'Is_', so test
19181             # that those work, by creating such a new alias for each
19182             # pre-existing one.
19183             push @property_aliases, map { Alias->new("Is_" . $_->name,
19184                                                     $_->loose_match,
19185                                                     $_->make_re_pod_entry,
19186                                                     $_->ok_as_filename,
19187                                                     $_->status,
19188                                                     $_->ucd,
19189                                                     )
19190                                          } @property_aliases;
19191             my $max = max(scalar @table_aliases, scalar @property_aliases);
19192             for my $j (0 .. $max - 1) {
19193
19194                 # The current alias for property is the next one on the list,
19195                 # or if beyond the end, start over.  Similarly for table
19196                 my $property_name
19197                             = $property_aliases[$j % @property_aliases]->name;
19198
19199                 $property_name = "" if $table->property == $perl;
19200                 my $table_alias = $table_aliases[$j % @table_aliases];
19201                 my $table_name = $table_alias->name;
19202                 my $loose_match = $table_alias->loose_match;
19203
19204                 # If the table doesn't have a file, any test for it is
19205                 # already guaranteed to be in error
19206                 my $already_error = ! $table->file_path;
19207
19208                 # A table that begins with these could actually be a
19209                 # user-defined property, so won't be compile time errors, as
19210                 # the definitions of those can be deferred until runtime
19211                 next if $already_error && $table_name =~ / ^ I[ns] /x;
19212
19213                 # Generate error cases for this alias.
19214                 push @output, generate_error($property_name,
19215                                              $table_name,
19216                                              $already_error);
19217
19218                 # If the table is guaranteed to always generate an error,
19219                 # quit now without generating success cases.
19220                 next if $already_error;
19221
19222                 # Now for the success cases.  First, wildcard matching, as it
19223                 # shouldn't have any randomization.
19224                 if ($table_alias->status eq $NORMAL) {
19225                     push @output, generate_wildcard_tests($property_name,
19226                                                           $table_name,
19227                                                           $valid,
19228                                                           $invalid,
19229                                                           $warning,
19230                                                          );
19231                 }
19232                 my $random;
19233                 if ($loose_match) {
19234
19235                     # For loose matching, create an extra test case for the
19236                     # standard name.
19237                     my $standard = standardize($table_name);
19238
19239                     # $test_name should be a unique combination for each test
19240                     # case; used just to avoid duplicate tests
19241                     my $test_name = "$property_name=$standard";
19242
19243                     # Don't output duplicate test cases.
19244                     if (! exists $test_generated{$test_name}) {
19245                         $test_generated{$test_name} = 1;
19246                         push @output, generate_tests($property_name,
19247                                                      $standard,
19248                                                      $valid,
19249                                                      $invalid,
19250                                                      $warning,
19251                                                  );
19252                         if ($table_alias->status eq $NORMAL) {
19253                             push @output, generate_wildcard_tests(
19254                                                      $property_name,
19255                                                      $standard,
19256                                                      $valid,
19257                                                      $invalid,
19258                                                      $warning,
19259                                                  );
19260                         }
19261                     }
19262                     $random = randomize_loose_name($table_name)
19263                 }
19264                 else { # Stricter match
19265                     $random = randomize_stricter_name($table_name);
19266                 }
19267
19268                 # Now for the main test case for this alias.
19269                 my $test_name = "$property_name=$random";
19270                 if (! exists $test_generated{$test_name}) {
19271                     $test_generated{$test_name} = 1;
19272                     push @output, generate_tests($property_name,
19273                                                  $random,
19274                                                  $valid,
19275                                                  $invalid,
19276                                                  $warning,
19277                                              );
19278
19279                     if ($property->name eq 'nv') {
19280                         if ($table_name !~ qr{/}) {
19281                             push @output, generate_tests($property_name,
19282                                                 sprintf("%.15e", $table_name),
19283                                                 $valid,
19284                                                 $invalid,
19285                                                 $warning,
19286                                             );
19287                     }
19288                     else {
19289                         # If the name is a rational number, add tests for a
19290                         # non-reduced form, and for a floating point equivalent.
19291
19292                         # 60 is a number divisible by a bunch of things
19293                         my ($numerator, $denominator) = $table_name
19294                                                         =~ m! (.+) / (.+) !x;
19295                         $numerator *= 60;
19296                         $denominator *= 60;
19297                         push @output, generate_tests($property_name,
19298                                                     "$numerator/$denominator",
19299                                                     $valid,
19300                                                     $invalid,
19301                                                     $warning,
19302                                     );
19303
19304                         # Calculate the float, and the %e representation
19305                         my $float = eval $table_name;
19306                         my $e_representation = sprintf("%.*e",
19307                                                 $E_FLOAT_PRECISION, $float);
19308                         # Parse that
19309                         my ($non_zeros, $zeros, $exponent_sign, $exponent)
19310                            = $e_representation
19311                                =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
19312                         my $min_e_precision;
19313                         my $min_f_precision;
19314
19315                         if ($exponent_sign eq '+' && $exponent != 0) {
19316                             Carp::my_carp_bug("Not yet equipped to handle"
19317                                             . " positive exponents");
19318                             return;
19319                         }
19320                         else {
19321                             # We're trying to find the minimum precision that
19322                             # is needed to indicate this particular rational
19323                             # for the given $E_FLOAT_PRECISION.  For %e, any
19324                             # trailing zeros, like 1.500e-02 aren't needed, so
19325                             # the correct value is how many non-trailing zeros
19326                             # there are after the decimal point.
19327                             $min_e_precision = length $non_zeros;
19328
19329                             # For %f, like .01500, we want at least
19330                             # $E_FLOAT_PRECISION digits, but any trailing
19331                             # zeros aren't needed, so we can subtract the
19332                             # length of those.  But we also need to include
19333                             # the zeros after the decimal point, but before
19334                             # the first significant digit.
19335                             $min_f_precision = $E_FLOAT_PRECISION
19336                                              + $exponent
19337                                              - length $zeros;
19338                         }
19339
19340                         # Make tests for each possible precision from 1 to
19341                         # just past the worst case.
19342                         my $upper_limit = ($min_e_precision > $min_f_precision)
19343                                            ? $min_e_precision
19344                                            : $min_f_precision;
19345
19346                         for my $i (1 .. $upper_limit + 1) {
19347                             for my $format ("e", "f") {
19348                                 my $this_table
19349                                           = sprintf("%.*$format", $i, $float);
19350
19351                                 # If we don't have enough precision digits,
19352                                 # make a fail test; otherwise a pass test.
19353                                 my $pass = ($format eq "e")
19354                                             ? $i >= $min_e_precision
19355                                             : $i >= $min_f_precision;
19356                                 if ($pass) {
19357                                     push @output, generate_tests($property_name,
19358                                                                 $this_table,
19359                                                                 $valid,
19360                                                                 $invalid,
19361                                                                 $warning,
19362                                                 );
19363                                 }
19364                                 elsif (   $format eq "e"
19365
19366                                           # Here we would fail, but in the %f
19367                                           # case, the representation at this
19368                                           # precision could actually be a
19369                                           # valid one for some other rational
19370                                        || ! grep { $this_table
19371                                                             =~ / ^ $_ 0* $ /x }
19372                                                             @valid_base_floats)
19373                                 {
19374                                     push @output,
19375                                         generate_error($property_name,
19376                                                        $this_table,
19377                                                        1   # 1 => already an
19378                                                            # error
19379                                                 );
19380                                 }
19381                             }
19382                         }
19383                     }
19384                     }
19385                 }
19386             }
19387             $table->DESTROY();
19388         }
19389         $property->DESTROY();
19390     }
19391
19392     # Make any test of the boundary (break) properties TODO if the code
19393     # doesn't match the version being compiled
19394     my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19395                              ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19396                              : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19397
19398     @output= map {
19399         map s/^/    /mgr,
19400         map "$_;\n",
19401         split /;\n/, $_
19402     } @output;
19403
19404     # Cause there to be 'if' statements to only execute a portion of this
19405     # long-running test each time, so that we can have a bunch of .t's running
19406     # in parallel
19407     my $chunks = 10     # Number of test files
19408                - 1      # For GCB & SB
19409                - 1      # For WB
19410                - 4;     # LB split into this many files
19411     my @output_chunked;
19412     my $chunk_count=0;
19413     my $chunk_size= int(@output / $chunks) + 1;
19414     while (@output) {
19415         $chunk_count++;
19416         my @chunk= splice @output, 0, $chunk_size;
19417         push @output_chunked,
19418             "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19419                 @chunk,
19420             "}\n";
19421     }
19422
19423     $chunk_count++;
19424     push @output_chunked,
19425         "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19426             (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19427             (map {"    Test_SB('$_');\n"} @SB_tests),
19428         "}\n";
19429
19430
19431     $chunk_size= int(@LB_tests / 4) + 1;
19432     @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19433     while (@LB_tests) {
19434         $chunk_count++;
19435         my @chunk= splice @LB_tests, 0, $chunk_size;
19436         push @output_chunked,
19437             "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19438                 @chunk,
19439             "}\n";
19440     }
19441
19442     $chunk_count++;
19443     push @output_chunked,
19444         "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19445             (map {"    Test_WB('$_');\n"} @WB_tests),
19446         "}\n";
19447
19448     &write($t_path,
19449            0,           # Not utf8;
19450            [$HEADER,
19451             $TODO_FAILING_BREAKS,
19452             <DATA>,
19453             @output_chunked,
19454             "Finished();\n",
19455            ]);
19456
19457     return;
19458 }
19459
19460 sub make_normalization_test_script() {
19461     print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19462
19463     my $n_path = 'TestNorm.pl';
19464
19465     unshift @normalization_tests, <<'END';
19466 use utf8;
19467 use Test::More;
19468
19469 sub ord_string {    # Convert packed ords to printable string
19470     use charnames ();
19471     return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19472                                                 unpack "U*", shift) .  "'";
19473     #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19474 }
19475
19476 sub Test_N {
19477     my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19478     my $display_source = ord_string($source);
19479     my $display_nfc = ord_string($nfc);
19480     my $display_nfd = ord_string($nfd);
19481     my $display_nfkc = ord_string($nfkc);
19482     my $display_nfkd = ord_string($nfkd);
19483
19484     use Unicode::Normalize;
19485     #    NFC
19486     #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19487     #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19488     #
19489     #    NFD
19490     #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19491     #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19492     #
19493     #    NFKC
19494     #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19495     #      toNFKC(nfkc) == toNFKC(nfkd)
19496     #
19497     #    NFKD
19498     #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19499     #      toNFKD(nfkc) == toNFKD(nfkd)
19500
19501     is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19502     is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19503     is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19504     is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19505     is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19506
19507     is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19508     is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19509     is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19510     is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19511     is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19512
19513     is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19514     is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19515     is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19516     is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19517     is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19518
19519     is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19520     is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19521     is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19522     is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19523     is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19524 }
19525 END
19526
19527     &write($n_path,
19528            1,           # Is utf8;
19529            [
19530             @normalization_tests,
19531             'done_testing();'
19532             ]);
19533     return;
19534 }
19535
19536 # Skip reasons, so will be exact same text and hence the files with each
19537 # reason will get grouped together in perluniprops.
19538 my $Documentation = "Documentation";
19539 my $Indic_Skip
19540             = "Provisional; for the analysis and processing of Indic scripts";
19541 my $Validation = "Validation Tests";
19542 my $Validation_Documentation = "Documentation of validation Tests";
19543
19544 # This is a list of the input files and how to handle them.  The files are
19545 # processed in their order in this list.  Some reordering is possible if
19546 # desired, but the PropertyAliases and PropValueAliases files should be first,
19547 # and the extracted before the others (as data in an extracted file can be
19548 # over-ridden by the non-extracted.  Some other files depend on data derived
19549 # from an earlier file, like UnicodeData requires data from Jamo, and the case
19550 # changing and folding requires data from Unicode.  Mostly, it is safest to
19551 # order by first version releases in (except the Jamo).
19552 #
19553 # The version strings allow the program to know whether to expect a file or
19554 # not, but if a file exists in the directory, it will be processed, even if it
19555 # is in a version earlier than expected, so you can copy files from a later
19556 # release into an earlier release's directory.
19557 my @input_file_objects = (
19558     Input_file->new('PropertyAliases.txt', v3.2,
19559                     Handler => \&process_PropertyAliases,
19560                     Early => [ \&substitute_PropertyAliases ],
19561                     Required_Even_in_Debug_Skip => 1,
19562                    ),
19563     Input_file->new(undef, v0,  # No file associated with this
19564                     Progress_Message => 'Finishing property setup',
19565                     Handler => \&finish_property_setup,
19566                    ),
19567     Input_file->new('PropValueAliases.txt', v3.2,
19568                      Handler => \&process_PropValueAliases,
19569                      Early => [ \&substitute_PropValueAliases ],
19570                      Has_Missings_Defaults => $NOT_IGNORED,
19571                      Required_Even_in_Debug_Skip => 1,
19572                     ),
19573     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19574                     Property => 'General_Category',
19575                    ),
19576     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19577                     Property => 'Canonical_Combining_Class',
19578                     Has_Missings_Defaults => $NOT_IGNORED,
19579                    ),
19580     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19581                     Property => 'Numeric_Type',
19582                     Has_Missings_Defaults => $NOT_IGNORED,
19583                    ),
19584     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19585                     Property => 'East_Asian_Width',
19586                     Has_Missings_Defaults => $NOT_IGNORED,
19587                    ),
19588     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19589                     Property => 'Line_Break',
19590                     Has_Missings_Defaults => $NOT_IGNORED,
19591                    ),
19592     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19593                     Property => 'Bidi_Class',
19594                     Has_Missings_Defaults => $NOT_IGNORED,
19595                    ),
19596     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19597                     Property => 'Decomposition_Type',
19598                     Has_Missings_Defaults => $NOT_IGNORED,
19599                    ),
19600     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19601     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19602                     Property => 'Numeric_Value',
19603                     Each_Line_Handler => \&filter_numeric_value_line,
19604                     Has_Missings_Defaults => $NOT_IGNORED,
19605                    ),
19606     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19607                     Property => 'Joining_Group',
19608                     Has_Missings_Defaults => $NOT_IGNORED,
19609                    ),
19610
19611     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19612                     Property => 'Joining_Type',
19613                     Has_Missings_Defaults => $NOT_IGNORED,
19614                    ),
19615     Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19616                     Skip => 'This file adds no new information not already'
19617                           . ' present in other files',
19618                     # And it's unnecessary programmer work to handle this new
19619                     # format.  Previous Derived files actually had bug fixes
19620                     # in them that were useful, but that should not be the
19621                     # case here.
19622                    ),
19623     Input_file->new('Jamo.txt', v2.0.0,
19624                     Property => 'Jamo_Short_Name',
19625                     Each_Line_Handler => \&filter_jamo_line,
19626                    ),
19627     Input_file->new('UnicodeData.txt', v1.1.5,
19628                     Pre_Handler => \&setup_UnicodeData,
19629
19630                     # We clean up this file for some early versions.
19631                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
19632                                             ? \&filter_v1_ucd
19633                                             : ($v_version eq v2.1.5)
19634                                                 ? \&filter_v2_1_5_ucd
19635
19636                                                 # And for 5.14 Perls with 6.0,
19637                                                 # have to also make changes
19638                                                 : ($v_version ge v6.0.0
19639                                                    && $^V lt v5.17.0)
19640                                                     ? \&filter_v6_ucd
19641                                                     : undef),
19642
19643                                             # Early versions did not have the
19644                                             # proper Unicode_1 names for the
19645                                             # controls
19646                                             (($v_version lt v3.0.0)
19647                                             ? \&filter_early_U1_names
19648                                             : undef),
19649
19650                                             # Early versions did not correctly
19651                                             # use the later method for giving
19652                                             # decimal digit values
19653                                             (($v_version le v3.2.0)
19654                                             ? \&filter_bad_Nd_ucd
19655                                             : undef),
19656
19657                                             # And the main filter
19658                                             \&filter_UnicodeData_line,
19659                                          ],
19660                     EOF_Handler => \&EOF_UnicodeData,
19661                    ),
19662     Input_file->new('CJKXREF.TXT', v1.1.5,
19663                     Withdrawn => v2.0.0,
19664                     Skip => 'Gives the mapping of CJK code points '
19665                           . 'between Unicode and various other standards',
19666                    ),
19667     Input_file->new('ArabicShaping.txt', v2.0.0,
19668                     Each_Line_Handler =>
19669                         ($v_version lt 4.1.0)
19670                                     ? \&filter_old_style_arabic_shaping
19671                                     : undef,
19672                     # The first field after the range is a "schematic name"
19673                     # not used by Perl
19674                     Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19675                     Has_Missings_Defaults => $NOT_IGNORED,
19676                    ),
19677     Input_file->new('Blocks.txt', v2.0.0,
19678                     Property => 'Block',
19679                     Has_Missings_Defaults => $NOT_IGNORED,
19680                     Each_Line_Handler => \&filter_blocks_lines
19681                    ),
19682     Input_file->new('Index.txt', v2.0.0,
19683                     Skip => 'Alphabetical index of Unicode characters',
19684                    ),
19685     Input_file->new('NamesList.txt', v2.0.0,
19686                     Skip => 'Annotated list of characters',
19687                    ),
19688     Input_file->new('PropList.txt', v2.0.0,
19689                     Each_Line_Handler => (($v_version lt v3.1.0)
19690                                             ? \&filter_old_style_proplist
19691                                             : undef),
19692                    ),
19693     Input_file->new('Props.txt', v2.0.0,
19694                     Withdrawn => v3.0.0,
19695                     Skip => 'A subset of F<PropList.txt> (which is used instead)',
19696                    ),
19697     Input_file->new('ReadMe.txt', v2.0.0,
19698                     Skip => $Documentation,
19699                    ),
19700     Input_file->new('Unihan.txt', v2.0.0,
19701                     Withdrawn => v5.2.0,
19702                     Construction_Time_Handler => \&construct_unihan,
19703                     Pre_Handler => \&setup_unihan,
19704                     Optional => [ "",
19705                                   'Unicode_Radical_Stroke'
19706                                 ],
19707                     Each_Line_Handler => \&filter_unihan_line,
19708                    ),
19709     Input_file->new('SpecialCasing.txt', v2.1.8,
19710                     Each_Line_Handler => ($v_version eq 2.1.8)
19711                                          ? \&filter_2_1_8_special_casing_line
19712                                          : \&filter_special_casing_line,
19713                     Pre_Handler => \&setup_special_casing,
19714                     Has_Missings_Defaults => $IGNORED,
19715                    ),
19716     Input_file->new(
19717                     'LineBreak.txt', v3.0.0,
19718                     Has_Missings_Defaults => $NOT_IGNORED,
19719                     Property => 'Line_Break',
19720                     # Early versions had problematic syntax
19721                     Each_Line_Handler => ($v_version ge v3.1.0)
19722                                           ? undef
19723                                           : ($v_version lt v3.0.0)
19724                                             ? \&filter_substitute_lb
19725                                             : \&filter_early_ea_lb,
19726                     # Must use long names for property values see comments at
19727                     # sub filter_substitute_lb
19728                     Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19729                                'Alphabetic', # default to this because XX ->
19730                                              # AL
19731
19732                                # Don't use _Perl_LB as a synonym for
19733                                # Line_Break in later perls, as it is tailored
19734                                # and isn't the same as Line_Break
19735                                'ONLY_EARLY' ],
19736                    ),
19737     Input_file->new('EastAsianWidth.txt', v3.0.0,
19738                     Property => 'East_Asian_Width',
19739                     Has_Missings_Defaults => $NOT_IGNORED,
19740                     # Early versions had problematic syntax
19741                     Each_Line_Handler => (($v_version lt v3.1.0)
19742                                         ? \&filter_early_ea_lb
19743                                         : undef),
19744                    ),
19745     Input_file->new('CompositionExclusions.txt', v3.0.0,
19746                     Property => 'Composition_Exclusion',
19747                    ),
19748     Input_file->new('UnicodeData.html', v3.0.0,
19749                     Withdrawn => v4.0.1,
19750                     Skip => $Documentation,
19751                    ),
19752     Input_file->new('BidiMirroring.txt', v3.0.1,
19753                     Property => 'Bidi_Mirroring_Glyph',
19754                     Has_Missings_Defaults => ($v_version lt v6.2.0)
19755                                               ? $NO_DEFAULTS
19756                                               # Is <none> which doesn't mean
19757                                               # anything to us, we will use the
19758                                               # null string
19759                                               : $IGNORED,
19760                    ),
19761     Input_file->new('NamesList.html', v3.0.0,
19762                     Skip => 'Describes the format and contents of '
19763                           . 'F<NamesList.txt>',
19764                    ),
19765     Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19766                     Withdrawn => v5.1,
19767                     Skip => $Documentation,
19768                    ),
19769     Input_file->new('CaseFolding.txt', v3.0.1,
19770                     Pre_Handler => \&setup_case_folding,
19771                     Each_Line_Handler =>
19772                         [ ($v_version lt v3.1.0)
19773                                  ? \&filter_old_style_case_folding
19774                                  : undef,
19775                            \&filter_case_folding_line
19776                         ],
19777                     Has_Missings_Defaults => $IGNORED,
19778                    ),
19779     Input_file->new("NormTest.txt", v3.0.1,
19780                      Handler => \&process_NormalizationsTest,
19781                      Skip => ($make_norm_test_script) ? 0 : $Validation,
19782                    ),
19783     Input_file->new('DCoreProperties.txt', v3.1.0,
19784                     # 5.2 changed this file
19785                     Has_Missings_Defaults => (($v_version ge v5.2.0)
19786                                             ? $NOT_IGNORED
19787                                             : $NO_DEFAULTS),
19788                    ),
19789     Input_file->new('DProperties.html', v3.1.0,
19790                     Withdrawn => v3.2.0,
19791                     Skip => $Documentation,
19792                    ),
19793     Input_file->new('PropList.html', v3.1.0,
19794                     Withdrawn => v5.1,
19795                     Skip => $Documentation,
19796                    ),
19797     Input_file->new('Scripts.txt', v3.1.0,
19798                     Property => 'Script',
19799                     Each_Line_Handler => (($v_version le v4.0.0)
19800                                           ? \&filter_all_caps_script_names
19801                                           : undef),
19802                     Has_Missings_Defaults => $NOT_IGNORED,
19803                    ),
19804     Input_file->new('DNormalizationProps.txt', v3.1.0,
19805                     Has_Missings_Defaults => $NOT_IGNORED,
19806                     Each_Line_Handler => (($v_version lt v4.0.1)
19807                                       ? \&filter_old_style_normalization_lines
19808                                       : undef),
19809                    ),
19810     Input_file->new('DerivedProperties.html', v3.1.1,
19811                     Withdrawn => v5.1,
19812                     Skip => $Documentation,
19813                    ),
19814     Input_file->new('DAge.txt', v3.2.0,
19815                     Has_Missings_Defaults => $NOT_IGNORED,
19816                     Property => 'Age'
19817                    ),
19818     Input_file->new('HangulSyllableType.txt', v4.0,
19819                     Has_Missings_Defaults => $NOT_IGNORED,
19820                     Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19821                     Property => 'Hangul_Syllable_Type'
19822                    ),
19823     Input_file->new('NormalizationCorrections.txt', v3.2.0,
19824                      # This documents the cumulative fixes to erroneous
19825                      # normalizations in earlier Unicode versions.  Its main
19826                      # purpose is so that someone running on an earlier
19827                      # version can use this file to override what got
19828                      # published in that earlier release.  It would be easy
19829                      # for mktables to handle this file.  But all the
19830                      # corrections in it should already be in the other files
19831                      # for the release it is.  To get it to actually mean
19832                      # something useful, someone would have to be using an
19833                      # earlier Unicode release, and copy it into the directory
19834                      # for that release and recompile.  So far there has been
19835                      # no demand to do that, so this hasn't been implemented.
19836                     Skip => 'Documentation of corrections already '
19837                           . 'incorporated into the Unicode data base',
19838                    ),
19839     Input_file->new('StandardizedVariants.html', v3.2.0,
19840                     Skip => 'Obsoleted as of Unicode 9.0, but previously '
19841                           . 'provided a visual display of the standard '
19842                           . 'variant sequences derived from '
19843                           . 'F<StandardizedVariants.txt>.',
19844                         # I don't know why the html came earlier than the
19845                         # .txt, but both are skipped anyway, so it doesn't
19846                         # matter.
19847                    ),
19848     Input_file->new('StandardizedVariants.txt', v4.0.0,
19849                     Skip => 'Certain glyph variations for character display '
19850                           . 'are standardized.  This lists the non-Unihan '
19851                           . 'ones; the Unihan ones are also not used by '
19852                           . 'Perl, and are in a separate Unicode data base '
19853                           . 'L<http://www.unicode.org/ivd>',
19854                    ),
19855     Input_file->new('UCD.html', v4.0.0,
19856                     Withdrawn => v5.2,
19857                     Skip => $Documentation,
19858                    ),
19859     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19860                     Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
19861                     Property => 'Word_Break',
19862                     Has_Missings_Defaults => $NOT_IGNORED,
19863                    ),
19864     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19865                     Early => [ \&generate_GCB, '_Perl_GCB' ],
19866                     Property => 'Grapheme_Cluster_Break',
19867                     Has_Missings_Defaults => $NOT_IGNORED,
19868                    ),
19869     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19870                     Handler => \&process_GCB_test,
19871                     retain_trailing_comments => 1,
19872                    ),
19873     Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19874                     Skip => $Validation_Documentation,
19875                    ),
19876     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19877                     Handler => \&process_SB_test,
19878                     retain_trailing_comments => 1,
19879                    ),
19880     Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19881                     Skip => $Validation_Documentation,
19882                    ),
19883     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19884                     Handler => \&process_WB_test,
19885                     retain_trailing_comments => 1,
19886                    ),
19887     Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19888                     Skip => $Validation_Documentation,
19889                    ),
19890     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19891                     Property => 'Sentence_Break',
19892                     Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19893                     Has_Missings_Defaults => $NOT_IGNORED,
19894                    ),
19895     Input_file->new('NamedSequences.txt', v4.1.0,
19896                     Handler => \&process_NamedSequences
19897                    ),
19898     Input_file->new('Unihan.html', v4.1.0,
19899                     Withdrawn => v5.2,
19900                     Skip => $Documentation,
19901                    ),
19902     Input_file->new('NameAliases.txt', v5.0,
19903                     Property => 'Name_Alias',
19904                     Each_Line_Handler => ($v_version le v6.0.0)
19905                                    ? \&filter_early_version_name_alias_line
19906                                    : \&filter_later_version_name_alias_line,
19907                    ),
19908         # NameAliases.txt came along in v5.0.  The above constructor handles
19909         # this.  But until 6.1, it was lacking some information needed by core
19910         # perl.  The constructor below handles that.  It is either a kludge or
19911         # clever, depending on your point of view.  The 'Withdrawn' parameter
19912         # indicates not to use it at all starting in 6.1 (so the above
19913         # constructor applies), and the 'v6.1' parameter indicates to use the
19914         # Early parameter before 6.1.  Therefore 'Early" is always used,
19915         # yielding the internal-only property '_Perl_Name_Alias', which it
19916         # gets from a NameAliases.txt from 6.1 or later stored in
19917         # N_Asubst.txt.  In combination with the above constructor,
19918         # 'Name_Alias' is publicly accessible starting with v5.0, and the
19919         # better 6.1 version is accessible to perl core in all releases.
19920     Input_file->new("NameAliases.txt", v6.1,
19921                     Withdrawn => v6.1,
19922                     Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19923                     Property => 'Name_Alias',
19924                     EOF_Handler => \&fixup_early_perl_name_alias,
19925                     Each_Line_Handler =>
19926                                        \&filter_later_version_name_alias_line,
19927                    ),
19928     Input_file->new('NamedSqProv.txt', v5.0.0,
19929                     Skip => 'Named sequences proposed for inclusion in a '
19930                           . 'later version of the Unicode Standard; if you '
19931                           . 'need them now, you can append this file to '
19932                           . 'F<NamedSequences.txt> and recompile perl',
19933                    ),
19934     Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19935                     Handler => \&process_LB_test,
19936                     retain_trailing_comments => 1,
19937                    ),
19938     Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19939                     Skip => $Validation_Documentation,
19940                    ),
19941     Input_file->new("BidiTest.txt", v5.2.0,
19942                     Skip => $Validation,
19943                    ),
19944     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19945                     Optional => "",
19946                     Each_Line_Handler => \&filter_unihan_line,
19947                    ),
19948     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19949                     Optional => "",
19950                     Each_Line_Handler => \&filter_unihan_line,
19951                    ),
19952     Input_file->new('UnihanIRGSources.txt', v5.2.0,
19953                     Optional => [ "",
19954                                   'kCompatibilityVariant',
19955                                   'kIICore',
19956                                   'kIRG_GSource',
19957                                   'kIRG_HSource',
19958                                   'kIRG_JSource',
19959                                   'kIRG_KPSource',
19960                                   'kIRG_MSource',
19961                                   'kIRG_KSource',
19962                                   'kIRG_TSource',
19963                                   'kIRG_USource',
19964                                   'kIRG_VSource',
19965                                ],
19966                     Pre_Handler => \&setup_unihan,
19967                     Each_Line_Handler => \&filter_unihan_line,
19968                    ),
19969     Input_file->new('UnihanNumericValues.txt', v5.2.0,
19970                     Optional => [ "",
19971                                   'kAccountingNumeric',
19972                                   'kOtherNumeric',
19973                                   'kPrimaryNumeric',
19974                                 ],
19975                     Each_Line_Handler => \&filter_unihan_line,
19976                    ),
19977     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19978                     Optional => "",
19979                     Each_Line_Handler => \&filter_unihan_line,
19980                    ),
19981     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19982                     Optional => [ "",
19983                                   'Unicode_Radical_Stroke'
19984                                 ],
19985                     Each_Line_Handler => \&filter_unihan_line,
19986                    ),
19987     Input_file->new('UnihanReadings.txt', v5.2.0,
19988                     Optional => "",
19989                     Each_Line_Handler => \&filter_unihan_line,
19990                    ),
19991     Input_file->new('UnihanVariants.txt', v5.2.0,
19992                     Optional => "",
19993                     Each_Line_Handler => \&filter_unihan_line,
19994                    ),
19995     Input_file->new('CJKRadicals.txt', v5.2.0,
19996                     Skip => 'Maps the kRSUnicode property values to '
19997                           . 'corresponding code points',
19998                    ),
19999     Input_file->new('EmojiSources.txt', v6.0.0,
20000                     Skip => 'Maps certain Unicode code points to their '
20001                           . 'legacy Japanese cell-phone values',
20002                    ),
20003     # This file is actually not usable as-is until 6.1.0, because the property
20004     # is provisional, so its name is missing from PropertyAliases.txt until
20005     # that release, so that further work would have to be done to get it to
20006     # work properly
20007     Input_file->new('ScriptExtensions.txt', v6.0.0,
20008                     Property => 'Script_Extensions',
20009                     Early => [ sub {} ], # Doesn't do anything but ensures
20010                                          # that this isn't skipped for early
20011                                          # versions
20012                     Pre_Handler => \&setup_script_extensions,
20013                     Each_Line_Handler => \&filter_script_extensions_line,
20014                     Has_Missings_Defaults => (($v_version le v6.0.0)
20015                                             ? $NO_DEFAULTS
20016                                             : $IGNORED),
20017                    ),
20018     # These two Indic files are actually not usable as-is until 6.1.0,
20019     # because they are provisional, so their property values are missing from
20020     # PropValueAliases.txt until that release, so that further work would have
20021     # to be done to get them to work properly.
20022     Input_file->new('IndicMatraCategory.txt', v6.0.0,
20023                     Withdrawn => v8.0.0,
20024                     Property => 'Indic_Matra_Category',
20025                     Has_Missings_Defaults => $NOT_IGNORED,
20026                     Skip => $Indic_Skip,
20027                    ),
20028     Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
20029                     Property => 'Indic_Syllabic_Category',
20030                     Has_Missings_Defaults => $NOT_IGNORED,
20031                     Skip => (($v_version lt v8.0.0)
20032                               ? $Indic_Skip
20033                               : 0),
20034                    ),
20035     Input_file->new('USourceData.txt', v6.2.0,
20036                     Skip => 'Documentation of status and cross reference of '
20037                           . 'proposals for encoding by Unicode of Unihan '
20038                           . 'characters',
20039                    ),
20040     Input_file->new('USourceGlyphs.pdf', v6.2.0,
20041                     Skip => 'Pictures of the characters in F<USourceData.txt>',
20042                    ),
20043     Input_file->new('BidiBrackets.txt', v6.3.0,
20044                     Properties => [ 'Bidi_Paired_Bracket',
20045                                     'Bidi_Paired_Bracket_Type'
20046                                   ],
20047                     Has_Missings_Defaults => $NO_DEFAULTS,
20048                    ),
20049     Input_file->new("BidiCharacterTest.txt", v6.3.0,
20050                     Skip => $Validation,
20051                    ),
20052     Input_file->new('IndicPositionalCategory.txt', v8.0.0,
20053                     Property => 'Indic_Positional_Category',
20054                     Has_Missings_Defaults => $NOT_IGNORED,
20055                    ),
20056     Input_file->new('TangutSources.txt', v9.0.0,
20057                     Skip => 'Specifies source mappings for Tangut ideographs'
20058                           . ' and components. This data file also includes'
20059                           . ' informative radical-stroke values that are used'
20060                           . ' internally by Unicode',
20061                    ),
20062     Input_file->new('VerticalOrientation.txt', v10.0.0,
20063                     Property => 'Vertical_Orientation',
20064                     Has_Missings_Defaults => $NOT_IGNORED,
20065                    ),
20066     Input_file->new('NushuSources.txt', v10.0.0,
20067                     Skip => 'Specifies source material for Nushu characters',
20068                    ),
20069     Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
20070                     Property => 'Equivalent_Unified_Ideograph',
20071                     Has_Missings_Defaults => $NOT_IGNORED,
20072                    ),
20073     Input_file->new('EmojiData.txt', v11.0.0,
20074                     # Is in UAX #51 and not the UCD, so must be updated
20075                     # separately, and the first line edited to indicate the
20076                     # UCD release we're pretending it to be in.  The UTC says
20077                     # this is a transitional state.
20078                     Pre_Handler => \&setup_emojidata,
20079                     Has_Missings_Defaults => $NOT_IGNORED,
20080                     Each_Line_Handler => \&filter_emojidata_line,
20081                    ),
20082 );
20083
20084 # End of all the preliminaries.
20085 # Do it...
20086
20087 if (@missing_early_files) {
20088     print simple_fold(join_lines(<<END
20089
20090 The compilation cannot be completed because one or more required input files,
20091 listed below, are missing.  This is because you are compiling Unicode version
20092 $unicode_version, which predates the existence of these file(s).  To fully
20093 function, perl needs the data that these files would have contained if they
20094 had been in this release.  To work around this, create copies of later
20095 versions of the missing files in the directory containing '$0'.  (Perl will
20096 make the necessary adjustments to the data to compensate for it not being the
20097 same version as is being compiled.)  The files are available from unicode.org,
20098 via either ftp or http.  If using http, they will be under
20099 www.unicode.org/versions/.  Below are listed the source file name of each
20100 missing file, the Unicode version to copy it from, and the name to store it
20101 as.  (Note that the listed source file name may not be exactly the one that
20102 Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
20103 to get the correct name.)
20104 END
20105     ));
20106     print simple_fold(join_lines("\n$_")) for @missing_early_files;
20107     exit 2;
20108 }
20109
20110 if ($compare_versions) {
20111     Carp::my_carp(<<END
20112 Warning.  \$compare_versions is set.  Output is not suitable for production
20113 END
20114     );
20115 }
20116
20117 # Put into %potential_files a list of all the files in the directory structure
20118 # that could be inputs to this program
20119 File::Find::find({
20120     wanted=>sub {
20121         return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
20122                                                     # name's case
20123         my $full = lc(File::Spec->rel2abs($_));
20124         $potential_files{$full} = 1;
20125         return;
20126     }
20127 }, File::Spec->curdir());
20128
20129 my @mktables_list_output_files;
20130 my $old_start_time = 0;
20131 my $old_options = "";
20132
20133 if (! -e $file_list) {
20134     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
20135     $write_unchanged_files = 1;
20136 } elsif ($write_unchanged_files) {
20137     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
20138 }
20139 else {
20140     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
20141     my $file_handle;
20142     if (! open $file_handle, "<", $file_list) {
20143         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
20144         $glob_list = 1;
20145     }
20146     else {
20147         my @input;
20148
20149         # Read and parse mktables.lst, placing the results from the first part
20150         # into @input, and the second part into @mktables_list_output_files
20151         for my $list ( \@input, \@mktables_list_output_files ) {
20152             while (<$file_handle>) {
20153                 s/^ \s+ | \s+ $//xg;
20154                 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
20155                     $old_start_time = $1;
20156                     next;
20157                 }
20158                 if (/^ \s* \# \s* From\ options\ (.+) /x) {
20159                     $old_options = $1;
20160                     next;
20161                 }
20162                 next if /^ \s* (?: \# .* )? $/x;
20163                 last if /^ =+ $/x;
20164                 my ( $file ) = split /\t/;
20165                 push @$list, $file;
20166             }
20167             @$list = uniques(@$list);
20168             next;
20169         }
20170
20171         # Look through all the input files
20172         foreach my $input (@input) {
20173             next if $input eq 'version'; # Already have checked this.
20174
20175             # Ignore if doesn't exist.  The checking about whether we care or
20176             # not is done via the Input_file object.
20177             next if ! file_exists($input);
20178
20179             # The paths are stored with relative names, and with '/' as the
20180             # delimiter; convert to absolute on this machine
20181             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
20182             $potential_files{lc $full} = 1;
20183         }
20184     }
20185
20186     close $file_handle;
20187 }
20188
20189 if ($glob_list) {
20190
20191     # Here wants to process all .txt files in the directory structure.
20192     # Convert them to full path names.  They are stored in the platform's
20193     # relative style
20194     my @known_files;
20195     foreach my $object (@input_file_objects) {
20196         my $file = $object->file;
20197         next unless defined $file;
20198         push @known_files, File::Spec->rel2abs($file);
20199     }
20200
20201     my @unknown_input_files;
20202     foreach my $file (keys %potential_files) {  # The keys are stored in lc
20203         next if grep { $file eq lc($_) } @known_files;
20204
20205         # Here, the file is unknown to us.  Get relative path name
20206         $file = File::Spec->abs2rel($file);
20207         push @unknown_input_files, $file;
20208
20209         # What will happen is we create a data structure for it, and add it to
20210         # the list of input files to process.  First get the subdirectories
20211         # into an array
20212         my (undef, $directories, undef) = File::Spec->splitpath($file);
20213         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20214         my @directories = File::Spec->splitdir($directories);
20215
20216         # If the file isn't extracted (meaning none of the directories is the
20217         # extracted one), just add it to the end of the list of inputs.
20218         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
20219             push @input_file_objects, Input_file->new($file, v0);
20220         }
20221         else {
20222
20223             # Here, the file is extracted.  It needs to go ahead of most other
20224             # processing.  Search for the first input file that isn't a
20225             # special required property (that is, find one whose first_release
20226             # is non-0), and isn't extracted.  Also, the Age property file is
20227             # processed before the extracted ones, just in case
20228             # $compare_versions is set.
20229             for (my $i = 0; $i < @input_file_objects; $i++) {
20230                 if ($input_file_objects[$i]->first_released ne v0
20231                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
20232                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
20233                 {
20234                     splice @input_file_objects, $i, 0,
20235                                                 Input_file->new($file, v0);
20236                     last;
20237                 }
20238             }
20239
20240         }
20241     }
20242     if (@unknown_input_files) {
20243         print STDERR simple_fold(join_lines(<<END
20244
20245 The following files are unknown as to how to handle.  Assuming they are
20246 typical property files.  You'll know by later error messages if it worked or
20247 not:
20248 END
20249         ) . " " . join(", ", @unknown_input_files) . "\n\n");
20250     }
20251 } # End of looking through directory structure for more .txt files.
20252
20253 # Create the list of input files from the objects we have defined, plus
20254 # version
20255 my @input_files = qw(version Makefile);
20256 foreach my $object (@input_file_objects) {
20257     my $file = $object->file;
20258     next if ! defined $file;    # Not all objects have files
20259     next if defined $object->skip;;
20260     push @input_files,  $file;
20261 }
20262
20263 if ( $verbosity >= $VERBOSE ) {
20264     print "Expecting ".scalar( @input_files )." input files. ",
20265          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
20266 }
20267
20268 # We set $most_recent to be the most recently changed input file, including
20269 # this program itself (done much earlier in this file)
20270 foreach my $in (@input_files) {
20271     next unless -e $in;        # Keep going even if missing a file
20272     my $mod_time = (stat $in)[9];
20273     $most_recent = $mod_time if $mod_time > $most_recent;
20274
20275     # See that the input files have distinct names, to warn someone if they
20276     # are adding a new one
20277     if ($make_list) {
20278         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
20279         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20280         my @directories = File::Spec->splitdir($directories);
20281         construct_filename($file, 'mutable', \@directories);
20282     }
20283 }
20284
20285 # We use 'Makefile' just to see if it has changed since the last time we
20286 # rebuilt.  Now discard it.
20287 @input_files = grep { $_ ne 'Makefile' } @input_files;
20288
20289 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
20290               || ! scalar @mktables_list_output_files  # or if no outputs known
20291               || $old_start_time < $most_recent        # or out-of-date
20292               || $old_options ne $command_line_arguments; # or with different
20293                                                           # options
20294
20295 # Now we check to see if any output files are older than youngest, if
20296 # they are, we need to continue on, otherwise we can presumably bail.
20297 if (! $rebuild) {
20298     foreach my $out (@mktables_list_output_files) {
20299         if ( ! file_exists($out)) {
20300             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20301             $rebuild = 1;
20302             last;
20303          }
20304         #local $to_trace = 1 if main::DEBUG;
20305         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20306         if ( (stat $out)[9] <= $most_recent ) {
20307             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20308             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20309             $rebuild = 1;
20310             last;
20311         }
20312     }
20313 }
20314 if (! $rebuild) {
20315     print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
20316     exit(0);
20317 }
20318 print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20319
20320 # Ready to do the major processing.  First create the perl pseudo-property.
20321 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20322
20323 # Process each input file
20324 foreach my $file (@input_file_objects) {
20325     $file->run;
20326 }
20327
20328 # Finish the table generation.
20329
20330 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20331 finish_Unicode();
20332
20333 # For the very specialized case of comparing two Unicode versions...
20334 if (DEBUG && $compare_versions) {
20335     handle_compare_versions();
20336 }
20337
20338 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20339 compile_perl();
20340
20341 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20342 add_perl_synonyms();
20343
20344 print "Writing tables\n" if $verbosity >= $PROGRESS;
20345 write_all_tables();
20346
20347 # Write mktables.lst
20348 if ( $file_list and $make_list ) {
20349
20350     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20351     foreach my $file (@input_files, @files_actually_output) {
20352         my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20353         my @directories = grep length, File::Spec->splitdir($directories);
20354         $file = join '/', @directories, $basefile;
20355     }
20356
20357     my $ofh;
20358     if (! open $ofh,">",$file_list) {
20359         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
20360         return
20361     }
20362     else {
20363         my $localtime = localtime $start_time;
20364         print $ofh <<"END";
20365 #
20366 # $file_list -- File list for $0.
20367 #
20368 #   Autogenerated starting on $start_time ($localtime)
20369 #   From options $command_line_arguments
20370 #
20371 # - First section is input files
20372 #   ($0 itself is not listed but is automatically considered an input)
20373 # - Section separator is /^=+\$/
20374 # - Second section is a list of output files.
20375 # - Lines matching /^\\s*#/ are treated as comments
20376 #   which along with blank lines are ignored.
20377 #
20378
20379 # Input files:
20380
20381 END
20382         print $ofh "$_\n" for sort(@input_files);
20383         print $ofh "\n=================================\n# Output files:\n\n";
20384         print $ofh "$_\n" for sort @files_actually_output;
20385         print $ofh "\n# ",scalar(@input_files)," input files\n",
20386                 "# ",scalar(@files_actually_output)+1," output files\n\n",
20387                 "# End list\n";
20388         close $ofh
20389             or Carp::my_carp("Failed to close $ofh: $!");
20390
20391         print "Filelist has ",scalar(@input_files)," input files and ",
20392             scalar(@files_actually_output)+1," output files\n"
20393             if $verbosity >= $VERBOSE;
20394     }
20395 }
20396
20397 # Output these warnings unless -q explicitly specified.
20398 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20399     if (@unhandled_properties) {
20400         print "\nProperties and tables that unexpectedly have no code points\n";
20401         foreach my $property (sort @unhandled_properties) {
20402             print $property, "\n";
20403         }
20404     }
20405
20406     if (%potential_files) {
20407         print "\nInput files that are not considered:\n";
20408         foreach my $file (sort keys %potential_files) {
20409             print File::Spec->abs2rel($file), "\n";
20410         }
20411     }
20412     print "\nAll done\n" if $verbosity >= $VERBOSE;
20413 }
20414
20415 if ($version_of_mk_invlist_bounds lt $v_version) {
20416     Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20417                 . " to be checked and possibly updated to Unicode"
20418                 . " $string_version.  Failing tests will be marked TODO");
20419 }
20420
20421 exit(0);
20422
20423 # TRAILING CODE IS USED BY make_property_test_script()
20424 __DATA__
20425
20426 use strict;
20427 use warnings;
20428 no warnings 'experimental::uniprop_wildcards';
20429
20430 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
20431 # constructed by mktables from the tables it generates, so if mktables is
20432 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
20433 # feasible properties; a few aren't currently feasible; see
20434 # is_code_point_usable() in mktables for details.
20435
20436 # Standard test packages are not used because this manipulates SIG_WARN.  It
20437 # exits 0 if every non-skipped test succeeded; -1 if any failed.
20438
20439 my $Tests = 0;
20440 my $Fails = 0;
20441
20442 # loc_tools.pl requires this function to be defined
20443 sub ok($@) {
20444     my ($pass, @msg) = @_;
20445     print "not " unless $pass;
20446     print "ok ";
20447     print ++$Tests;
20448     print " - ", join "", @msg if @msg;
20449     print "\n";
20450 }
20451
20452 sub Expect($$$$) {
20453     my $expected = shift;
20454     my $ord = shift;
20455     my $regex  = shift;
20456     my $warning_type = shift;   # Type of warning message, like 'deprecated'
20457                                 # or empty if none
20458     my $line   = (caller)[2];
20459
20460     # Convert the code point to hex form
20461     my $string = sprintf "\"\\x{%04X}\"", $ord;
20462
20463     my @tests = "";
20464
20465     # The first time through, use all warnings.  If the input should generate
20466     # a warning, add another time through with them turned off
20467     push @tests, "no warnings '$warning_type';" if $warning_type;
20468
20469     foreach my $no_warnings (@tests) {
20470
20471         # Store any warning messages instead of outputting them
20472         local $SIG{__WARN__} = $SIG{__WARN__};
20473         my $warning_message;
20474         $SIG{__WARN__} = sub { $warning_message = $_[0] };
20475
20476         $Tests++;
20477
20478         # A string eval is needed because of the 'no warnings'.
20479         # Assumes no parentheses in the regular expression
20480         my $result = eval "$no_warnings
20481                             my \$RegObj = qr($regex);
20482                             $string =~ \$RegObj ? 1 : 0";
20483         if (not defined $result) {
20484             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20485             $Fails++;
20486         }
20487         elsif ($result ^ $expected) {
20488             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20489             $Fails++;
20490         }
20491         elsif ($warning_message) {
20492             if (! $warning_type || ($warning_type && $no_warnings)) {
20493                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20494                 $Fails++;
20495             }
20496             else {
20497                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20498             }
20499         }
20500         elsif ($warning_type && ! $no_warnings) {
20501             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20502             $Fails++;
20503         }
20504         else {
20505             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20506         }
20507     }
20508     return;
20509 }
20510
20511 sub Error($) {
20512     my $regex  = shift;
20513     $Tests++;
20514     if (eval { 'x' =~ qr/$regex/; 1 }) {
20515         $Fails++;
20516         my $line = (caller)[2];
20517         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20518     }
20519     else {
20520         my $line = (caller)[2];
20521         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20522     }
20523     return;
20524 }
20525
20526 # Break test files (e.g. GCBTest.txt) character that break allowed here
20527 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20528 utf8::upgrade($breakable_utf8);
20529
20530 # Break test files (e.g. GCBTest.txt) character that indicates can't break
20531 # here
20532 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20533 utf8::upgrade($nobreak_utf8);
20534
20535 my $are_ctype_locales_available;
20536 my $utf8_locale;
20537 chdir 't' if -d 't';
20538 eval { require "./loc_tools.pl" };
20539 if (defined &locales_enabled) {
20540     $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20541     if ($are_ctype_locales_available) {
20542         $utf8_locale = &find_utf8_ctype_locale;
20543     }
20544 }
20545
20546 # Eval'd so can run on versions earlier than the property is available in
20547 my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20548 if (! defined $WB_Extend_or_Format_re) {
20549     $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20550 }
20551
20552 sub _test_break($$) {
20553     # Test various break property matches.  The 2nd parameter gives the
20554     # property name.  The input is a line from auxiliary/*Test.txt for the
20555     # given property.  Each such line is a sequence of Unicode (not native)
20556     # code points given by their hex numbers, separated by the two characters
20557     # defined just before this subroutine that indicate that either there can
20558     # or cannot be a break between the adjacent code points.  All these are
20559     # tested.
20560     #
20561     # For the gcb property extra tests are made.  if there isn't a break, that
20562     # means the sequence forms an extended grapheme cluster, which means that
20563     # \X should match the whole thing.  If there is a break, \X should stop
20564     # there.  This is all converted by this routine into a match: $string =~
20565     # /(\X)/, Each \X should match the next cluster; and that is what is
20566     # checked.
20567
20568     my $template = shift;
20569     my $break_type = shift;
20570
20571     my $line   = (caller 1)[2];   # Line number
20572     my $comment = "";
20573
20574     if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20575         $template = $1;
20576         $comment = $2;
20577
20578         # Replace leading spaces with a single one.
20579         $comment =~ s/ ^ \s* / # /x;
20580     }
20581
20582     # The line contains characters above the ASCII range, but in Latin1.  It
20583     # may or may not be in utf8, and if it is, it may or may not know it.  So,
20584     # convert these characters to 8 bits.  If knows is in utf8, simply
20585     # downgrade.
20586     if (utf8::is_utf8($template)) {
20587         utf8::downgrade($template);
20588     } else {
20589
20590         # Otherwise, if it is in utf8, but doesn't know it, the next lines
20591         # convert the two problematic characters to their 8-bit equivalents.
20592         # If it isn't in utf8, they don't harm anything.
20593         use bytes;
20594         $template =~ s/$nobreak_utf8/$nobreak/g;
20595         $template =~ s/$breakable_utf8/$breakable/g;
20596     }
20597
20598     # Perl customizes wb.  So change the official tests accordingly
20599     if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20600
20601         # Split into elements that alternate between code point and
20602         # break/no-break
20603         my @line = split / +/, $template;
20604
20605         # Look at each code point and its following one
20606         for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20607
20608             # The customization only involves changing some breaks to
20609             # non-breaks.
20610             next if $line[$i+1] =~ /$nobreak/;
20611
20612             my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20613             my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20614
20615             # And it only affects adjacent space characters.
20616             next if $lhs !~ /\s/u;
20617
20618             # But, we want to make sure to test spaces followed by a Extend
20619             # or Format.
20620             next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20621
20622             # To test the customization, add some white-space before this to
20623             # create a span.  The $lhs white space may or may not be bound to
20624             # that span, and also with the $rhs.  If the $rhs is a binding
20625             # character, the $lhs is bound to it and not to the span, unless
20626             # $lhs is vertical space.  In all other cases, the $lhs is bound
20627             # to the span.  If the $rhs is white space, it is bound to the
20628             # $lhs
20629             my $bound;
20630             my $span;
20631             if ($rhs =~ /$WB_Extend_or_Format_re/) {
20632                 if ($lhs =~ /\v/) {
20633                     $bound = $breakable;
20634                     $span = $nobreak;
20635                 }
20636                 else {
20637                     $bound = $nobreak;
20638                     $span = $breakable;
20639                 }
20640             }
20641             else {
20642                 $span = $nobreak;
20643                 $bound = $nobreak;
20644             }
20645
20646             splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20647             $i += 4;
20648             $line[$i+1] = $bound;
20649         }
20650         $template = join " ", @line;
20651     }
20652
20653     # The input is just the break/no-break symbols and sequences of Unicode
20654     # code points as hex digits separated by spaces for legibility. e.g.:
20655     # ÷ 0020 × 0308 ÷ 0020 ÷
20656     # Convert to native \x format
20657     $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20658     $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20659                                 # but be sure
20660
20661     # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20662     # appropriate
20663     my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20664     $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20665
20666     my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20667     my $string = eval "\"$display_string\"";
20668
20669     # The remaining massaging of the input is for the \X tests.  Get rid of
20670     # the leading and trailing breakables
20671     $template =~ s/^ \s* $breakable \s* //x;
20672     $template =~ s/ \s* $breakable \s* $ //x;
20673
20674     # Delete no-breaks
20675     $template =~ s/ \s* $nobreak \s* //xg;
20676
20677     # Split the input into segments that are breakable between them.
20678     my @should_display = split /\s*$breakable\s*/, $template;
20679     my @should_match = map { eval "\"$_\"" } @should_display;
20680
20681     # If a string can be represented in both non-ut8 and utf8, test both cases
20682     my $display_upgrade = "";
20683     UPGRADE:
20684     for my $to_upgrade (0 .. 1) {
20685
20686         if ($to_upgrade) {
20687
20688             # If already in utf8, would just be a repeat
20689             next UPGRADE if utf8::is_utf8($string);
20690
20691             utf8::upgrade($string);
20692             $display_upgrade = " (utf8-upgraded)";
20693         }
20694
20695         my @modifiers = qw(a aa d u i);
20696         if ($are_ctype_locales_available) {
20697             push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20698
20699             # The /l modifier has C after it to indicate the locale to try
20700             push @modifiers, "lC";
20701         }
20702
20703         # Test for each of the regex modifiers.
20704         for my $modifier (@modifiers) {
20705             my $display_locale = "";
20706
20707             # For /l, set the locale to what it says to.
20708             if ($modifier =~ / ^ l (.*) /x) {
20709                 my $locale = $1;
20710                 $display_locale = "(locale = $locale)";
20711                 POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
20712                 $modifier = 'l';
20713             }
20714
20715             no warnings qw(locale regexp surrogate);
20716             my $pattern = "(?$modifier:$break_pattern)";
20717
20718             # Actually do the test
20719             my $matched_text;
20720             my $matched = $string =~ qr/$pattern/;
20721             if ($matched) {
20722                 $matched_text = "matched";
20723             }
20724             else {
20725                 $matched_text = "failed to match";
20726                 print "not ";
20727
20728                 if (TODO_FAILING_BREAKS) {
20729                     $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20730                     $comment =~ s/#/# TODO/;
20731                 }
20732             }
20733             print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20734
20735             # Only print the comment on the first use of this line
20736             $comment = "";
20737
20738             # Repeat with the first \B{} in the pattern.  This makes sure the
20739             # code in regexec.c:find_byclass() for \B gets executed
20740             if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20741                 my $B_pattern = "$1$2";
20742                 $matched = $string =~ qr/$B_pattern/;
20743                 print "not " unless $matched;
20744                 $matched_text = ($matched) ? "matched" : "failed to match";
20745                 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20746                 print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20747                 print "\n";
20748             }
20749         }
20750
20751         next if $break_type ne 'gcb';
20752
20753         # Finally, do the \X match.
20754         my @matches = $string =~ /(\X)/g;
20755
20756         # Look through each matched cluster to verify that it matches what we
20757         # expect.
20758         my $min = (@matches < @should_match) ? @matches : @should_match;
20759         for my $i (0 .. $min - 1) {
20760             $Tests++;
20761             if ($matches[$i] eq $should_match[$i]) {
20762                 print "ok $Tests - ";
20763                 if ($i == 0) {
20764                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20765                 } else {
20766                     print "And \\X #", $i + 1,
20767                 }
20768                 print " correctly matched $should_display[$i]; line $line\n";
20769             } else {
20770                 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20771                                                     split "", $matches[$i]);
20772                 print "not ok $Tests -";
20773                 print " # TODO" if TODO_FAILING_BREAKS;
20774                 print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20775                     $i + 1,
20776                     " should have matched $should_display[$i]",
20777                     " but instead matched $matches[$i]",
20778                     ".  Abandoning rest of line $line\n";
20779                 next UPGRADE;
20780             }
20781         }
20782
20783         # And the number of matches should equal the number of expected matches.
20784         $Tests++;
20785         if (@matches == @should_match) {
20786             print "ok $Tests - Nothing was left over; line $line\n";
20787         } else {
20788             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20789             print " # TODO" if TODO_FAILING_BREAKS;
20790             print "\n";
20791         }
20792     }
20793
20794     return;
20795 }
20796
20797 sub Test_GCB($) {
20798     _test_break(shift, 'gcb');
20799 }
20800
20801 sub Test_LB($) {
20802     _test_break(shift, 'lb');
20803 }
20804
20805 sub Test_SB($) {
20806     _test_break(shift, 'sb');
20807 }
20808
20809 sub Test_WB($) {
20810     _test_break(shift, 'wb');
20811 }
20812
20813 sub Finished() {
20814     print "1..$Tests\n";
20815     exit($Fails ? -1 : 0);
20816 }
20817