lib/unicore/mktables

   1 #!/usr/bin/perl -w
   2
   3 # !!!!!!!!!!!!!!       IF YOU MODIFY THIS FILE       !!!!!!!!!!!!!!!!!!!!!!!!!
   4 # Any files created or read by this program should be listed in 'mktables.lst'
   5 # Use -makelist to regenerate it.
   6
   7 # There was an attempt when this was first rewritten to make it 5.8
   8 # compatible, but that has now been abandoned, and newer constructs are used
   9 # as convenient.
  10
  11 # NOTE: this script can run quite slowly in older/slower systems.
  12 # It can also consume a lot of memory (128 MB or more), you may need
  13 # to raise your process resource limits (e.g. in bash, "ulimit -a"
  14 # to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
  15
  16 my $start_time;
  17 BEGIN { # Get the time the script started running; do it at compilation to
  18         # get it as close as possible
  19     $start_time= time;
  20 }
  21
  22 require 5.010_001;
  23 use strict;
  24 use warnings;
  25 use Carp;
  26 use Config;
  27 use File::Find;
  28 use File::Path;
  29 use File::Spec;
  30 use Text::Tabs;
  31 use re "/aa";
  32 use feature 'state';
  33
  34 sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  35 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  36
  37 sub NON_ASCII_PLATFORM { ord("A") != 65 }
  38
  39 # When a new version of Unicode is published, unfortunately the algorithms for
  40 # dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
  41 # manually.  The changes may or may not be backward compatible with older
  42 # releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
  43 # changes, then come back here and set the variable below to what version the
  44 # code is expecting.  If a newer version of Unicode is being compiled than
  45 # expected, a warning will be generated.  If an older version is being
  46 # compiled, any bounds tests that fail in the generated test file (-maketest
  47 # option) will be marked as TODO.
  48 my $version_of_mk_invlist_bounds = v10.0.0;
  49
  50 ##########################################################################
  51 #
  52 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
  53 # from the Unicode database files (lib/unicore/.../*.txt),  It also generates
  54 # a pod file and .t files, depending on option parameters.
  55 #
  56 # The structure of this file is:
  57 #   First these introductory comments; then
  58 #   code needed for everywhere, such as debugging stuff; then
  59 #   code to handle input parameters; then
  60 #   data structures likely to be of external interest (some of which depend on
  61 #       the input parameters, so follows them; then
  62 #   more data structures and subroutine and package (class) definitions; then
  63 #   the small actual loop to process the input files and finish up; then
  64 #   a __DATA__ section, for the .t tests
  65 #
  66 # This program works on all releases of Unicode so far.  The outputs have been
  67 # scrutinized most intently for release 5.1.  The others have been checked for
  68 # somewhat more than just sanity.  It can handle all non-provisional Unicode
  69 # character properties in those releases.
  70 #
  71 # This program is mostly about Unicode character (or code point) properties.
  72 # A property describes some attribute or quality of a code point, like if it
  73 # is lowercase or not, its name, what version of Unicode it was first defined
  74 # in, or what its uppercase equivalent is.  Unicode deals with these disparate
  75 # possibilities by making all properties into mappings from each code point
  76 # into some corresponding value.  In the case of it being lowercase or not,
  77 # the mapping is either to 'Y' or 'N' (or various synonyms thereof).  Each
  78 # property maps each Unicode code point to a single value, called a "property
  79 # value".  (Some more recently defined properties, map a code point to a set
  80 # of values.)
  81 #
  82 # When using a property in a regular expression, what is desired isn't the
  83 # mapping of the code point to its property's value, but the reverse (or the
  84 # mathematical "inverse relation"): starting with the property value, "Does a
  85 # code point map to it?"  These are written in a "compound" form:
  86 # \p{property=value}, e.g., \p{category=punctuation}.  This program generates
  87 # files containing the lists of code points that map to each such regular
  88 # expression property value, one file per list
  89 #
  90 # There is also a single form shortcut that Perl adds for many of the commonly
  91 # used properties.  This happens for all binary properties, plus script,
  92 # general_category, and block properties.
  93 #
  94 # Thus the outputs of this program are files.  There are map files, mostly in
  95 # the 'To' directory; and there are list files for use in regular expression
  96 # matching, all in subdirectories of the 'lib' directory, with each
  97 # subdirectory being named for the property that the lists in it are for.
  98 # Bookkeeping, test, and documentation files are also generated.
  99
 100 my $matches_directory = 'lib';   # Where match (\p{}) files go.
 101 my $map_directory = 'To';        # Where map files go.
 102
 103 # DATA STRUCTURES
 104 #
 105 # The major data structures of this program are Property, of course, but also
 106 # Table.  There are two kinds of tables, very similar to each other.
 107 # "Match_Table" is the data structure giving the list of code points that have
 108 # a particular property value, mentioned above.  There is also a "Map_Table"
 109 # data structure which gives the property's mapping from code point to value.
 110 # There are two structures because the match tables need to be combined in
 111 # various ways, such as constructing unions, intersections, complements, etc.,
 112 # and the map ones don't.  And there would be problems, perhaps subtle, if
 113 # a map table were inadvertently operated on in some of those ways.
 114 # The use of separate classes with operations defined on one but not the other
 115 # prevents accidentally confusing the two.
 116 #
 117 # At the heart of each table's data structure is a "Range_List", which is just
 118 # an ordered list of "Ranges", plus ancillary information, and methods to
 119 # operate on them.  A Range is a compact way to store property information.
 120 # Each range has a starting code point, an ending code point, and a value that
 121 # is meant to apply to all the code points between the two end points,
 122 # inclusive.  For a map table, this value is the property value for those
 123 # code points.  Two such ranges could be written like this:
 124 #   0x41 .. 0x5A, 'Upper',
 125 #   0x61 .. 0x7A, 'Lower'
 126 #
 127 # Each range also has a type used as a convenience to classify the values.
 128 # Most ranges in this program will be Type 0, or normal, but there are some
 129 # ranges that have a non-zero type.  These are used only in map tables, and
 130 # are for mappings that don't fit into the normal scheme of things.  Mappings
 131 # that require a hash entry to communicate with utf8.c are one example;
 132 # another example is mappings for charnames.pm to use which indicate a name
 133 # that is algorithmically determinable from its code point (and the reverse).
 134 # These are used to significantly compact these tables, instead of listing
 135 # each one of the tens of thousands individually.
 136 #
 137 # In a match table, the value of a range is irrelevant (and hence the type as
 138 # well, which will always be 0), and arbitrarily set to the empty string.
 139 # Using the example above, there would be two match tables for those two
 140 # entries, one named Upper would contain the 0x41..0x5A range, and the other
 141 # named Lower would contain 0x61..0x7A.
 142 #
 143 # Actually, there are two types of range lists, "Range_Map" is the one
 144 # associated with map tables, and "Range_List" with match tables.
 145 # Again, this is so that methods can be defined on one and not the others so
 146 # as to prevent operating on them in incorrect ways.
 147 #
 148 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
 149 # in the perl core.  All tables could in theory be written, but some are
 150 # suppressed because there is no current practical use for them.  It is easy
 151 # to change which get written by changing various lists that are near the top
 152 # of the actual code in this file.  The table data structures contain enough
 153 # ancillary information to allow them to be treated as separate entities for
 154 # writing, such as the path to each one's file.  There is a heading in each
 155 # map table that gives the format of its entries, and what the map is for all
 156 # the code points missing from it.  (This allows tables to be more compact.)
 157 #
 158 # The Property data structure contains one or more tables.  All properties
 159 # contain a map table (except the $perl property which is a
 160 # pseudo-property containing only match tables), and any properties that
 161 # are usable in regular expression matches also contain various matching
 162 # tables, one for each value the property can have.  A binary property can
 163 # have two values, True and False (or Y and N, which are preferred by Unicode
 164 # terminology).  Thus each of these properties will have a map table that
 165 # takes every code point and maps it to Y or N (but having ranges cuts the
 166 # number of entries in that table way down), and two match tables, one
 167 # which has a list of all the code points that map to Y, and one for all the
 168 # code points that map to N.  (For each binary property, a third table is also
 169 # generated for the pseudo Perl property.  It contains the identical code
 170 # points as the Y table, but can be written in regular expressions, not in the
 171 # compound form, but in a "single" form like \p{IsUppercase}.)  Many
 172 # properties are binary, but some properties have several possible values,
 173 # some have many, and properties like Name have a different value for every
 174 # named code point.  Those will not, unless the controlling lists are changed,
 175 # have their match tables written out.  But all the ones which can be used in
 176 # regular expression \p{} and \P{} constructs will.  Prior to 5.14, generally
 177 # a property would have either its map table or its match tables written but
 178 # not both.  Again, what gets written is controlled by lists which can easily
 179 # be changed.  Starting in 5.14, advantage was taken of this, and all the map
 180 # tables needed to reconstruct the Unicode db are now written out, while
 181 # suppressing the Unicode .txt files that contain the data.  Our tables are
 182 # much more compact than the .txt files, so a significant space savings was
 183 # achieved.  Also, tables are not written out that are trivially derivable
 184 # from tables that do get written.  So, there typically is no file containing
 185 # the code points not matched by a binary property (the table for \P{} versus
 186 # lowercase \p{}), since you just need to invert the True table to get the
 187 # False table.
 188
 189 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
 190 # how many match tables there are and the content of the maps.  This 'Type' is
 191 # different than a range 'Type', so don't get confused by the two concepts
 192 # having the same name.
 193 #
 194 # For information about the Unicode properties, see Unicode's UAX44 document:
 195
 196 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
 197
 198 # As stated earlier, this program will work on any release of Unicode so far.
 199 # Most obvious problems in earlier data have NOT been corrected except when
 200 # necessary to make Perl or this program work reasonably, and to keep out
 201 # potential security issues.  For example, no folding information was given in
 202 # early releases, so this program substitutes lower case instead, just so that
 203 # a regular expression with the /i option will do something that actually
 204 # gives the right results in many cases.  There are also a couple other
 205 # corrections for version 1.1.5, commented at the point they are made.  As an
 206 # example of corrections that weren't made (but could be) is this statement
 207 # from DerivedAge.txt: "The supplementary private use code points and the
 208 # non-character code points were assigned in version 2.0, but not specifically
 209 # listed in the UCD until versions 3.0 and 3.1 respectively."  (To be precise
 210 # it was 3.0.1 not 3.0.0)  More information on Unicode version glitches is
 211 # further down in these introductory comments.
 212 #
 213 # This program works on all non-provisional properties as of the current
 214 # Unicode release, though the files for some are suppressed for various
 215 # reasons.  You can change which are output by changing lists in this program.
 216 #
 217 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
 218 # loose matchings rules (from Unicode TR18):
 219 #
 220 #    The recommended names for UCD properties and property values are in
 221 #    PropertyAliases.txt [Prop] and PropertyValueAliases.txt
 222 #    [PropValue]. There are both abbreviated names and longer, more
 223 #    descriptive names. It is strongly recommended that both names be
 224 #    recognized, and that loose matching of property names be used,
 225 #    whereby the case distinctions, whitespace, hyphens, and underbar
 226 #    are ignored.
 227 #
 228 # The program still allows Fuzzy to override its determination of if loose
 229 # matching should be used, but it isn't currently used, as it is no longer
 230 # needed; the calculations it makes are good enough.
 231 #
 232 # SUMMARY OF HOW IT WORKS:
 233 #
 234 #   Process arguments
 235 #
 236 #   A list is constructed containing each input file that is to be processed
 237 #
 238 #   Each file on the list is processed in a loop, using the associated handler
 239 #   code for each:
 240 #        The PropertyAliases.txt and PropValueAliases.txt files are processed
 241 #            first.  These files name the properties and property values.
 242 #            Objects are created of all the property and property value names
 243 #            that the rest of the input should expect, including all synonyms.
 244 #        The other input files give mappings from properties to property
 245 #           values.  That is, they list code points and say what the mapping
 246 #           is under the given property.  Some files give the mappings for
 247 #           just one property; and some for many.  This program goes through
 248 #           each file and populates the properties and their map tables from
 249 #           them.  Some properties are listed in more than one file, and
 250 #           Unicode has set up a precedence as to which has priority if there
 251 #           is a conflict.  Thus the order of processing matters, and this
 252 #           program handles the conflict possibility by processing the
 253 #           overriding input files last, so that if necessary they replace
 254 #           earlier values.
 255 #        After this is all done, the program creates the property mappings not
 256 #            furnished by Unicode, but derivable from what it does give.
 257 #        The tables of code points that match each property value in each
 258 #            property that is accessible by regular expressions are created.
 259 #        The Perl-defined properties are created and populated.  Many of these
 260 #            require data determined from the earlier steps
 261 #        Any Perl-defined synonyms are created, and name clashes between Perl
 262 #            and Unicode are reconciled and warned about.
 263 #        All the properties are written to files
 264 #        Any other files are written, and final warnings issued.
 265 #
 266 # For clarity, a number of operators have been overloaded to work on tables:
 267 #   ~ means invert (take all characters not in the set).  The more
 268 #       conventional '!' is not used because of the possibility of confusing
 269 #       it with the actual boolean operation.
 270 #   + means union
 271 #   - means subtraction
 272 #   & means intersection
 273 # The precedence of these is the order listed.  Parentheses should be
 274 # copiously used.  These are not a general scheme.  The operations aren't
 275 # defined for a number of things, deliberately, to avoid getting into trouble.
 276 # Operations are done on references and affect the underlying structures, so
 277 # that the copy constructors for them have been overloaded to not return a new
 278 # clone, but the input object itself.
 279 #
 280 # The bool operator is deliberately not overloaded to avoid confusion with
 281 # "should it mean if the object merely exists, or also is non-empty?".
 282 #
 283 # WHY CERTAIN DESIGN DECISIONS WERE MADE
 284 #
 285 # This program needs to be able to run under miniperl.  Therefore, it uses a
 286 # minimum of other modules, and hence implements some things itself that could
 287 # be gotten from CPAN
 288 #
 289 # This program uses inputs published by the Unicode Consortium.  These can
 290 # change incompatibly between releases without the Perl maintainers realizing
 291 # it.  Therefore this program is now designed to try to flag these.  It looks
 292 # at the directories where the inputs are, and flags any unrecognized files.
 293 # It keeps track of all the properties in the files it handles, and flags any
 294 # that it doesn't know how to handle.  It also flags any input lines that
 295 # don't match the expected syntax, among other checks.
 296 #
 297 # It is also designed so if a new input file matches one of the known
 298 # templates, one hopefully just needs to add it to a list to have it
 299 # processed.
 300 #
 301 # As mentioned earlier, some properties are given in more than one file.  In
 302 # particular, the files in the extracted directory are supposedly just
 303 # reformattings of the others.  But they contain information not easily
 304 # derivable from the other files, including results for Unihan (which isn't
 305 # usually available to this program) and for unassigned code points.  They
 306 # also have historically had errors or been incomplete.  In an attempt to
 307 # create the best possible data, this program thus processes them first to
 308 # glean information missing from the other files; then processes those other
 309 # files to override any errors in the extracted ones.  Much of the design was
 310 # driven by this need to store things and then possibly override them.
 311 #
 312 # It tries to keep fatal errors to a minimum, to generate something usable for
 313 # testing purposes.  It always looks for files that could be inputs, and will
 314 # warn about any that it doesn't know how to handle (the -q option suppresses
 315 # the warning).
 316 #
 317 # Why is there more than one type of range?
 318 #   This simplified things.  There are some very specialized code points that
 319 #   have to be handled specially for output, such as Hangul syllable names.
 320 #   By creating a range type (done late in the development process), it
 321 #   allowed this to be stored with the range, and overridden by other input.
 322 #   Originally these were stored in another data structure, and it became a
 323 #   mess trying to decide if a second file that was for the same property was
 324 #   overriding the earlier one or not.
 325 #
 326 # Why are there two kinds of tables, match and map?
 327 #   (And there is a base class shared by the two as well.)  As stated above,
 328 #   they actually are for different things.  Development proceeded much more
 329 #   smoothly when I (khw) realized the distinction.  Map tables are used to
 330 #   give the property value for every code point (actually every code point
 331 #   that doesn't map to a default value).  Match tables are used for regular
 332 #   expression matches, and are essentially the inverse mapping.  Separating
 333 #   the two allows more specialized methods, and error checks so that one
 334 #   can't just take the intersection of two map tables, for example, as that
 335 #   is nonsensical.
 336 #
 337 # What about 'fate' and 'status'.  The concept of a table's fate was created
 338 #   late when it became clear that something more was needed.  The difference
 339 #   between this and 'status' is unclean, and could be improved if someone
 340 #   wanted to spend the effort.
 341 #
 342 # DEBUGGING
 343 #
 344 # This program is written so it will run under miniperl.  Occasionally changes
 345 # will cause an error where the backtrace doesn't work well under miniperl.
 346 # To diagnose the problem, you can instead run it under regular perl, if you
 347 # have one compiled.
 348 #
 349 # There is a good trace facility.  To enable it, first sub DEBUG must be set
 350 # to return true.  Then a line like
 351 #
 352 # local $to_trace = 1 if main::DEBUG;
 353 #
 354 # can be added to enable tracing in its lexical scope (plus dynamic) or until
 355 # you insert another line:
 356 #
 357 # local $to_trace = 0 if main::DEBUG;
 358 #
 359 # To actually trace, use a line like "trace $a, @b, %c, ...;
 360 #
 361 # Some of the more complex subroutines already have trace statements in them.
 362 # Permanent trace statements should be like:
 363 #
 364 # trace ... if main::DEBUG && $to_trace;
 365 #
 366 # main::stack_trace() will display what its name implies
 367 #
 368 # If there is just one or a few files that you're debugging, you can easily
 369 # cause most everything else to be skipped.  Change the line
 370 #
 371 # my $debug_skip = 0;
 372 #
 373 # to 1, and every file whose object is in @input_file_objects and doesn't have
 374 # a, 'non_skip => 1,' in its constructor will be skipped.  However, skipping
 375 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
 376 #
 377 # To compare the output tables, it may be useful to specify the -annotate
 378 # flag.  (As of this writing, this can't be done on a clean workspace, due to
 379 # requirements in Text::Tabs used in this option; so first run mktables
 380 # without this option.)  This option adds comment lines to each table, one for
 381 # each non-algorithmically named character giving, currently its code point,
 382 # name, and graphic representation if printable (and you have a font that
 383 # knows about it).  This makes it easier to see what the particular code
 384 # points are in each output table.  Non-named code points are annotated with a
 385 # description of their status, and contiguous ones with the same description
 386 # will be output as a range rather than individually.  Algorithmically named
 387 # characters are also output as ranges, except when there are just a few
 388 # contiguous ones.
 389 #
 390 # FUTURE ISSUES
 391 #
 392 # The program would break if Unicode were to change its names so that
 393 # interior white space, underscores, or dashes differences were significant
 394 # within property and property value names.
 395 #
 396 # It might be easier to use the xml versions of the UCD if this program ever
 397 # would need heavy revision, and the ability to handle old versions was not
 398 # required.
 399 #
 400 # There is the potential for name collisions, in that Perl has chosen names
 401 # that Unicode could decide it also likes.  There have been such collisions in
 402 # the past, with mostly Perl deciding to adopt the Unicode definition of the
 403 # name.  However in the 5.2 Unicode beta testing, there were a number of such
 404 # collisions, which were withdrawn before the final release, because of Perl's
 405 # and other's protests.  These all involved new properties which began with
 406 # 'Is'.  Based on the protests, Unicode is unlikely to try that again.  Also,
 407 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
 408 # Unicode document, so they are unlikely to be used by Unicode for another
 409 # purpose.  However, they might try something beginning with 'In', or use any
 410 # of the other Perl-defined properties.  This program will warn you of name
 411 # collisions, and refuse to generate tables with them, but manual intervention
 412 # will be required in this event.  One scheme that could be implemented, if
 413 # necessary, would be to have this program generate another file, or add a
 414 # field to mktables.lst that gives the date of first definition of a property.
 415 # Each new release of Unicode would use that file as a basis for the next
 416 # iteration.  And the Perl synonym addition code could sort based on the age
 417 # of the property, so older properties get priority, and newer ones that clash
 418 # would be refused; hence existing code would not be impacted, and some other
 419 # synonym would have to be used for the new property.  This is ugly, and
 420 # manual intervention would certainly be easier to do in the short run; lets
 421 # hope it never comes to this.
 422 #
 423 # A NOTE ON UNIHAN
 424 #
 425 # This program can generate tables from the Unihan database.  But that DB
 426 # isn't normally available, so it is marked as optional.  Prior to version
 427 # 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
 428 # was split into 8 different files, all beginning with the letters 'Unihan'.
 429 # If you plunk those files down into the directory mktables ($0) is in, this
 430 # program will read them and automatically create tables for the properties
 431 # from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
 432 # plus any you add to the @cjk_properties array and the @cjk_property_values
 433 # array, being sure to add necessary '# @missings' lines to the latter.  For
 434 # Unicode versions earlier than 5.2, most of the Unihan properties are not
 435 # listed at all in PropertyAliases nor PropValueAliases.  This program assumes
 436 # for these early releases that you want the properties that are specified in
 437 # the 5.2 release.
 438 #
 439 # You may need to adjust the entries to suit your purposes.  setup_unihan(),
 440 # and filter_unihan_line() are the functions where this is done.  This program
 441 # already does some adjusting to make the lines look more like the rest of the
 442 # Unicode DB;  You can see what that is in filter_unihan_line()
 443 #
 444 # There is a bug in the 3.2 data file in which some values for the
 445 # kPrimaryNumeric property have commas and an unexpected comment.  A filter
 446 # could be added to correct these; or for a particular installation, the
 447 # Unihan.txt file could be edited to fix them.
 448 #
 449 # HOW TO ADD A FILE TO BE PROCESSED
 450 #
 451 # A new file from Unicode needs to have an object constructed for it in
 452 # @input_file_objects, probably at the end or at the end of the extracted
 453 # ones.  The program should warn you if its name will clash with others on
 454 # restrictive file systems, like DOS.  If so, figure out a better name, and
 455 # add lines to the README.perl file giving that.  If the file is a character
 456 # property, it should be in the format that Unicode has implicitly
 457 # standardized for such files for the more recently introduced ones.
 458 # If so, the Input_file constructor for @input_file_objects can just be the
 459 # file name and release it first appeared in.  If not, then it should be
 460 # possible to construct an each_line_handler() to massage the line into the
 461 # standardized form.
 462 #
 463 # For non-character properties, more code will be needed.  You can look at
 464 # the existing entries for clues.
 465 #
 466 # UNICODE VERSIONS NOTES
 467 #
 468 # The Unicode UCD has had a number of errors in it over the versions.  And
 469 # these remain, by policy, in the standard for that version.  Therefore it is
 470 # risky to correct them, because code may be expecting the error.  So this
 471 # program doesn't generally make changes, unless the error breaks the Perl
 472 # core.  As an example, some versions of 2.1.x Jamo.txt have the wrong value
 473 # for U+1105, which causes real problems for the algorithms for Jamo
 474 # calculations, so it is changed here.
 475 #
 476 # But it isn't so clear cut as to what to do about concepts that are
 477 # introduced in a later release; should they extend back to earlier releases
 478 # where the concept just didn't exist?  It was easier to do this than to not,
 479 # so that's what was done.  For example, the default value for code points not
 480 # in the files for various properties was probably undefined until changed by
 481 # some version.  No_Block for blocks is such an example.  This program will
 482 # assign No_Block even in Unicode versions that didn't have it.  This has the
 483 # benefit that code being written doesn't have to special case earlier
 484 # versions; and the detriment that it doesn't match the Standard precisely for
 485 # the affected versions.
 486 #
 487 # Here are some observations about some of the issues in early versions:
 488 #
 489 # Prior to version 3.0, there were 3 character decompositions.  These are not
 490 # handled by Unicode::Normalize, nor will it compile when presented a version
 491 # that has them.  However, you can trivially get it to compile by simply
 492 # ignoring those decompositions, by changing the croak to a carp.  At the time
 493 # of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
 494 # dist/Unicode-Normalize/mkheader) reads
 495 #
 496 #   croak("Weird Canonical Decomposition of U+$h");
 497 #
 498 # Simply comment it out.  It will compile, but will not know about any three
 499 # character decompositions.
 500
 501 # The number of code points in \p{alpha=True} halved in 2.1.9.  It turns out
 502 # that the reason is that the CJK block starting at 4E00 was removed from
 503 # PropList, and was not put back in until 3.1.0.  The Perl extension (the
 504 # single property name \p{alpha}) has the correct values.  But the compound
 505 # form is simply not generated until 3.1, as it can be argued that prior to
 506 # this release, this was not an official property.  The comments for
 507 # filter_old_style_proplist() give more details.
 508 #
 509 # Unicode introduced the synonym Space for White_Space in 4.1.  Perl has
 510 # always had a \p{Space}.  In release 3.2 only, they are not synonymous.  The
 511 # reason is that 3.2 introduced U+205F=medium math space, which was not
 512 # classed as white space, but Perl figured out that it should have been. 4.0
 513 # reclassified it correctly.
 514 #
 515 # Another change between 3.2 and 4.0 is the CCC property value ATBL.  In 3.2
 516 # this was erroneously a synonym for 202 (it should be 200).  In 4.0, ATB
 517 # became 202, and ATBL was left with no code points, as all the ones that
 518 # mapped to 202 stayed mapped to 202.  Thus if your program used the numeric
 519 # name for the class, it would not have been affected, but if it used the
 520 # mnemonic, it would have been.
 521 #
 522 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1.  Before that, code
 523 # points which eventually came to have this script property value, instead
 524 # mapped to "Unknown".  But in the next release all these code points were
 525 # moved to \p{sc=common} instead.
 526
 527 # The tests furnished  by Unicode for testing WordBreak and SentenceBreak
 528 # generate errors in 5.0 and earlier.
 529 #
 530 # The default for missing code points for BidiClass is complicated.  Starting
 531 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
 532 # tries to do the best it can for earlier releases.  It is done in
 533 # process_PropertyAliases()
 534 #
 535 # In version 2.1.2, the entry in UnicodeData.txt:
 536 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
 537 # should instead be
 538 #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
 539 # Without this change, there are casing problems for this character.
 540 #
 541 # Search for $string_compare_versions to see how to compare changes to
 542 # properties between Unicode versions
 543 #
 544 ##############################################################################
 545
 546 my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
 547                         # and errors
 548 my $MAX_LINE_WIDTH = 78;
 549
 550 # Debugging aid to skip most files so as to not be distracted by them when
 551 # concentrating on the ones being debugged.  Add
 552 # non_skip => 1,
 553 # to the constructor for those files you want processed when you set this.
 554 # Files with a first version number of 0 are special: they are always
 555 # processed regardless of the state of this flag.  Generally, Jamo.txt and
 556 # UnicodeData.txt must not be skipped if you want this program to not die
 557 # before normal completion.
 558 my $debug_skip = 0;
 559
 560
 561 # Normally these are suppressed.
 562 my $write_Unicode_deprecated_tables = 0;
 563
 564 # Set to 1 to enable tracing.
 565 our $to_trace = 0;
 566
 567 { # Closure for trace: debugging aid
 568     my $print_caller = 1;        # ? Include calling subroutine name
 569     my $main_with_colon = 'main::';
 570     my $main_colon_length = length($main_with_colon);
 571
 572     sub trace {
 573         return unless $to_trace;        # Do nothing if global flag not set
 574
 575         my @input = @_;
 576
 577         local $DB::trace = 0;
 578         $DB::trace = 0;          # Quiet 'used only once' message
 579
 580         my $line_number;
 581
 582         # Loop looking up the stack to get the first non-trace caller
 583         my $caller_line;
 584         my $caller_name;
 585         my $i = 0;
 586         do {
 587             $line_number = $caller_line;
 588             (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
 589             $caller = $main_with_colon unless defined $caller;
 590
 591             $caller_name = $caller;
 592
 593             # get rid of pkg
 594             $caller_name =~ s/.*:://;
 595             if (substr($caller_name, 0, $main_colon_length)
 596                 eq $main_with_colon)
 597             {
 598                 $caller_name = substr($caller_name, $main_colon_length);
 599             }
 600
 601         } until ($caller_name ne 'trace');
 602
 603         # If the stack was empty, we were called from the top level
 604         $caller_name = 'main' if ($caller_name eq ""
 605                                     || $caller_name eq 'trace');
 606
 607         my $output = "";
 608         #print STDERR __LINE__, ": ", join ", ", @input, "\n";
 609         foreach my $string (@input) {
 610             if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
 611                 $output .= simple_dumper($string);
 612             }
 613             else {
 614                 $string = "$string" if ref $string;
 615                 $string = $UNDEF unless defined $string;
 616                 chomp $string;
 617                 $string = '""' if $string eq "";
 618                 $output .= " " if $output ne ""
 619                                 && $string ne ""
 620                                 && substr($output, -1, 1) ne " "
 621                                 && substr($string, 0, 1) ne " ";
 622                 $output .= $string;
 623             }
 624         }
 625
 626         print STDERR sprintf "%4d: ", $line_number if defined $line_number;
 627         print STDERR "$caller_name: " if $print_caller;
 628         print STDERR $output, "\n";
 629         return;
 630     }
 631 }
 632
 633 sub stack_trace() {
 634     local $to_trace = 1 if main::DEBUG;
 635     my $line = (caller(0))[2];
 636     my $i = 1;
 637
 638     # Accumulate the stack trace
 639     while (1) {
 640         my ($pkg, $file, $caller_line, $caller) = caller $i++;
 641
 642         last unless defined $caller;
 643
 644         trace "called from $caller() at line $line";
 645         $line = $caller_line;
 646     }
 647 }
 648
 649 # This is for a rarely used development feature that allows you to compare two
 650 # versions of the Unicode standard without having to deal with changes caused
 651 # by the code points introduced in the later version.  You probably also want
 652 # to use the -annotate option when using this.  Run this program on a unicore
 653 # containing the starting release you want to compare.  Save that output
 654 # structure.  Then, switching to a unicore with the ending release, change the
 655 # 0 in the $string_compare_versions definition just below to a string
 656 # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
 657 # to the starting release.  This program will then compile, but throw away all
 658 # code points introduced after the starting release.  Finally use a diff tool
 659 # to compare the two directory structures.  They include only the code points
 660 # common to both releases, and you can see the changes caused just by the
 661 # underlying release semantic changes.  For versions earlier than 3.2, you
 662 # must copy a version of DAge.txt into the directory.
 663 my $string_compare_versions = DEBUG && ""; #  e.g., "2.1";
 664 my $compare_versions = DEBUG
 665                        && $string_compare_versions
 666                        && pack "C*", split /\./, $string_compare_versions;
 667
 668 sub uniques {
 669     # Returns non-duplicated input values.  From "Perl Best Practices:
 670     # Encapsulated Cleverness".  p. 455 in first edition.
 671
 672     my %seen;
 673     # Arguably this breaks encapsulation, if the goal is to permit multiple
 674     # distinct objects to stringify to the same value, and be interchangeable.
 675     # However, for this program, no two objects stringify identically, and all
 676     # lists passed to this function are either objects or strings. So this
 677     # doesn't affect correctness, but it does give a couple of percent speedup.
 678     no overloading;
 679     return grep { ! $seen{$_}++ } @_;
 680 }
 681
 682 $0 = File::Spec->canonpath($0);
 683
 684 my $make_test_script = 0;      # ? Should we output a test script
 685 my $make_norm_test_script = 0; # ? Should we output a normalization test script
 686 my $write_unchanged_files = 0; # ? Should we update the output files even if
 687                                #    we don't think they have changed
 688 my $use_directory = "";        # ? Should we chdir somewhere.
 689 my $pod_directory;             # input directory to store the pod file.
 690 my $pod_file = 'perluniprops';
 691 my $t_path;                     # Path to the .t test file
 692 my $file_list = 'mktables.lst'; # File to store input and output file names.
 693                                # This is used to speed up the build, by not
 694                                # executing the main body of the program if
 695                                # nothing on the list has changed since the
 696                                # previous build
 697 my $make_list = 1;             # ? Should we write $file_list.  Set to always
 698                                # make a list so that when the pumpking is
 699                                # preparing a release, s/he won't have to do
 700                                # special things
 701 my $glob_list = 0;             # ? Should we try to include unknown .txt files
 702                                # in the input.
 703 my $output_range_counts = $debugging_build;   # ? Should we include the number
 704                                               # of code points in ranges in
 705                                               # the output
 706 my $annotate = 0;              # ? Should character names be in the output
 707
 708 # Verbosity levels; 0 is quiet
 709 my $NORMAL_VERBOSITY = 1;
 710 my $PROGRESS = 2;
 711 my $VERBOSE = 3;
 712
 713 my $verbosity = $NORMAL_VERBOSITY;
 714
 715 # Stored in mktables.lst so that if this program is called with different
 716 # options, will regenerate even if the files otherwise look like they're
 717 # up-to-date.
 718 my $command_line_arguments = join " ", @ARGV;
 719
 720 # Process arguments
 721 while (@ARGV) {
 722     my $arg = shift @ARGV;
 723     if ($arg eq '-v') {
 724         $verbosity = $VERBOSE;
 725     }
 726     elsif ($arg eq '-p') {
 727         $verbosity = $PROGRESS;
 728         $| = 1;     # Flush buffers as we go.
 729     }
 730     elsif ($arg eq '-q') {
 731         $verbosity = 0;
 732     }
 733     elsif ($arg eq '-w') {
 734         # update the files even if they haven't changed
 735         $write_unchanged_files = 1;
 736     }
 737     elsif ($arg eq '-check') {
 738         my $this = shift @ARGV;
 739         my $ok = shift @ARGV;
 740         if ($this ne $ok) {
 741             print "Skipping as check params are not the same.\n";
 742             exit(0);
 743         }
 744     }
 745     elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
 746         -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
 747     }
 748     elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
 749     {
 750         $make_test_script = 1;
 751     }
 752     elsif ($arg eq '-makenormtest')
 753     {
 754         $make_norm_test_script = 1;
 755     }
 756     elsif ($arg eq '-makelist') {
 757         $make_list = 1;
 758     }
 759     elsif ($arg eq '-C' && defined ($use_directory = shift)) {
 760         -d $use_directory or croak "Unknown directory '$use_directory'";
 761     }
 762     elsif ($arg eq '-L') {
 763
 764         # Existence not tested until have chdir'd
 765         $file_list = shift;
 766     }
 767     elsif ($arg eq '-globlist') {
 768         $glob_list = 1;
 769     }
 770     elsif ($arg eq '-c') {
 771         $output_range_counts = ! $output_range_counts
 772     }
 773     elsif ($arg eq '-annotate') {
 774         $annotate = 1;
 775         $debugging_build = 1;
 776         $output_range_counts = 1;
 777     }
 778     else {
 779         my $with_c = 'with';
 780         $with_c .= 'out' if $output_range_counts;   # Complements the state
 781         croak <<END;
 782 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
 783           [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
 784           [-check A B ]
 785   -c          : Output comments $with_c number of code points in ranges
 786   -q          : Quiet Mode: Only output serious warnings.
 787   -p          : Set verbosity level to normal plus show progress.
 788   -v          : Set Verbosity level high:  Show progress and non-serious
 789                 warnings
 790   -w          : Write files regardless
 791   -C dir      : Change to this directory before proceeding. All relative paths
 792                 except those specified by the -P and -T options will be done
 793                 with respect to this directory.
 794   -P dir      : Output $pod_file file to directory 'dir'.
 795   -T path     : Create a test script as 'path'; overrides -maketest
 796   -L filelist : Use alternate 'filelist' instead of standard one
 797   -globlist   : Take as input all non-Test *.txt files in current and sub
 798                 directories
 799   -maketest   : Make test script 'TestProp.pl' in current (or -C directory),
 800                 overrides -T
 801   -makelist   : Rewrite the file list $file_list based on current setup
 802   -annotate   : Output an annotation for each character in the table files;
 803                 useful for debugging mktables, looking at diffs; but is slow
 804                 and memory intensive
 805   -check A B  : Executes $0 only if A and B are the same
 806 END
 807     }
 808 }
 809
 810 # Stores the most-recently changed file.  If none have changed, can skip the
 811 # build
 812 my $most_recent = (stat $0)[9];   # Do this before the chdir!
 813
 814 # Change directories now, because need to read 'version' early.
 815 if ($use_directory) {
 816     if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
 817         $pod_directory = File::Spec->rel2abs($pod_directory);
 818     }
 819     if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
 820         $t_path = File::Spec->rel2abs($t_path);
 821     }
 822     chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
 823     if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
 824         $pod_directory = File::Spec->abs2rel($pod_directory);
 825     }
 826     if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
 827         $t_path = File::Spec->abs2rel($t_path);
 828     }
 829 }
 830
 831 # Get Unicode version into regular and v-string.  This is done now because
 832 # various tables below get populated based on it.  These tables are populated
 833 # here to be near the top of the file, and so easily seeable by those needing
 834 # to modify things.
 835 open my $VERSION, "<", "version"
 836                     or croak "$0: can't open required file 'version': $!\n";
 837 my $string_version = <$VERSION>;
 838 close $VERSION;
 839 chomp $string_version;
 840 my $v_version = pack "C*", split /\./, $string_version;        # v string
 841
 842 my $unicode_version = ($compare_versions)
 843                       ? (  "$string_compare_versions (using "
 844                          . "$string_version rules)")
 845                       : $string_version;
 846
 847 # The following are the complete names of properties with property values that
 848 # are known to not match any code points in some versions of Unicode, but that
 849 # may change in the future so they should be matchable, hence an empty file is
 850 # generated for them.
 851 my @tables_that_may_be_empty;
 852 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
 853                                                     if $v_version lt v6.3.0;
 854 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
 855 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
 856 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
 857                                                     if $v_version ge v4.1.0;
 858 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
 859                                                     if $v_version ge v6.0.0;
 860 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
 861                                                     if $v_version ge v6.1.0;
 862 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
 863                                                     if $v_version ge v6.2.0;
 864
 865 # The lists below are hashes, so the key is the item in the list, and the
 866 # value is the reason why it is in the list.  This makes generation of
 867 # documentation easier.
 868
 869 my %why_suppressed;  # No file generated for these.
 870
 871 # Files aren't generated for empty extraneous properties.  This is arguable.
 872 # Extraneous properties generally come about because a property is no longer
 873 # used in a newer version of Unicode.  If we generated a file without code
 874 # points, programs that used to work on that property will still execute
 875 # without errors.  It just won't ever match (or will always match, with \P{}).
 876 # This means that the logic is now likely wrong.  I (khw) think its better to
 877 # find this out by getting an error message.  Just move them to the table
 878 # above to change this behavior
 879 my %why_suppress_if_empty_warn_if_not = (
 880
 881    # It is the only property that has ever officially been removed from the
 882    # Standard.  The database never contained any code points for it.
 883    'Special_Case_Condition' => 'Obsolete',
 884
 885    # Apparently never official, but there were code points in some versions of
 886    # old-style PropList.txt
 887    'Non_Break' => 'Obsolete',
 888 );
 889
 890 # These would normally go in the warn table just above, but they were changed
 891 # a long time before this program was written, so warnings about them are
 892 # moot.
 893 if ($v_version gt v3.2.0) {
 894     push @tables_that_may_be_empty,
 895                                 'Canonical_Combining_Class=Attached_Below_Left'
 896 }
 897
 898 # Enum values for to_output_map() method in the Map_Table package. (0 is don't
 899 # output)
 900 my $EXTERNAL_MAP = 1;
 901 my $INTERNAL_MAP = 2;
 902 my $OUTPUT_ADJUSTED = 3;
 903
 904 # To override computed values for writing the map tables for these properties.
 905 # The default for enum map tables is to write them out, so that the Unicode
 906 # .txt files can be removed, but all the data to compute any property value
 907 # for any code point is available in a more compact form.
 908 my %global_to_output_map = (
 909     # Needed by UCD.pm, but don't want to publicize that it exists, so won't
 910     # get stuck supporting it if things change.  Since it is a STRING
 911     # property, it normally would be listed in the pod, but INTERNAL_MAP
 912     # suppresses that.
 913     Unicode_1_Name => $INTERNAL_MAP,
 914
 915     Present_In => 0,                # Suppress, as easily computed from Age
 916     Block => (NON_ASCII_PLATFORM) ? 1 : 0,  # Suppress, as Blocks.txt is
 917                                             # retained, but needed for
 918                                             # non-ASCII
 919
 920     # Suppress, as mapping can be found instead from the
 921     # Perl_Decomposition_Mapping file
 922     Decomposition_Type => 0,
 923 );
 924
 925 # There are several types of obsolete properties defined by Unicode.  These
 926 # must be hand-edited for every new Unicode release.
 927 my %why_deprecated;  # Generates a deprecated warning message if used.
 928 my %why_stabilized;  # Documentation only
 929 my %why_obsolete;    # Documentation only
 930
 931 {   # Closure
 932     my $simple = 'Perl uses the more complete version';
 933     my $unihan = 'Unihan properties are by default not enabled in the Perl core.  Instead use CPAN: Unicode::Unihan';
 934
 935     my $other_properties = 'other properties';
 936     my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
 937     my $why_no_expand  = "Deprecated by Unicode.  These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used.  For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
 938
 939     %why_deprecated = (
 940         'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
 941         'Jamo_Short_Name' => $contributory,
 942         'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
 943         'Other_Alphabetic' => $contributory,
 944         'Other_Default_Ignorable_Code_Point' => $contributory,
 945         'Other_Grapheme_Extend' => $contributory,
 946         'Other_ID_Continue' => $contributory,
 947         'Other_ID_Start' => $contributory,
 948         'Other_Lowercase' => $contributory,
 949         'Other_Math' => $contributory,
 950         'Other_Uppercase' => $contributory,
 951         'Expands_On_NFC' => $why_no_expand,
 952         'Expands_On_NFD' => $why_no_expand,
 953         'Expands_On_NFKC' => $why_no_expand,
 954         'Expands_On_NFKD' => $why_no_expand,
 955     );
 956
 957     %why_suppressed = (
 958         # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
 959         # contains the same information, but without the algorithmically
 960         # determinable Hangul syllables'.  This file is not published, so it's
 961         # existence is not noted in the comment.
 962         'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
 963
 964         # Don't suppress ISO_Comment, as otherwise special handling is needed
 965         # to differentiate between it and gc=c, which can be written as 'isc',
 966         # which is the same characters as ISO_Comment's short name.
 967
 968         'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
 969
 970         'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
 971         'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
 972         'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
 973         'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
 974
 975         FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
 976     );
 977
 978     foreach my $property (
 979
 980             # The following are suppressed because they were made contributory
 981             # or deprecated by Unicode before Perl ever thought about
 982             # supporting them.
 983             'Jamo_Short_Name',
 984             'Grapheme_Link',
 985             'Expands_On_NFC',
 986             'Expands_On_NFD',
 987             'Expands_On_NFKC',
 988             'Expands_On_NFKD',
 989
 990             # The following are suppressed because they have been marked
 991             # as deprecated for a sufficient amount of time
 992             'Other_Alphabetic',
 993             'Other_Default_Ignorable_Code_Point',
 994             'Other_Grapheme_Extend',
 995             'Other_ID_Continue',
 996             'Other_ID_Start',
 997             'Other_Lowercase',
 998             'Other_Math',
 999             'Other_Uppercase',
1000     ) {
1001         $why_suppressed{$property} = $why_deprecated{$property};
1002     }
1003
1004     # Customize the message for all the 'Other_' properties
1005     foreach my $property (keys %why_deprecated) {
1006         next if (my $main_property = $property) !~ s/^Other_//;
1007         $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1008     }
1009 }
1010
1011 if ($write_Unicode_deprecated_tables) {
1012     foreach my $property (keys %why_suppressed) {
1013         delete $why_suppressed{$property} if $property =~
1014                                                     / ^ Other | Grapheme /x;
1015     }
1016 }
1017
1018 if ($v_version ge 4.0.0) {
1019     $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1020     if ($v_version ge 6.0.0) {
1021         $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1022     }
1023 }
1024 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1025     $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1026     if ($v_version ge 6.0.0) {
1027         $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1028     }
1029 }
1030
1031 # Probably obsolete forever
1032 if ($v_version ge v4.1.0) {
1033     $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete.  All code points previously matched by this have been moved to "Script=Common".';
1034 }
1035 if ($v_version ge v6.0.0) {
1036     $why_suppressed{'Script=Katakana_Or_Hiragana'} .= '  Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1037     $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1038 }
1039
1040 # This program can create files for enumerated-like properties, such as
1041 # 'Numeric_Type'.  This file would be the same format as for a string
1042 # property, with a mapping from code point to its value, so you could look up,
1043 # for example, the script a code point is in.  But no one so far wants this
1044 # mapping, or they have found another way to get it since this is a new
1045 # feature.  So no file is generated except if it is in this list.
1046 my @output_mapped_properties = split "\n", <<END;
1047 END
1048
1049 # If you want more Unihan properties than the default, you need to add them to
1050 # these arrays.  Depending on the property type, @missing lines might have to
1051 # be added to the second array.  A sample entry would be (including the '#'):
1052 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1053 my @cjk_properties = split "\n", <<'END';
1054 END
1055 my @cjk_property_values = split "\n", <<'END';
1056 END
1057
1058 # The input files don't list every code point.  Those not listed are to be
1059 # defaulted to some value.  Below are hard-coded what those values are for
1060 # non-binary properties as of 5.1.  Starting in 5.0, there are
1061 # machine-parsable comment lines in the files that give the defaults; so this
1062 # list shouldn't have to be extended.  The claim is that all missing entries
1063 # for binary properties will default to 'N'.  Unicode tried to change that in
1064 # 5.2, but the beta period produced enough protest that they backed off.
1065 #
1066 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1067 # be in the form that it expects.  The others may be synonyms.
1068 my $CODE_POINT = '<code point>';
1069 my %default_mapping = (
1070     Age => "Unassigned",
1071     # Bidi_Class => Complicated; set in code
1072     Bidi_Mirroring_Glyph => "",
1073     Block => 'No_Block',
1074     Canonical_Combining_Class => 0,
1075     Case_Folding => $CODE_POINT,
1076     Decomposition_Mapping => $CODE_POINT,
1077     Decomposition_Type => 'None',
1078     East_Asian_Width => "Neutral",
1079     FC_NFKC_Closure => $CODE_POINT,
1080     General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1081     Grapheme_Cluster_Break => 'Other',
1082     Hangul_Syllable_Type => 'NA',
1083     ISO_Comment => "",
1084     Jamo_Short_Name => "",
1085     Joining_Group => "No_Joining_Group",
1086     # Joining_Type => Complicated; set in code
1087     kIICore => 'N',   #                       Is converted to binary
1088     #Line_Break => Complicated; set in code
1089     Lowercase_Mapping => $CODE_POINT,
1090     Name => "",
1091     Name_Alias => "",
1092     NFC_QC => 'Yes',
1093     NFD_QC => 'Yes',
1094     NFKC_QC => 'Yes',
1095     NFKD_QC => 'Yes',
1096     Numeric_Type => 'None',
1097     Numeric_Value => 'NaN',
1098     Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1099     Sentence_Break => 'Other',
1100     Simple_Case_Folding => $CODE_POINT,
1101     Simple_Lowercase_Mapping => $CODE_POINT,
1102     Simple_Titlecase_Mapping => $CODE_POINT,
1103     Simple_Uppercase_Mapping => $CODE_POINT,
1104     Titlecase_Mapping => $CODE_POINT,
1105     Unicode_1_Name => "",
1106     Unicode_Radical_Stroke => "",
1107     Uppercase_Mapping => $CODE_POINT,
1108     Word_Break => 'Other',
1109 );
1110
1111 ### End of externally interesting definitions, except for @input_file_objects
1112
1113 my $HEADER=<<"EOF";
1114 # !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
1115 # This file is machine-generated by $0 from the Unicode
1116 # database, Version $unicode_version.  Any changes made here will be lost!
1117 EOF
1118
1119 my $INTERNAL_ONLY_HEADER = <<"EOF";
1120
1121 # !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
1122 # This file is for internal use by core Perl only.  The format and even the
1123 # name or existence of this file are subject to change without notice.  Don't
1124 # use it directly.  Use Unicode::UCD to access the Unicode character data
1125 # base.
1126 EOF
1127
1128 my $DEVELOPMENT_ONLY=<<"EOF";
1129 # !!!!!!!   DEVELOPMENT USE ONLY   !!!!!!!
1130 # This file contains information artificially constrained to code points
1131 # present in Unicode release $string_compare_versions.
1132 # IT CANNOT BE RELIED ON.  It is for use during development only and should
1133 # not be used for production.
1134
1135 EOF
1136
1137 my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1138                                    ? "10FFFF"
1139                                    : "FFFF";
1140 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1141 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1142
1143 # We work with above-Unicode code points, up to IV_MAX, but we may want to use
1144 # sentinels above that number.  Therefore for internal use, we use a much
1145 # smaller number, translating it to IV_MAX only for output.  The exact number
1146 # is immaterial (all above-Unicode code points are treated exactly the same),
1147 # but the algorithm requires it to be at least
1148 # 2 * $MAX_UNICODE_CODEPOINTS + 1
1149 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1150 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1151 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1152
1153 my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1154
1155 # Matches legal code point.  4-6 hex numbers, If there are 6, the first
1156 # two must be 10; if there are 5, the first must not be a 0.  Written this way
1157 # to decrease backtracking.  The first regex allows the code point to be at
1158 # the end of a word, but to work properly, the word shouldn't end with a valid
1159 # hex character.  The second one won't match a code point at the end of a
1160 # word, and doesn't have the run-on issue
1161 my $run_on_code_point_re =
1162             qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1163 my $code_point_re = qr/\b$run_on_code_point_re/;
1164
1165 # This matches the beginning of the line in the Unicode DB files that give the
1166 # defaults for code points not listed (i.e., missing) in the file.  The code
1167 # depends on this ending with a semi-colon, so it can assume it is a valid
1168 # field when the line is split() by semi-colons
1169 my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1170
1171 # Property types.  Unicode has more types, but these are sufficient for our
1172 # purposes.
1173 my $UNKNOWN = -1;   # initialized to illegal value
1174 my $NON_STRING = 1; # Either binary or enum
1175 my $BINARY = 2;
1176 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1177                        # tables, additional true and false tables are
1178                        # generated so that false is anything matching the
1179                        # default value, and true is everything else.
1180 my $ENUM = 4;       # Include catalog
1181 my $STRING = 5;     # Anything else: string or misc
1182
1183 # Some input files have lines that give default values for code points not
1184 # contained in the file.  Sometimes these should be ignored.
1185 my $NO_DEFAULTS = 0;        # Must evaluate to false
1186 my $NOT_IGNORED = 1;
1187 my $IGNORED = 2;
1188
1189 # Range types.  Each range has a type.  Most ranges are type 0, for normal,
1190 # and will appear in the main body of the tables in the output files, but
1191 # there are other types of ranges as well, listed below, that are specially
1192 # handled.   There are pseudo-types as well that will never be stored as a
1193 # type, but will affect the calculation of the type.
1194
1195 # 0 is for normal, non-specials
1196 my $MULTI_CP = 1;           # Sequence of more than code point
1197 my $HANGUL_SYLLABLE = 2;
1198 my $CP_IN_NAME = 3;         # The NAME contains the code point appended to it.
1199 my $NULL = 4;               # The map is to the null string; utf8.c can't
1200                             # handle these, nor is there an accepted syntax
1201                             # for them in \p{} constructs
1202 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1203                              # otherwise be $MULTI_CP type are instead type 0
1204
1205 # process_generic_property_file() can accept certain overrides in its input.
1206 # Each of these must begin AND end with $CMD_DELIM.
1207 my $CMD_DELIM = "\a";
1208 my $REPLACE_CMD = 'replace';    # Override the Replace
1209 my $MAP_TYPE_CMD = 'map_type';  # Override the Type
1210
1211 my $NO = 0;
1212 my $YES = 1;
1213
1214 # Values for the Replace argument to add_range.
1215 # $NO                      # Don't replace; add only the code points not
1216                            # already present.
1217 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1218                            # the comments at the subroutine definition.
1219 my $UNCONDITIONALLY = 2;   # Replace without conditions.
1220 my $MULTIPLE_BEFORE = 4;   # Don't replace, but add a duplicate record if
1221                            # already there
1222 my $MULTIPLE_AFTER = 5;    # Don't replace, but add a duplicate record if
1223                            # already there
1224 my $CROAK = 6;             # Die with an error if is already there
1225
1226 # Flags to give property statuses.  The phrases are to remind maintainers that
1227 # if the flag is changed, the indefinite article referring to it in the
1228 # documentation may need to be as well.
1229 my $NORMAL = "";
1230 my $DEPRECATED = 'D';
1231 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1232 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1233 my $DISCOURAGED = 'X';
1234 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1235 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1236 my $STRICTER = 'T';
1237 my $a_bold_stricter = "a 'B<$STRICTER>'";
1238 my $A_bold_stricter = "A 'B<$STRICTER>'";
1239 my $STABILIZED = 'S';
1240 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1241 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1242 my $OBSOLETE = 'O';
1243 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1244 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1245
1246 # Aliases can also have an extra status:
1247 my $INTERNAL_ALIAS = 'P';
1248
1249 my %status_past_participles = (
1250     $DISCOURAGED => 'discouraged',
1251     $STABILIZED => 'stabilized',
1252     $OBSOLETE => 'obsolete',
1253     $DEPRECATED => 'deprecated',
1254     $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1255 );
1256
1257 # Table fates.  These are somewhat ordered, so that fates < $MAP_PROXIED should be
1258 # externally documented.
1259 my $ORDINARY = 0;       # The normal fate.
1260 my $MAP_PROXIED = 1;    # The map table for the property isn't written out,
1261                         # but there is a file written that can be used to
1262                         # reconstruct this table
1263 my $INTERNAL_ONLY = 2;  # The file for this table is written out, but it is
1264                         # for Perl's internal use only
1265 my $LEGACY_ONLY = 3;    # Like $INTERNAL_ONLY, but not actually used by Perl.
1266                         # Is for backwards compatibility for applications that
1267                         # read the file directly, so it's format is
1268                         # unchangeable.
1269 my $SUPPRESSED = 4;     # The file for this table is not written out, and as a
1270                         # result, we don't bother to do many computations on
1271                         # it.
1272 my $PLACEHOLDER = 5;    # Like $SUPPRESSED, but we go through all the
1273                         # computations anyway, as the values are needed for
1274                         # things to work.  This happens when we have Perl
1275                         # extensions that depend on Unicode tables that
1276                         # wouldn't normally be in a given Unicode version.
1277
1278 # The format of the values of the tables:
1279 my $EMPTY_FORMAT = "";
1280 my $BINARY_FORMAT = 'b';
1281 my $DECIMAL_FORMAT = 'd';
1282 my $FLOAT_FORMAT = 'f';
1283 my $INTEGER_FORMAT = 'i';
1284 my $HEX_FORMAT = 'x';
1285 my $RATIONAL_FORMAT = 'r';
1286 my $STRING_FORMAT = 's';
1287 my $ADJUST_FORMAT = 'a';
1288 my $HEX_ADJUST_FORMAT = 'ax';
1289 my $DECOMP_STRING_FORMAT = 'c';
1290 my $STRING_WHITE_SPACE_LIST = 'sw';
1291
1292 my %map_table_formats = (
1293     $BINARY_FORMAT => 'binary',
1294     $DECIMAL_FORMAT => 'single decimal digit',
1295     $FLOAT_FORMAT => 'floating point number',
1296     $INTEGER_FORMAT => 'integer',
1297     $HEX_FORMAT => 'non-negative hex whole number; a code point',
1298     $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1299     $STRING_FORMAT => 'string',
1300     $ADJUST_FORMAT => 'some entries need adjustment',
1301     $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1302     $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1303     $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1304 );
1305
1306 # Unicode didn't put such derived files in a separate directory at first.
1307 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1308 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1309 my $AUXILIARY = 'auxiliary';
1310
1311 # Hashes and arrays that will eventually go into Heavy.pl for the use of
1312 # utf8_heavy.pl and into UCD.pl for the use of UCD.pm
1313 my %loose_to_file_of;       # loosely maps table names to their respective
1314                             # files
1315 my %stricter_to_file_of;    # same; but for stricter mapping.
1316 my %loose_property_to_file_of; # Maps a loose property name to its map file
1317 my %strict_property_to_file_of; # Same, but strict
1318 my @inline_definitions = "V0"; # Each element gives a definition of a unique
1319                             # inversion list.  When a definition is inlined,
1320                             # its value in the hash it's in (one of the two
1321                             # defined just above) will include an index into
1322                             # this array.  The 0th element is initialized to
1323                             # the definition for a zero length inversion list
1324 my %file_to_swash_name;     # Maps the file name to its corresponding key name
1325                             # in the hash %utf8::SwashInfo
1326 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1327                              # their rational equivalent
1328 my %loose_property_name_of; # Loosely maps (non_string) property names to
1329                             # standard form
1330 my %strict_property_name_of; # Strictly maps (non_string) property names to
1331                             # standard form
1332 my %string_property_loose_to_name; # Same, for string properties.
1333 my %loose_defaults;         # keys are of form "prop=value", where 'prop' is
1334                             # the property name in standard loose form, and
1335                             # 'value' is the default value for that property,
1336                             # also in standard loose form.
1337 my %loose_to_standard_value; # loosely maps table names to the canonical
1338                             # alias for them
1339 my %ambiguous_names;        # keys are alias names (in standard form) that
1340                             # have more than one possible meaning.
1341 my %combination_property;   # keys are alias names (in standard form) that
1342                             # have both a map table, and a binary one that
1343                             # yields true for all non-null maps.
1344 my %prop_aliases;           # Keys are standard property name; values are each
1345                             # one's aliases
1346 my %prop_value_aliases;     # Keys of top level are standard property name;
1347                             # values are keys to another hash,  Each one is
1348                             # one of the property's values, in standard form.
1349                             # The values are that prop-val's aliases.
1350 my %skipped_files;          # List of files that we skip
1351 my %ucd_pod;    # Holds entries that will go into the UCD section of the pod
1352
1353 # Most properties are immune to caseless matching, otherwise you would get
1354 # nonsensical results, as properties are a function of a code point, not
1355 # everything that is caselessly equivalent to that code point.  For example,
1356 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1357 # be true because 's' and 'S' are equivalent caselessly.  However,
1358 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1359 # extend that concept to those very few properties that are like this.  Each
1360 # such property will match the full range caselessly.  They are hard-coded in
1361 # the program; it's not worth trying to make it general as it's extremely
1362 # unlikely that they will ever change.
1363 my %caseless_equivalent_to;
1364
1365 # This is the range of characters that were in Release 1 of Unicode, and
1366 # removed in Release 2 (replaced with the current Hangul syllables starting at
1367 # U+AC00).  The range was reused starting in Release 3 for other purposes.
1368 my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1369 my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1370
1371 # These constants names and values were taken from the Unicode standard,
1372 # version 5.1, section 3.12.  They are used in conjunction with Hangul
1373 # syllables.  The '_string' versions are so generated tables can retain the
1374 # hex format, which is the more familiar value
1375 my $SBase_string = "0xAC00";
1376 my $SBase = CORE::hex $SBase_string;
1377 my $LBase_string = "0x1100";
1378 my $LBase = CORE::hex $LBase_string;
1379 my $VBase_string = "0x1161";
1380 my $VBase = CORE::hex $VBase_string;
1381 my $TBase_string = "0x11A7";
1382 my $TBase = CORE::hex $TBase_string;
1383 my $SCount = 11172;
1384 my $LCount = 19;
1385 my $VCount = 21;
1386 my $TCount = 28;
1387 my $NCount = $VCount * $TCount;
1388
1389 # For Hangul syllables;  These store the numbers from Jamo.txt in conjunction
1390 # with the above published constants.
1391 my %Jamo;
1392 my %Jamo_L;     # Leading consonants
1393 my %Jamo_V;     # Vowels
1394 my %Jamo_T;     # Trailing consonants
1395
1396 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1397 # The key is the base name of the code point, and the value is an
1398 # array giving all the ranges that use this base name.  Each range
1399 # is actually a hash giving the 'low' and 'high' values of it.
1400 my %names_ending_in_code_point;
1401 my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
1402                                         # removed from the names
1403 # Inverse mapping.  The list of ranges that have these kinds of
1404 # names.  Each element contains the low, high, and base names in an
1405 # anonymous hash.
1406 my @code_points_ending_in_code_point;
1407
1408 # To hold Unicode's normalization test suite
1409 my @normalization_tests;
1410
1411 # Boolean: does this Unicode version have the hangul syllables, and are we
1412 # writing out a table for them?
1413 my $has_hangul_syllables = 0;
1414
1415 # Does this Unicode version have code points whose names end in their
1416 # respective code points, and are we writing out a table for them?  0 for no;
1417 # otherwise points to first property that a table is needed for them, so that
1418 # if multiple tables are needed, we don't create duplicates
1419 my $needing_code_points_ending_in_code_point = 0;
1420
1421 my @backslash_X_tests;     # List of tests read in for testing \X
1422 my @LB_tests;              # List of tests read in for testing \b{lb}
1423 my @SB_tests;              # List of tests read in for testing \b{sb}
1424 my @WB_tests;              # List of tests read in for testing \b{wb}
1425 my @unhandled_properties;  # Will contain a list of properties found in
1426                            # the input that we didn't process.
1427 my @match_properties;      # Properties that have match tables, to be
1428                            # listed in the pod
1429 my @map_properties;        # Properties that get map files written
1430 my @named_sequences;       # NamedSequences.txt contents.
1431 my %potential_files;       # Generated list of all .txt files in the directory
1432                            # structure so we can warn if something is being
1433                            # ignored.
1434 my @missing_early_files;   # Generated list of absent files that we need to
1435                            # proceed in compiling this early Unicode version
1436 my @files_actually_output; # List of files we generated.
1437 my @more_Names;            # Some code point names are compound; this is used
1438                            # to store the extra components of them.
1439 my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
1440                            # point of a normalized floating point number
1441                            # needed to match before we consider it equivalent
1442                            # to a candidate rational
1443
1444 # These store references to certain commonly used property objects
1445 my $age;
1446 my $ccc;
1447 my $gc;
1448 my $perl;
1449 my $block;
1450 my $perl_charname;
1451 my $print;
1452 my $All;
1453 my $Assigned;   # All assigned characters in this Unicode release
1454 my $DI;         # Default_Ignorable_Code_Point property
1455 my $NChar;      # Noncharacter_Code_Point property
1456 my $script;
1457 my $scx;        # Script_Extensions property
1458
1459 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1460 my $has_In_conflicts = 0;
1461 my $has_Is_conflicts = 0;
1462
1463 sub internal_file_to_platform ($) {
1464     # Convert our file paths which have '/' separators to those of the
1465     # platform.
1466
1467     my $file = shift;
1468     return undef unless defined $file;
1469
1470     return File::Spec->join(split '/', $file);
1471 }
1472
1473 sub file_exists ($) {   # platform independent '-e'.  This program internally
1474                         # uses slash as a path separator.
1475     my $file = shift;
1476     return 0 if ! defined $file;
1477     return -e internal_file_to_platform($file);
1478 }
1479
1480 sub objaddr($) {
1481     # Returns the address of the blessed input object.
1482     # It doesn't check for blessedness because that would do a string eval
1483     # every call, and the program is structured so that this is never called
1484     # for a non-blessed object.
1485
1486     no overloading; # If overloaded, numifying below won't work.
1487
1488     # Numifying a ref gives its address.
1489     return pack 'J', $_[0];
1490 }
1491
1492 # These are used only if $annotate is true.
1493 # The entire range of Unicode characters is examined to populate these
1494 # after all the input has been processed.  But most can be skipped, as they
1495 # have the same descriptive phrases, such as being unassigned
1496 my @viacode;            # Contains the 1 million character names
1497 my @age;                # And their ages ("" if none)
1498 my @printable;          # boolean: And are those characters printable?
1499 my @annotate_char_type; # Contains a type of those characters, specifically
1500                         # for the purposes of annotation.
1501 my $annotate_ranges;    # A map of ranges of code points that have the same
1502                         # name for the purposes of annotation.  They map to the
1503                         # upper edge of the range, so that the end point can
1504                         # be immediately found.  This is used to skip ahead to
1505                         # the end of a range, and avoid processing each
1506                         # individual code point in it.
1507 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1508                                    # characters, but excluding those which are
1509                                    # also noncharacter code points
1510
1511 # The annotation types are an extension of the regular range types, though
1512 # some of the latter are folded into one.  Make the new types negative to
1513 # avoid conflicting with the regular types
1514 my $SURROGATE_TYPE = -1;
1515 my $UNASSIGNED_TYPE = -2;
1516 my $PRIVATE_USE_TYPE = -3;
1517 my $NONCHARACTER_TYPE = -4;
1518 my $CONTROL_TYPE = -5;
1519 my $ABOVE_UNICODE_TYPE = -6;
1520 my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
1521
1522 sub populate_char_info ($) {
1523     # Used only with the $annotate option.  Populates the arrays with the
1524     # input code point's info that are needed for outputting more detailed
1525     # comments.  If calling context wants a return, it is the end point of
1526     # any contiguous range of characters that share essentially the same info
1527
1528     my $i = shift;
1529     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1530
1531     $viacode[$i] = $perl_charname->value_of($i) || "";
1532     $age[$i] = (defined $age)
1533                ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1534                   ? $age->value_of($i)
1535                   : "")
1536                : "";
1537
1538     # A character is generally printable if Unicode says it is,
1539     # but below we make sure that most Unicode general category 'C' types
1540     # aren't.
1541     $printable[$i] = $print->contains($i);
1542
1543     # But the characters in this range were removed in v2.0 and replaced by
1544     # different ones later.  Modern fonts will be for the replacement
1545     # characters, so suppress printing them.
1546     if (($v_version lt v2.0
1547          || ($compare_versions && $compare_versions lt v2.0))
1548         && (   $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1549             && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1550     {
1551         $printable[$i] = 0;
1552     }
1553
1554     $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1555
1556     # Only these two regular types are treated specially for annotations
1557     # purposes
1558     $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1559                                 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1560
1561     # Give a generic name to all code points that don't have a real name.
1562     # We output ranges, if applicable, for these.  Also calculate the end
1563     # point of the range.
1564     my $end;
1565     if (! $viacode[$i]) {
1566         if ($i > $MAX_UNICODE_CODEPOINT) {
1567             $viacode[$i] = 'Above-Unicode';
1568             $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1569             $printable[$i] = 0;
1570             $end = $MAX_WORKING_CODEPOINT;
1571         }
1572         elsif ($gc-> table('Private_use')->contains($i)) {
1573             $viacode[$i] = 'Private Use';
1574             $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1575             $printable[$i] = 0;
1576             $end = $gc->table('Private_Use')->containing_range($i)->end;
1577         }
1578         elsif ($NChar->contains($i)) {
1579             $viacode[$i] = 'Noncharacter';
1580             $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1581             $printable[$i] = 0;
1582             $end = $NChar->containing_range($i)->end;
1583         }
1584         elsif ($gc-> table('Control')->contains($i)) {
1585             my $name_ref = property_ref('Name_Alias');
1586             $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1587             $viacode[$i] = (defined $name_ref)
1588                            ? $name_ref->value_of($i)
1589                            : 'Control';
1590             $annotate_char_type[$i] = $CONTROL_TYPE;
1591             $printable[$i] = 0;
1592         }
1593         elsif ($gc-> table('Unassigned')->contains($i)) {
1594             $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1595             $printable[$i] = 0;
1596             $viacode[$i] = 'Unassigned';
1597
1598             if (defined $block) { # No blocks in earliest releases
1599                 $viacode[$i] .= ', block=' . $block-> value_of($i);
1600                 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1601
1602                 # Because we name the unassigned by the blocks they are in, it
1603                 # can't go past the end of that block, and it also can't go
1604                 # past the unassigned range it is in.  The special table makes
1605                 # sure that the non-characters, which are unassigned, are
1606                 # separated out.
1607                 $end = min($block->containing_range($i)->end,
1608                            $unassigned_sans_noncharacters->
1609                                                     containing_range($i)->end);
1610             }
1611             else {
1612                 $end = $i + 1;
1613                 while ($unassigned_sans_noncharacters->contains($end)) {
1614                     $end++;
1615                 }
1616                 $end--;
1617             }
1618         }
1619         elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1620             $viacode[$i] = 'Surrogate';
1621             $annotate_char_type[$i] = $SURROGATE_TYPE;
1622             $printable[$i] = 0;
1623             $end = $gc->table('Surrogate')->containing_range($i)->end;
1624         }
1625         else {
1626             Carp::my_carp_bug("Can't figure out how to annotate "
1627                               . sprintf("U+%04X", $i)
1628                               . ".  Proceeding anyway.");
1629             $viacode[$i] = 'UNKNOWN';
1630             $annotate_char_type[$i] = $UNKNOWN_TYPE;
1631             $printable[$i] = 0;
1632         }
1633     }
1634
1635     # Here, has a name, but if it's one in which the code point number is
1636     # appended to the name, do that.
1637     elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1638         $viacode[$i] .= sprintf("-%04X", $i);
1639
1640         my $limit = $perl_charname->containing_range($i)->end;
1641         if (defined $age) {
1642             # Do all these as groups of the same age, instead of individually,
1643             # because their names are so meaningless, and there are typically
1644             # large quantities of them.
1645             $end = $i + 1;
1646             while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1647                 $end++;
1648             }
1649             $end--;
1650         }
1651         else {
1652             $end = $limit;
1653         }
1654     }
1655
1656     # And here, has a name, but if it's a hangul syllable one, replace it with
1657     # the correct name from the Unicode algorithm
1658     elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1659         use integer;
1660         my $SIndex = $i - $SBase;
1661         my $L = $LBase + $SIndex / $NCount;
1662         my $V = $VBase + ($SIndex % $NCount) / $TCount;
1663         my $T = $TBase + $SIndex % $TCount;
1664         $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1665         $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1666         $end = $perl_charname->containing_range($i)->end;
1667     }
1668
1669     return if ! defined wantarray;
1670     return $i if ! defined $end;    # If not a range, return the input
1671
1672     # Save this whole range so can find the end point quickly
1673     $annotate_ranges->add_map($i, $end, $end);
1674
1675     return $end;
1676 }
1677
1678 # Commented code below should work on Perl 5.8.
1679 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1680 ## the native perl version of it (which is what would operate under miniperl)
1681 ## is extremely slow, as it does a string eval every call.
1682 #my $has_fast_scalar_util = $^X !~ /miniperl/
1683 #                            && defined eval "require Scalar::Util";
1684 #
1685 #sub objaddr($) {
1686 #    # Returns the address of the blessed input object.  Uses the XS version if
1687 #    # available.  It doesn't check for blessedness because that would do a
1688 #    # string eval every call, and the program is structured so that this is
1689 #    # never called for a non-blessed object.
1690 #
1691 #    return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1692 #
1693 #    # Check at least that is a ref.
1694 #    my $pkg = ref($_[0]) or return undef;
1695 #
1696 #    # Change to a fake package to defeat any overloaded stringify
1697 #    bless $_[0], 'main::Fake';
1698 #
1699 #    # Numifying a ref gives its address.
1700 #    my $addr = pack 'J', $_[0];
1701 #
1702 #    # Return to original class
1703 #    bless $_[0], $pkg;
1704 #    return $addr;
1705 #}
1706
1707 sub max ($$) {
1708     my $a = shift;
1709     my $b = shift;
1710     return $a if $a >= $b;
1711     return $b;
1712 }
1713
1714 sub min ($$) {
1715     my $a = shift;
1716     my $b = shift;
1717     return $a if $a <= $b;
1718     return $b;
1719 }
1720
1721 sub clarify_number ($) {
1722     # This returns the input number with underscores inserted every 3 digits
1723     # in large (5 digits or more) numbers.  Input must be entirely digits, not
1724     # checked.
1725
1726     my $number = shift;
1727     my $pos = length($number) - 3;
1728     return $number if $pos <= 1;
1729     while ($pos > 0) {
1730         substr($number, $pos, 0) = '_';
1731         $pos -= 3;
1732     }
1733     return $number;
1734 }
1735
1736 sub clarify_code_point_count ($) {
1737     # This is like clarify_number(), but the input is assumed to be a count of
1738     # code points, rather than a generic number.
1739
1740     my $append = "";
1741
1742     my $number = shift;
1743     if ($number > $MAX_UNICODE_CODEPOINTS) {
1744         $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1745         return "All above-Unicode code points" if $number == 0;
1746         $append = " + all above-Unicode code points";
1747     }
1748     return clarify_number($number) . $append;
1749 }
1750
1751 package Carp;
1752
1753 # These routines give a uniform treatment of messages in this program.  They
1754 # are placed in the Carp package to cause the stack trace to not include them,
1755 # although an alternative would be to use another package and set @CARP_NOT
1756 # for it.
1757
1758 our $Verbose = 1 if main::DEBUG;  # Useful info when debugging
1759
1760 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1761 # and overload trying to load Scalar:Util under miniperl.  See
1762 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1763 undef $overload::VERSION;
1764
1765 sub my_carp {
1766     my $message = shift || "";
1767     my $nofold = shift || 0;
1768
1769     if ($message) {
1770         $message = main::join_lines($message);
1771         $message =~ s/^$0: *//;     # Remove initial program name
1772         $message =~ s/[.;,]+$//;    # Remove certain ending punctuation
1773         $message = "\n$0: $message;";
1774
1775         # Fold the message with program name, semi-colon end punctuation
1776         # (which looks good with the message that carp appends to it), and a
1777         # hanging indent for continuation lines.
1778         $message = main::simple_fold($message, "", 4) unless $nofold;
1779         $message =~ s/\n$//;        # Remove the trailing nl so what carp
1780                                     # appends is to the same line
1781     }
1782
1783     return $message if defined wantarray;   # If a caller just wants the msg
1784
1785     carp $message;
1786     return;
1787 }
1788
1789 sub my_carp_bug {
1790     # This is called when it is clear that the problem is caused by a bug in
1791     # this program.
1792
1793     my $message = shift;
1794     $message =~ s/^$0: *//;
1795     $message = my_carp("Bug in $0.  Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1796     carp $message;
1797     return;
1798 }
1799
1800 sub carp_too_few_args {
1801     if (@_ != 2) {
1802         my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'.  No action taken.");
1803         return;
1804     }
1805
1806     my $args_ref = shift;
1807     my $count = shift;
1808
1809     my_carp_bug("Need at least $count arguments to "
1810         . (caller 1)[3]
1811         . ".  Instead got: '"
1812         . join ', ', @$args_ref
1813         . "'.  No action taken.");
1814     return;
1815 }
1816
1817 sub carp_extra_args {
1818     my $args_ref = shift;
1819     my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . ");  Extras ignored.") if @_;
1820
1821     unless (ref $args_ref) {
1822         my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref.  Not checking arguments.");
1823         return;
1824     }
1825     my ($package, $file, $line) = caller;
1826     my $subroutine = (caller 1)[3];
1827
1828     my $list;
1829     if (ref $args_ref eq 'HASH') {
1830         foreach my $key (keys %$args_ref) {
1831             $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1832         }
1833         $list = join ', ', each %{$args_ref};
1834     }
1835     elsif (ref $args_ref eq 'ARRAY') {
1836         foreach my $arg (@$args_ref) {
1837             $arg = $UNDEF unless defined $arg;
1838         }
1839         $list = join ', ', @$args_ref;
1840     }
1841     else {
1842         my_carp_bug("Can't cope with ref "
1843                 . ref($args_ref)
1844                 . " . argument to 'carp_extra_args'.  Not checking arguments.");
1845         return;
1846     }
1847
1848     my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine.  Skipped.");
1849     return;
1850 }
1851
1852 package main;
1853
1854 { # Closure
1855
1856     # This program uses the inside-out method for objects, as recommended in
1857     # "Perl Best Practices".  (This is the best solution still, since this has
1858     # to run under miniperl.)  This closure aids in generating those.  There
1859     # are two routines.  setup_package() is called once per package to set
1860     # things up, and then set_access() is called for each hash representing a
1861     # field in the object.  These routines arrange for the object to be
1862     # properly destroyed when no longer used, and for standard accessor
1863     # functions to be generated.  If you need more complex accessors, just
1864     # write your own and leave those accesses out of the call to set_access().
1865     # More details below.
1866
1867     my %constructor_fields; # fields that are to be used in constructors; see
1868                             # below
1869
1870     # The values of this hash will be the package names as keys to other
1871     # hashes containing the name of each field in the package as keys, and
1872     # references to their respective hashes as values.
1873     my %package_fields;
1874
1875     sub setup_package {
1876         # Sets up the package, creating standard DESTROY and dump methods
1877         # (unless already defined).  The dump method is used in debugging by
1878         # simple_dumper().
1879         # The optional parameters are:
1880         #   a)  a reference to a hash, that gets populated by later
1881         #       set_access() calls with one of the accesses being
1882         #       'constructor'.  The caller can then refer to this, but it is
1883         #       not otherwise used by these two routines.
1884         #   b)  a reference to a callback routine to call during destruction
1885         #       of the object, before any fields are actually destroyed
1886
1887         my %args = @_;
1888         my $constructor_ref = delete $args{'Constructor_Fields'};
1889         my $destroy_callback = delete $args{'Destroy_Callback'};
1890         Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1891
1892         my %fields;
1893         my $package = (caller)[0];
1894
1895         $package_fields{$package} = \%fields;
1896         $constructor_fields{$package} = $constructor_ref;
1897
1898         unless ($package->can('DESTROY')) {
1899             my $destroy_name = "${package}::DESTROY";
1900             no strict "refs";
1901
1902             # Use typeglob to give the anonymous subroutine the name we want
1903             *$destroy_name = sub {
1904                 my $self = shift;
1905                 my $addr = do { no overloading; pack 'J', $self; };
1906
1907                 $self->$destroy_callback if $destroy_callback;
1908                 foreach my $field (keys %{$package_fields{$package}}) {
1909                     #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1910                     delete $package_fields{$package}{$field}{$addr};
1911                 }
1912                 return;
1913             }
1914         }
1915
1916         unless ($package->can('dump')) {
1917             my $dump_name = "${package}::dump";
1918             no strict "refs";
1919             *$dump_name = sub {
1920                 my $self = shift;
1921                 return dump_inside_out($self, $package_fields{$package}, @_);
1922             }
1923         }
1924         return;
1925     }
1926
1927     sub set_access {
1928         # Arrange for the input field to be garbage collected when no longer
1929         # needed.  Also, creates standard accessor functions for the field
1930         # based on the optional parameters-- none if none of these parameters:
1931         #   'addable'    creates an 'add_NAME()' accessor function.
1932         #   'readable' or 'readable_array'   creates a 'NAME()' accessor
1933         #                function.
1934         #   'settable'   creates a 'set_NAME()' accessor function.
1935         #   'constructor' doesn't create an accessor function, but adds the
1936         #                field to the hash that was previously passed to
1937         #                setup_package();
1938         # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1939         # 'add' etc. all mean 'addable'.
1940         # The read accessor function will work on both array and scalar
1941         # values.  If another accessor in the parameter list is 'a', the read
1942         # access assumes an array.  You can also force it to be array access
1943         # by specifying 'readable_array' instead of 'readable'
1944         #
1945         # A sort-of 'protected' access can be set-up by preceding the addable,
1946         # readable or settable with some initial portion of 'protected_' (but,
1947         # the underscore is required), like 'p_a', 'pro_set', etc.  The
1948         # "protection" is only by convention.  All that happens is that the
1949         # accessor functions' names begin with an underscore.  So instead of
1950         # calling set_foo, the call is _set_foo.  (Real protection could be
1951         # accomplished by having a new subroutine, end_package, called at the
1952         # end of each package, and then storing the __LINE__ ranges and
1953         # checking them on every accessor.  But that is way overkill.)
1954
1955         # We create anonymous subroutines as the accessors and then use
1956         # typeglobs to assign them to the proper package and name
1957
1958         my $name = shift;   # Name of the field
1959         my $field = shift;  # Reference to the inside-out hash containing the
1960                             # field
1961
1962         my $package = (caller)[0];
1963
1964         if (! exists $package_fields{$package}) {
1965             croak "$0: Must call 'setup_package' before 'set_access'";
1966         }
1967
1968         # Stash the field so DESTROY can get it.
1969         $package_fields{$package}{$name} = $field;
1970
1971         # Remaining arguments are the accessors.  For each...
1972         foreach my $access (@_) {
1973             my $access = lc $access;
1974
1975             my $protected = "";
1976
1977             # Match the input as far as it goes.
1978             if ($access =~ /^(p[^_]*)_/) {
1979                 $protected = $1;
1980                 if (substr('protected_', 0, length $protected)
1981                     eq $protected)
1982                 {
1983
1984                     # Add 1 for the underscore not included in $protected
1985                     $access = substr($access, length($protected) + 1);
1986                     $protected = '_';
1987                 }
1988                 else {
1989                     $protected = "";
1990                 }
1991             }
1992
1993             if (substr('addable', 0, length $access) eq $access) {
1994                 my $subname = "${package}::${protected}add_$name";
1995                 no strict "refs";
1996
1997                 # add_ accessor.  Don't add if already there, which we
1998                 # determine using 'eq' for scalars and '==' otherwise.
1999                 *$subname = sub {
2000                     use strict "refs";
2001                     return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2002                     my $self = shift;
2003                     my $value = shift;
2004                     my $addr = do { no overloading; pack 'J', $self; };
2005                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2006                     if (ref $value) {
2007                         return if grep { $value == $_ } @{$field->{$addr}};
2008                     }
2009                     else {
2010                         return if grep { $value eq $_ } @{$field->{$addr}};
2011                     }
2012                     push @{$field->{$addr}}, $value;
2013                     return;
2014                 }
2015             }
2016             elsif (substr('constructor', 0, length $access) eq $access) {
2017                 if ($protected) {
2018                     Carp::my_carp_bug("Can't set-up 'protected' constructors")
2019                 }
2020                 else {
2021                     $constructor_fields{$package}{$name} = $field;
2022                 }
2023             }
2024             elsif (substr('readable_array', 0, length $access) eq $access) {
2025
2026                 # Here has read access.  If one of the other parameters for
2027                 # access is array, or this one specifies array (by being more
2028                 # than just 'readable_'), then create a subroutine that
2029                 # assumes the data is an array.  Otherwise just a scalar
2030                 my $subname = "${package}::${protected}$name";
2031                 if (grep { /^a/i } @_
2032                     or length($access) > length('readable_'))
2033                 {
2034                     no strict "refs";
2035                     *$subname = sub {
2036                         use strict "refs";
2037                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2038                         my $addr = do { no overloading; pack 'J', $_[0]; };
2039                         if (ref $field->{$addr} ne 'ARRAY') {
2040                             my $type = ref $field->{$addr};
2041                             $type = 'scalar' unless $type;
2042                             Carp::my_carp_bug("Trying to read $name as an array when it is a $type.  Big problems.");
2043                             return;
2044                         }
2045                         return scalar @{$field->{$addr}} unless wantarray;
2046
2047                         # Make a copy; had problems with caller modifying the
2048                         # original otherwise
2049                         my @return = @{$field->{$addr}};
2050                         return @return;
2051                     }
2052                 }
2053                 else {
2054
2055                     # Here not an array value, a simpler function.
2056                     no strict "refs";
2057                     *$subname = sub {
2058                         use strict "refs";
2059                         Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2060                         no overloading;
2061                         return $field->{pack 'J', $_[0]};
2062                     }
2063                 }
2064             }
2065             elsif (substr('settable', 0, length $access) eq $access) {
2066                 my $subname = "${package}::${protected}set_$name";
2067                 no strict "refs";
2068                 *$subname = sub {
2069                     use strict "refs";
2070                     if (main::DEBUG) {
2071                         return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2072                         Carp::carp_extra_args(\@_) if @_ > 2;
2073                     }
2074                     # $self is $_[0]; $value is $_[1]
2075                     no overloading;
2076                     $field->{pack 'J', $_[0]} = $_[1];
2077                     return;
2078                 }
2079             }
2080             else {
2081                 Carp::my_carp_bug("Unknown accessor type $access.  No accessor set.");
2082             }
2083         }
2084         return;
2085     }
2086 }
2087
2088 package Input_file;
2089
2090 # All input files use this object, which stores various attributes about them,
2091 # and provides for convenient, uniform handling.  The run method wraps the
2092 # processing.  It handles all the bookkeeping of opening, reading, and closing
2093 # the file, returning only significant input lines.
2094 #
2095 # Each object gets a handler which processes the body of the file, and is
2096 # called by run().  All character property files must use the generic,
2097 # default handler, which has code scrubbed to handle things you might not
2098 # expect, including automatic EBCDIC handling.  For files that don't deal with
2099 # mapping code points to a property value, such as test files,
2100 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2101 # handler to be a custom one.  Such a handler should basically be a
2102 # while(next_line()) {...} loop.
2103 #
2104 # You can also set up handlers to
2105 #   0) call during object construction time, after everything else is done
2106 #   1) call before the first line is read, for pre processing
2107 #   2) call to adjust each line of the input before the main handler gets
2108 #      them.  This can be automatically generated, if appropriately simple
2109 #      enough, by specifying a Properties parameter in the constructor.
2110 #   3) call upon EOF before the main handler exits its loop
2111 #   4) call at the end, for post processing
2112 #
2113 # $_ is used to store the input line, and is to be filtered by the
2114 # each_line_handler()s.  So, if the format of the line is not in the desired
2115 # format for the main handler, these are used to do that adjusting.  They can
2116 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2117 # so the $_ output of one is used as the input to the next.  The EOF handler
2118 # is also stackable, but none of the others are, but could easily be changed
2119 # to be so.
2120 #
2121 # Some properties are used by the Perl core but aren't defined until later
2122 # Unicode releases.  The perl interpreter would have problems working when
2123 # compiled with an earlier Unicode version that doesn't have them, so we need
2124 # to define them somehow for those releases.  The 'Early' constructor
2125 # parameter can be used to automatically handle this.  It is essentially
2126 # ignored if the Unicode version being compiled has a data file for this
2127 # property.  Either code to execute or a file to read can be specified.
2128 # Details are at the %early definition.
2129 #
2130 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2131 # which insert the parameters as lines to be processed before the next input
2132 # file line is read.  This allows the EOF handler(s) to flush buffers, for
2133 # example.  The difference between the two routines is that the lines inserted
2134 # by insert_lines() are subjected to the each_line_handler()s.  (So if you
2135 # called it from such a handler, you would get infinite recursion without some
2136 # mechanism to prevent that.)  Lines inserted by insert_adjusted_lines() go
2137 # directly to the main handler without any adjustments.  If the
2138 # post-processing handler calls any of these, there will be no effect.  Some
2139 # error checking for these conditions could be added, but it hasn't been done.
2140 #
2141 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2142 # to prevent further processing of the line.  This routine will output the
2143 # message as a warning once, and then keep a count of the lines that have the
2144 # same message, and output that count at the end of the file's processing.
2145 # This keeps the number of messages down to a manageable amount.
2146 #
2147 # get_missings() should be called to retrieve any @missing input lines.
2148 # Messages will be raised if this isn't done if the options aren't to ignore
2149 # missings.
2150
2151 sub trace { return main::trace(@_); }
2152
2153 { # Closure
2154     # Keep track of fields that are to be put into the constructor.
2155     my %constructor_fields;
2156
2157     main::setup_package(Constructor_Fields => \%constructor_fields);
2158
2159     my %file; # Input file name, required
2160     main::set_access('file', \%file, qw{ c r });
2161
2162     my %first_released; # Unicode version file was first released in, required
2163     main::set_access('first_released', \%first_released, qw{ c r });
2164
2165     my %handler;    # Subroutine to process the input file, defaults to
2166                     # 'process_generic_property_file'
2167     main::set_access('handler', \%handler, qw{ c });
2168
2169     my %property;
2170     # name of property this file is for.  defaults to none, meaning not
2171     # applicable, or is otherwise determinable, for example, from each line.
2172     main::set_access('property', \%property, qw{ c r });
2173
2174     my %optional;
2175     # This is either an unsigned number, or a list of property names.  In the
2176     # former case, if it is non-zero, it means the file is optional, so if the
2177     # file is absent, no warning about that is output.  In the latter case, it
2178     # is a list of properties that the file (exclusively) defines.  If the
2179     # file is present, tables for those properties will be produced; if
2180     # absent, none will, even if they are listed elsewhere (namely
2181     # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2182     # and no warnings will be raised about them not being available.  (And no
2183     # warning about the file itself will be raised.)
2184     main::set_access('optional', \%optional, qw{ c readable_array } );
2185
2186     my %non_skip;
2187     # This is used for debugging, to skip processing of all but a few input
2188     # files.  Add 'non_skip => 1' to the constructor for those files you want
2189     # processed when you set the $debug_skip global.
2190     main::set_access('non_skip', \%non_skip, 'c');
2191
2192     my %skip;
2193     # This is used to skip processing of this input file (semi-) permanently.
2194     # The value should be the reason the file is being skipped.  It is used
2195     # for files that we aren't planning to process anytime soon, but want to
2196     # allow to be in the directory and be checked for their names not
2197     # conflicting with any other files on a DOS 8.3 name filesystem, but to
2198     # not otherwise be processed, and to not raise a warning about not being
2199     # handled.  In the constructor call, any value that evaluates to a numeric
2200     # 0 or undef means don't skip.  Any other value is a string giving the
2201     # reason it is being skipped, and this will appear in generated pod.
2202     # However, an empty string reason will suppress the pod entry.
2203     # Internally, calls that evaluate to numeric 0 are changed into undef to
2204     # distinguish them from an empty string call.
2205     main::set_access('skip', \%skip, 'c', 'r');
2206
2207     my %each_line_handler;
2208     # list of subroutines to look at and filter each non-comment line in the
2209     # file.  defaults to none.  The subroutines are called in order, each is
2210     # to adjust $_ for the next one, and the final one adjusts it for
2211     # 'handler'
2212     main::set_access('each_line_handler', \%each_line_handler, 'c');
2213
2214     my %retain_trailing_comments;
2215     # This is used to not discard the comments that end data lines.  This
2216     # would be used only for files with non-typical syntax, and most code here
2217     # assumes that comments have been stripped, so special handlers would have
2218     # to be written.  It is assumed that the code will use these in
2219     # single-quoted contexts, and so any "'" marks in the comment will be
2220     # prefixed by a backslash.
2221     main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2222
2223     my %properties; # Optional ordered list of the properties that occur in each
2224     # meaningful line of the input file.  If present, an appropriate
2225     # each_line_handler() is automatically generated and pushed onto the stack
2226     # of such handlers.  This is useful when a file contains multiple
2227     # properties per line, but no other special considerations are necessary.
2228     # The special value "<ignored>" means to discard the corresponding input
2229     # field.
2230     # Any @missing lines in the file should also match this syntax; no such
2231     # files exist as of 6.3.  But if it happens in a future release, the code
2232     # could be expanded to properly parse them.
2233     main::set_access('properties', \%properties, qw{ c r });
2234
2235     my %has_missings_defaults;
2236     # ? Are there lines in the file giving default values for code points
2237     # missing from it?.  Defaults to NO_DEFAULTS.  Otherwise NOT_IGNORED is
2238     # the norm, but IGNORED means it has such lines, but the handler doesn't
2239     # use them.  Having these three states allows us to catch changes to the
2240     # UCD that this program should track.  XXX This could be expanded to
2241     # specify the syntax for such lines, like %properties above.
2242     main::set_access('has_missings_defaults',
2243                                         \%has_missings_defaults, qw{ c r });
2244
2245     my %construction_time_handler;
2246     # Subroutine to call at the end of the new method.  If undef, no such
2247     # handler is called.
2248     main::set_access('construction_time_handler',
2249                                         \%construction_time_handler, qw{ c });
2250
2251     my %pre_handler;
2252     # Subroutine to call before doing anything else in the file.  If undef, no
2253     # such handler is called.
2254     main::set_access('pre_handler', \%pre_handler, qw{ c });
2255
2256     my %eof_handler;
2257     # Subroutines to call upon getting an EOF on the input file, but before
2258     # that is returned to the main handler.  This is to allow buffers to be
2259     # flushed.  The handler is expected to call insert_lines() or
2260     # insert_adjusted() with the buffered material
2261     main::set_access('eof_handler', \%eof_handler, qw{ c });
2262
2263     my %post_handler;
2264     # Subroutine to call after all the lines of the file are read in and
2265     # processed.  If undef, no such handler is called.  Note that this cannot
2266     # add lines to be processed; instead use eof_handler
2267     main::set_access('post_handler', \%post_handler, qw{ c });
2268
2269     my %progress_message;
2270     # Message to print to display progress in lieu of the standard one
2271     main::set_access('progress_message', \%progress_message, qw{ c });
2272
2273     my %handle;
2274     # cache open file handle, internal.  Is undef if file hasn't been
2275     # processed at all, empty if has;
2276     main::set_access('handle', \%handle);
2277
2278     my %added_lines;
2279     # cache of lines added virtually to the file, internal
2280     main::set_access('added_lines', \%added_lines);
2281
2282     my %remapped_lines;
2283     # cache of lines added virtually to the file, internal
2284     main::set_access('remapped_lines', \%remapped_lines);
2285
2286     my %errors;
2287     # cache of errors found, internal
2288     main::set_access('errors', \%errors);
2289
2290     my %missings;
2291     # storage of '@missing' defaults lines
2292     main::set_access('missings', \%missings);
2293
2294     my %early;
2295     # Used for properties that must be defined (for Perl's purposes) on
2296     # versions of Unicode earlier than Unicode itself defines them.  The
2297     # parameter is an array (it would be better to be a hash, but not worth
2298     # bothering about due to its rare use).
2299     #
2300     # The first element is either a code reference to call when in a release
2301     # earlier than the Unicode file is available in, or it is an alternate
2302     # file to use instead of the non-existent one.  This file must have been
2303     # plunked down in the same directory as mktables.  Should you be compiling
2304     # on a release that needs such a file, mktables will abort the
2305     # compilation, and tell you where to get the necessary file(s), and what
2306     # name(s) to use to store them as.
2307     # In the case of specifying an alternate file, the array must contain two
2308     # further elements:
2309     #
2310     # [1] is the name of the property that will be generated by this file.
2311     # The class automatically takes the input file and excludes any code
2312     # points in it that were not assigned in the Unicode version being
2313     # compiled.  It then uses this result to define the property in the given
2314     # version.  Since the property doesn't actually exist in the Unicode
2315     # version being compiled, this should be a name accessible only by core
2316     # perl.  If it is the same name as the regular property, the constructor
2317     # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2318     # get output, and so will be unusable by non-core code.  Otherwise it gets
2319     # marked as $INTERNAL_ONLY.
2320     #
2321     # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2322     # the Hangul syllables in that release (which were ripped out in version
2323     # 2) for the given property .  (Hence it is ignored except when compiling
2324     # version 1.  You only get one value that applies to all of them, which
2325     # may not be the actual reality, but probably nobody cares anyway for
2326     # these obsolete characters.)
2327     #
2328     # [3] if present is the default value for the property to assign for code
2329     # points not given in the input.  If not present, the default from the
2330     # normal property is used
2331     #
2332     # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2333     # it means to not add the name in [1] as an alias to the property name
2334     # used for these.  Normally, when compiling Unicode versions that don't
2335     # invoke the early handling, the name is added as a synonym.
2336     #
2337     # Not all files can be handled in the above way, and so the code ref
2338     # alternative is available.  It can do whatever it needs to.  The other
2339     # array elements are optional in this case, and the code is free to use or
2340     # ignore them if they are present.
2341     #
2342     # Internally, the constructor unshifts a 0 or 1 onto this array to
2343     # indicate if an early alternative is actually being used or not.  This
2344     # makes for easier testing later on.
2345     main::set_access('early', \%early, 'c');
2346
2347     my %only_early;
2348     main::set_access('only_early', \%only_early, 'c');
2349
2350     my %required_even_in_debug_skip;
2351     # debug_skip is used to speed up compilation during debugging by skipping
2352     # processing files that are not needed for the task at hand.  However,
2353     # some files pretty much can never be skipped, and this is used to specify
2354     # that this is one of them.  In order to skip this file, the call to the
2355     # constructor must be edited to comment out this parameter.
2356     main::set_access('required_even_in_debug_skip',
2357                      \%required_even_in_debug_skip, 'c');
2358
2359     my %withdrawn;
2360     # Some files get removed from the Unicode DB.  This is a version object
2361     # giving the first release without this file.
2362     main::set_access('withdrawn', \%withdrawn, 'c');
2363
2364     my %in_this_release;
2365     # Calculated value from %first_released and %withdrawn.  Are we compiling
2366     # a Unicode release which includes this file?
2367     main::set_access('in_this_release', \%in_this_release);
2368
2369     sub _next_line;
2370     sub _next_line_with_remapped_range;
2371
2372     sub new {
2373         my $class = shift;
2374
2375         my $self = bless \do{ my $anonymous_scalar }, $class;
2376         my $addr = do { no overloading; pack 'J', $self; };
2377
2378         # Set defaults
2379         $handler{$addr} = \&main::process_generic_property_file;
2380         $retain_trailing_comments{$addr} = 0;
2381         $non_skip{$addr} = 0;
2382         $skip{$addr} = undef;
2383         $has_missings_defaults{$addr} = $NO_DEFAULTS;
2384         $handle{$addr} = undef;
2385         $added_lines{$addr} = [ ];
2386         $remapped_lines{$addr} = [ ];
2387         $each_line_handler{$addr} = [ ];
2388         $eof_handler{$addr} = [ ];
2389         $errors{$addr} = { };
2390         $missings{$addr} = [ ];
2391         $early{$addr} = [ ];
2392         $optional{$addr} = [ ];
2393
2394         # Two positional parameters.
2395         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2396         $file{$addr} = main::internal_file_to_platform(shift);
2397         $first_released{$addr} = shift;
2398
2399         # The rest of the arguments are key => value pairs
2400         # %constructor_fields has been set up earlier to list all possible
2401         # ones.  Either set or push, depending on how the default has been set
2402         # up just above.
2403         my %args = @_;
2404         foreach my $key (keys %args) {
2405             my $argument = $args{$key};
2406
2407             # Note that the fields are the lower case of the constructor keys
2408             my $hash = $constructor_fields{lc $key};
2409             if (! defined $hash) {
2410                 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self.  Skipped");
2411                 next;
2412             }
2413             if (ref $hash->{$addr} eq 'ARRAY') {
2414                 if (ref $argument eq 'ARRAY') {
2415                     foreach my $argument (@{$argument}) {
2416                         next if ! defined $argument;
2417                         push @{$hash->{$addr}}, $argument;
2418                     }
2419                 }
2420                 else {
2421                     push @{$hash->{$addr}}, $argument if defined $argument;
2422                 }
2423             }
2424             else {
2425                 $hash->{$addr} = $argument;
2426             }
2427             delete $args{$key};
2428         };
2429
2430         $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2431
2432         # Convert 0 (meaning don't skip) to undef
2433         undef $skip{$addr} unless $skip{$addr};
2434
2435         # Handle the case where this file is optional
2436         my $pod_message_for_non_existent_optional = "";
2437         if ($optional{$addr}->@*) {
2438
2439             # First element is the pod message
2440             $pod_message_for_non_existent_optional
2441                                                 = shift $optional{$addr}->@*;
2442             # Convert a 0 'Optional' argument to an empty list to make later
2443             # code more concise.
2444             if (   $optional{$addr}->@*
2445                 && $optional{$addr}->@* == 1
2446                 && $optional{$addr}[0] ne ""
2447                 && $optional{$addr}[0] !~ /\D/
2448                 && $optional{$addr}[0] == 0)
2449             {
2450                 $optional{$addr} = [ ];
2451             }
2452             else {  # But if the only element doesn't evaluate to 0, make sure
2453                     # that this file is indeed considered optional below.
2454                 unshift $optional{$addr}->@*, 1;
2455             }
2456         }
2457
2458         my $progress;
2459         my $function_instead_of_file = 0;
2460
2461         if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2462             $only_early{$addr} = 1;
2463             pop $early{$addr}->@*;
2464         }
2465
2466         # If we are compiling a Unicode release earlier than the file became
2467         # available, the constructor may have supplied a substitute
2468         if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2469
2470             # Yes, we have a substitute, that we will use; mark it so
2471             unshift $early{$addr}->@*, 1;
2472
2473             # See the definition of %early for what the array elements mean.
2474             # Note that we have just unshifted onto the array, so the numbers
2475             # below are +1 of those in the %early description.
2476             # If we have a property this defines, create a table and default
2477             # map for it now (at essentially compile time), so that it will be
2478             # available for the whole of run time.  (We will want to add this
2479             # name as an alias when we are using the official property name;
2480             # but this must be deferred until run(), because at construction
2481             # time the official names have yet to be defined.)
2482             if ($early{$addr}[2]) {
2483                 my $fate = ($property{$addr}
2484                             && $property{$addr} eq $early{$addr}[2])
2485                           ? $PLACEHOLDER
2486                           : $INTERNAL_ONLY;
2487                 my $prop_object = Property->new($early{$addr}[2],
2488                                                 Fate => $fate,
2489                                                 Perl_Extension => 1,
2490                                                 );
2491
2492                 # If not specified by the constructor, use the default mapping
2493                 # for the regular property for this substitute one.
2494                 if ($early{$addr}[4]) {
2495                     $prop_object->set_default_map($early{$addr}[4]);
2496                 }
2497                 elsif (    defined $property{$addr}
2498                        &&  defined $default_mapping{$property{$addr}})
2499                 {
2500                     $prop_object
2501                         ->set_default_map($default_mapping{$property{$addr}});
2502                 }
2503             }
2504
2505             if (ref $early{$addr}[1] eq 'CODE') {
2506                 $function_instead_of_file = 1;
2507
2508                 # If the first element of the array is a code ref, the others
2509                 # are optional.
2510                 $handler{$addr} = $early{$addr}[1];
2511                 $property{$addr} = $early{$addr}[2]
2512                                                 if defined $early{$addr}[2];
2513                 $progress = "substitute $file{$addr}";
2514
2515                 undef $file{$addr};
2516             }
2517             else {  # Specifying a substitute file
2518
2519                 if (! main::file_exists($early{$addr}[1])) {
2520
2521                     # If we don't see the substitute file, generate an error
2522                     # message giving the needed things, and add it to the list
2523                     # of such to output before actual processing happens
2524                     # (hence the user finds out all of them in one run).
2525                     # Instead of creating a general method for NameAliases,
2526                     # hard-code it here, as there is unlikely to ever be a
2527                     # second one which needs special handling.
2528                     my $string_version = ($file{$addr} eq "NameAliases.txt")
2529                                     ? 'at least 6.1 (the later, the better)'
2530                                     : sprintf "%vd", $first_released{$addr};
2531                     push @missing_early_files, <<END;
2532 '$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2533 END
2534                     ;
2535                     return;
2536                 }
2537                 $progress = $early{$addr}[1];
2538                 $progress .= ", substituting for $file{$addr}" if $file{$addr};
2539                 $file{$addr} = $early{$addr}[1];
2540                 $property{$addr} = $early{$addr}[2];
2541
2542                 # Ignore code points not in the version being compiled
2543                 push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2544
2545                 if (   $v_version lt v2.0        # Hanguls in this release ...
2546                     && defined $early{$addr}[3]) # ... need special treatment
2547                 {
2548                     push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2549                 }
2550             }
2551
2552             # And this substitute is valid for all releases.
2553             $first_released{$addr} = v0;
2554         }
2555         else {  # Normal behavior
2556             $progress = $file{$addr};
2557             unshift $early{$addr}->@*, 0; # No substitute
2558         }
2559
2560         my $file = $file{$addr};
2561         $progress_message{$addr} = "Processing $progress"
2562                                             unless $progress_message{$addr};
2563
2564         # A file should be there if it is within the window of versions for
2565         # which Unicode supplies it
2566         if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2567             $in_this_release{$addr} = 0;
2568             $skip{$addr} = "";
2569         }
2570         else {
2571             $in_this_release{$addr} = $first_released{$addr} le $v_version;
2572
2573             # Check that the file for this object (possibly using a substitute
2574             # for early releases) exists or we have a function alternative
2575             if (   ! $function_instead_of_file
2576                 && ! main::file_exists($file))
2577             {
2578                 # Here there is nothing available for this release.  This is
2579                 # fine if we aren't expecting anything in this release.
2580                 if (! $in_this_release{$addr}) {
2581                     $skip{$addr} = "";  # Don't remark since we expected
2582                                         # nothing and got nothing
2583                 }
2584                 elsif ($optional{$addr}->@*) {
2585
2586                     # Here the file is optional in this release; Use the
2587                     # passed in text to document this case in the pod.
2588                     $skip{$addr} = $pod_message_for_non_existent_optional;
2589                 }
2590                 elsif (   $in_this_release{$addr}
2591                        && ! defined $skip{$addr}
2592                        && defined $file)
2593                 { # Doesn't exist but should.
2594                     $skip{$addr} = "'$file' not found.  Possibly Big problems";
2595                     Carp::my_carp($skip{$addr});
2596                 }
2597             }
2598             elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2599             {
2600
2601                 # The file exists; if not skipped for another reason, and we are
2602                 # skipping most everything during debugging builds, use that as
2603                 # the skip reason.
2604                 $skip{$addr} = '$debug_skip is on'
2605             }
2606         }
2607
2608         if (   ! $debug_skip
2609             && $non_skip{$addr}
2610             && ! $required_even_in_debug_skip{$addr}
2611             && $verbosity)
2612         {
2613             print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2614         }
2615
2616         # Here, we have figured out if we will be skipping this file or not.
2617         # If so, we add any single property it defines to any passed in
2618         # optional property list.  These will be dealt with at run time.
2619         if (defined $skip{$addr}) {
2620             if ($property{$addr}) {
2621                 push $optional{$addr}->@*, $property{$addr};
2622             }
2623         } # Otherwise, are going to process the file.
2624         elsif ($property{$addr}) {
2625
2626             # If the file has a property defined in the constructor for it, it
2627             # means that the property is not listed in the file's entries.  So
2628             # add a handler (to the list of line handlers) to insert the
2629             # property name into the lines, to provide a uniform interface to
2630             # the final processing subroutine.
2631             push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2632         }
2633         elsif ($properties{$addr}) {
2634
2635             # Similarly, there may be more than one property represented on
2636             # each line, with no clue but the constructor input what those
2637             # might be.  Add a handler for each line in the input so that it
2638             # creates a separate input line for each property in those input
2639             # lines, thus making them suitable to handle generically.
2640
2641             push @{$each_line_handler{$addr}},
2642                  sub {
2643                     my $file = shift;
2644                     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2645
2646                     my @fields = split /\s*;\s*/, $_, -1;
2647
2648                     if (@fields - 1 > @{$properties{$addr}}) {
2649                         $file->carp_bad_line('Extra fields');
2650                         $_ = "";
2651                         return;
2652                     }
2653                     my $range = shift @fields;  # 0th element is always the
2654                                                 # range
2655
2656                     # The next fields in the input line correspond
2657                     # respectively to the stored properties.
2658                     for my $i (0 ..  @{$properties{$addr}} - 1) {
2659                         my $property_name = $properties{$addr}[$i];
2660                         next if $property_name eq '<ignored>';
2661                         $file->insert_adjusted_lines(
2662                               "$range; $property_name; $fields[$i]");
2663                     }
2664                     $_ = "";
2665
2666                     return;
2667                 };
2668         }
2669
2670         {   # On non-ascii platforms, we use a special pre-handler
2671             no strict;
2672             no warnings 'once';
2673             *next_line = (main::NON_ASCII_PLATFORM)
2674                          ? *_next_line_with_remapped_range
2675                          : *_next_line;
2676         }
2677
2678         &{$construction_time_handler{$addr}}($self)
2679                                         if $construction_time_handler{$addr};
2680
2681         return $self;
2682     }
2683
2684
2685     use overload
2686         fallback => 0,
2687         qw("") => "_operator_stringify",
2688         "." => \&main::_operator_dot,
2689         ".=" => \&main::_operator_dot_equal,
2690     ;
2691
2692     sub _operator_stringify {
2693         my $self = shift;
2694
2695         return __PACKAGE__ . " object for " . $self->file;
2696     }
2697
2698     sub run {
2699         # Process the input object $self.  This opens and closes the file and
2700         # calls all the handlers for it.  Currently,  this can only be called
2701         # once per file, as it destroy's the EOF handlers
2702
2703         # flag to make sure extracted files are processed early
2704         state $seen_non_extracted = 0;
2705
2706         my $self = shift;
2707         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2708
2709         my $addr = do { no overloading; pack 'J', $self; };
2710
2711         my $file = $file{$addr};
2712
2713         if (! $file) {
2714             $handle{$addr} = 'pretend_is_open';
2715         }
2716         else {
2717             if ($seen_non_extracted) {
2718                 if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2719                                             # case of the file's name
2720                 {
2721                     Carp::my_carp_bug(main::join_lines(<<END
2722 $file should be processed just after the 'Prop...Alias' files, and before
2723 anything not in the $EXTRACTED_DIR directory.  Proceeding, but the results may
2724 have subtle problems
2725 END
2726                     ));
2727                 }
2728             }
2729             elsif ($EXTRACTED_DIR
2730
2731                     # We only do this check for generic property files
2732                     && $handler{$addr} == \&main::process_generic_property_file
2733
2734                     && $file !~ /$EXTRACTED/i)
2735             {
2736                 # We don't set this (by the 'if' above) if we have no
2737                 # extracted directory, so if running on an early version,
2738                 # this test won't work.  Not worth worrying about.
2739                 $seen_non_extracted = 1;
2740             }
2741
2742             # Mark the file as having being processed, and warn if it
2743             # isn't a file we are expecting.  As we process the files,
2744             # they are deleted from the hash, so any that remain at the
2745             # end of the program are files that we didn't process.
2746             my $fkey = File::Spec->rel2abs($file);
2747             my $exists = delete $potential_files{lc($fkey)};
2748
2749             Carp::my_carp("Was not expecting '$file'.")
2750                                     if $exists && ! $in_this_release{$addr};
2751
2752             # If there is special handling for compiling Unicode releases
2753             # earlier than the first one in which Unicode defines this
2754             # property ...
2755             if ($early{$addr}->@* > 1) {
2756
2757                 # Mark as processed any substitute file that would be used in
2758                 # such a release
2759                 $fkey = File::Spec->rel2abs($early{$addr}[1]);
2760                 delete $potential_files{lc($fkey)};
2761
2762                 # As commented in the constructor code, when using the
2763                 # official property, we still have to allow the publicly
2764                 # inaccessible early name so that the core code which uses it
2765                 # will work regardless.
2766                 if (   ! $only_early{$addr}
2767                     && ! $early{$addr}[0]
2768                     && $early{$addr}->@* > 2)
2769                 {
2770                     my $early_property_name = $early{$addr}[2];
2771                     if ($property{$addr} ne $early_property_name) {
2772                         main::property_ref($property{$addr})
2773                                             ->add_alias($early_property_name);
2774                     }
2775                 }
2776             }
2777
2778             # We may be skipping this file ...
2779             if (defined $skip{$addr}) {
2780
2781                 # If the file isn't supposed to be in this release, there is
2782                 # nothing to do
2783                 if ($in_this_release{$addr}) {
2784
2785                     # But otherwise, we may print a message
2786                     if ($debug_skip) {
2787                         print STDERR "Skipping input file '$file'",
2788                                      " because '$skip{$addr}'\n";
2789                     }
2790
2791                     # And add it to the list of skipped files, which is later
2792                     # used to make the pod
2793                     $skipped_files{$file} = $skip{$addr};
2794
2795                     # The 'optional' list contains properties that are also to
2796                     # be skipped along with the file.  (There may also be
2797                     # digits which are just placeholders to make sure it isn't
2798                     # an empty list
2799                     foreach my $property ($optional{$addr}->@*) {
2800                         next unless $property =~ /\D/;
2801                         my $prop_object = main::property_ref($property);
2802                         next unless defined $prop_object;
2803                         $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2804                     }
2805                 }
2806
2807                 return;
2808             }
2809
2810             # Here, we are going to process the file.  Open it, converting the
2811             # slashes used in this program into the proper form for the OS
2812             my $file_handle;
2813             if (not open $file_handle, "<", $file) {
2814                 Carp::my_carp("Can't open $file.  Skipping: $!");
2815                 return;
2816             }
2817             $handle{$addr} = $file_handle; # Cache the open file handle
2818
2819             # If possible, make sure that the file is the correct version.
2820             # (This data isn't available on early Unicode releases or in
2821             # UnicodeData.txt.)  We don't do this check if we are using a
2822             # substitute file instead of the official one (though the code
2823             # could be extended to do so).
2824             if ($in_this_release{$addr}
2825                 && ! $early{$addr}[0]
2826                 && lc($file) ne 'unicodedata.txt')
2827             {
2828                 if ($file !~ /^Unihan/i) {
2829
2830                     # The non-Unihan files started getting version numbers in
2831                     # 3.2, but some files in 4.0 are unchanged from 3.2, and
2832                     # marked as 3.2.  4.0.1 is the first version where there
2833                     # are no files marked as being from less than 4.0, though
2834                     # some are marked as 4.0.  In versions after that, the
2835                     # numbers are correct.
2836                     if ($v_version ge v4.0.1) {
2837                         $_ = <$file_handle>;    # The version number is in the
2838                                                 # very first line
2839                         if ($_ !~ / - $string_version \. /x) {
2840                             chomp;
2841                             $_ =~ s/^#\s*//;
2842
2843                             # 4.0.1 had some valid files that weren't updated.
2844                             if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
2845                                 die Carp::my_carp("File '$file' is version "
2846                                                 . "'$_'.  It should be "
2847                                                 . "version $string_version");
2848                             }
2849                         }
2850                     }
2851                 }
2852                 elsif ($v_version ge v6.0.0) { # Unihan
2853
2854                     # Unihan files didn't get accurate version numbers until
2855                     # 6.0.  The version is somewhere in the first comment
2856                     # block
2857                     while (<$file_handle>) {
2858                         if ($_ !~ /^#/) {
2859                             Carp::my_carp_bug("Could not find the expected "
2860                                             . "version info in file '$file'");
2861                             last;
2862                         }
2863                         chomp;
2864                         $_ =~ s/^#\s*//;
2865                         next if $_ !~ / version: /x;
2866                         last if $_ =~ /$string_version/;
2867                         die Carp::my_carp("File '$file' is version "
2868                                         . "'$_'.  It should be "
2869                                         . "version $string_version");
2870                     }
2871                 }
2872             }
2873         }
2874
2875         print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2876
2877         # Call any special handler for before the file.
2878         &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2879
2880         # Then the main handler
2881         &{$handler{$addr}}($self);
2882
2883         # Then any special post-file handler.
2884         &{$post_handler{$addr}}($self) if $post_handler{$addr};
2885
2886         # If any errors have been accumulated, output the counts (as the first
2887         # error message in each class was output when it was encountered).
2888         if ($errors{$addr}) {
2889             my $total = 0;
2890             my $types = 0;
2891             foreach my $error (keys %{$errors{$addr}}) {
2892                 $total += $errors{$addr}->{$error};
2893                 delete $errors{$addr}->{$error};
2894                 $types++;
2895             }
2896             if ($total > 1) {
2897                 my $message
2898                         = "A total of $total lines had errors in $file.  ";
2899
2900                 $message .= ($types == 1)
2901                             ? '(Only the first one was displayed.)'
2902                             : '(Only the first of each type was displayed.)';
2903                 Carp::my_carp($message);
2904             }
2905         }
2906
2907         if (@{$missings{$addr}}) {
2908             Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines.  Generated tables likely are wrong");
2909         }
2910
2911         # If a real file handle, close it.
2912         close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2913                                                         ref $handle{$addr};
2914         $handle{$addr} = "";   # Uses empty to indicate that has already seen
2915                                # the file, as opposed to undef
2916         return;
2917     }
2918
2919     sub _next_line {
2920         # Sets $_ to be the next logical input line, if any.  Returns non-zero
2921         # if such a line exists.  'logical' means that any lines that have
2922         # been added via insert_lines() will be returned in $_ before the file
2923         # is read again.
2924
2925         my $self = shift;
2926         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2927
2928         my $addr = do { no overloading; pack 'J', $self; };
2929
2930         # Here the file is open (or if the handle is not a ref, is an open
2931         # 'virtual' file).  Get the next line; any inserted lines get priority
2932         # over the file itself.
2933         my $adjusted;
2934
2935         LINE:
2936         while (1) { # Loop until find non-comment, non-empty line
2937             #local $to_trace = 1 if main::DEBUG;
2938             my $inserted_ref = shift @{$added_lines{$addr}};
2939             if (defined $inserted_ref) {
2940                 ($adjusted, $_) = @{$inserted_ref};
2941                 trace $adjusted, $_ if main::DEBUG && $to_trace;
2942                 return 1 if $adjusted;
2943             }
2944             else {
2945                 last if ! ref $handle{$addr}; # Don't read unless is real file
2946                 last if ! defined ($_ = readline $handle{$addr});
2947             }
2948             chomp;
2949             trace $_ if main::DEBUG && $to_trace;
2950
2951             # See if this line is the comment line that defines what property
2952             # value that code points that are not listed in the file should
2953             # have.  The format or existence of these lines is not guaranteed
2954             # by Unicode since they are comments, but the documentation says
2955             # that this was added for machine-readability, so probably won't
2956             # change.  This works starting in Unicode Version 5.0.  They look
2957             # like:
2958             #
2959             # @missing: 0000..10FFFF; Not_Reordered
2960             # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2961             # @missing: 0000..10FFFF; ; NaN
2962             #
2963             # Save the line for a later get_missings() call.
2964             if (/$missing_defaults_prefix/) {
2965                 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2966                     $self->carp_bad_line("Unexpected \@missing line.  Assuming no missing entries");
2967                 }
2968                 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2969                     my @defaults = split /\s* ; \s*/x, $_;
2970
2971                     # The first field is the @missing, which ends in a
2972                     # semi-colon, so can safely shift.
2973                     shift @defaults;
2974
2975                     # Some of these lines may have empty field placeholders
2976                     # which get in the way.  An example is:
2977                     # @missing: 0000..10FFFF; ; NaN
2978                     # Remove them.  Process starting from the top so the
2979                     # splice doesn't affect things still to be looked at.
2980                     for (my $i = @defaults - 1; $i >= 0; $i--) {
2981                         next if $defaults[$i] ne "";
2982                         splice @defaults, $i, 1;
2983                     }
2984
2985                     # What's left should be just the property (maybe) and the
2986                     # default.  Having only one element means it doesn't have
2987                     # the property.
2988                     my $default;
2989                     my $property;
2990                     if (@defaults >= 1) {
2991                         if (@defaults == 1) {
2992                             $default = $defaults[0];
2993                         }
2994                         else {
2995                             $property = $defaults[0];
2996                             $default = $defaults[1];
2997                         }
2998                     }
2999
3000                     if (@defaults < 1
3001                         || @defaults > 2
3002                         || ($default =~ /^</
3003                             && $default !~ /^<code *point>$/i
3004                             && $default !~ /^<none>$/i
3005                             && $default !~ /^<script>$/i))
3006                     {
3007                         $self->carp_bad_line("Unrecognized \@missing line: $_.  Assuming no missing entries");
3008                     }
3009                     else {
3010
3011                         # If the property is missing from the line, it should
3012                         # be the one for the whole file
3013                         $property = $property{$addr} if ! defined $property;
3014
3015                         # Change <none> to the null string, which is what it
3016                         # really means.  If the default is the code point
3017                         # itself, set it to <code point>, which is what
3018                         # Unicode uses (but sometimes they've forgotten the
3019                         # space)
3020                         if ($default =~ /^<none>$/i) {
3021                             $default = "";
3022                         }
3023                         elsif ($default =~ /^<code *point>$/i) {
3024                             $default = $CODE_POINT;
3025                         }
3026                         elsif ($default =~ /^<script>$/i) {
3027
3028                             # Special case this one.  Currently is from
3029                             # ScriptExtensions.txt, and means for all unlisted
3030                             # code points, use their Script property values.
3031                             # For the code points not listed in that file, the
3032                             # default value is 'Unknown'.
3033                             $default = "Unknown";
3034                         }
3035
3036                         # Store them as a sub-arrays with both components.
3037                         push @{$missings{$addr}}, [ $default, $property ];
3038                     }
3039                 }
3040
3041                 # There is nothing for the caller to process on this comment
3042                 # line.
3043                 next;
3044             }
3045
3046             # Unless to keep, remove comments.  If to keep, ignore
3047             # comment-only lines
3048             if ($retain_trailing_comments{$addr}) {
3049                 next if / ^ \s* \# /x;
3050
3051                 # But escape any single quotes (done in both the comment and
3052                 # non-comment portion; this could be a bug someday, but not
3053                 # likely)
3054                 s/'/\\'/g;
3055             }
3056             else {
3057                 s/#.*//;
3058             }
3059
3060             # Remove trailing space, and skip this line if the result is empty
3061             s/\s+$//;
3062             next if /^$/;
3063
3064             # Call any handlers for this line, and skip further processing of
3065             # the line if the handler sets the line to null.
3066             foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3067                 &{$sub_ref}($self);
3068                 next LINE if /^$/;
3069             }
3070
3071             # Here the line is ok.  return success.
3072             return 1;
3073         } # End of looping through lines.
3074
3075         # If there are EOF handlers, call each (only once) and if it generates
3076         # more lines to process go back in the loop to handle them.
3077         while ($eof_handler{$addr}->@*) {
3078             &{$eof_handler{$addr}[0]}($self);
3079             shift $eof_handler{$addr}->@*;   # Currently only get one shot at it.
3080             goto LINE if $added_lines{$addr};
3081         }
3082
3083         # Return failure -- no more lines.
3084         return 0;
3085
3086     }
3087
3088     sub _next_line_with_remapped_range {
3089         my $self = shift;
3090         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3091
3092         # like _next_line(), but for use on non-ASCII platforms.  It sets $_
3093         # to be the next logical input line, if any.  Returns non-zero if such
3094         # a line exists.  'logical' means that any lines that have been added
3095         # via insert_lines() will be returned in $_ before the file is read
3096         # again.
3097         #
3098         # The difference from _next_line() is that this remaps the Unicode
3099         # code points in the input to those of the native platform.  Each
3100         # input line contains a single code point, or a single contiguous
3101         # range of them  This routine splits each range into its individual
3102         # code points and caches them.  It returns the cached values,
3103         # translated into their native equivalents, one at a time, for each
3104         # call, before reading the next line.  Since native values can only be
3105         # a single byte wide, no translation is needed for code points above
3106         # 0xFF, and ranges that are entirely above that number are not split.
3107         # If an input line contains the range 254-1000, it would be split into
3108         # three elements: 254, 255, and 256-1000.  (The downstream table
3109         # insertion code will sort and coalesce the individual code points
3110         # into appropriate ranges.)
3111
3112         my $addr = do { no overloading; pack 'J', $self; };
3113
3114         while (1) {
3115
3116             # Look in cache before reading the next line.  Return any cached
3117             # value, translated
3118             my $inserted = shift @{$remapped_lines{$addr}};
3119             if (defined $inserted) {
3120                 trace $inserted if main::DEBUG && $to_trace;
3121                 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3122                 trace $_ if main::DEBUG && $to_trace;
3123                 return 1;
3124             }
3125
3126             # Get the next line.
3127             return 0 unless _next_line($self);
3128
3129             # If there is a special handler for it, return the line,
3130             # untranslated.  This should happen only for files that are
3131             # special, not being code-point related, such as property names.
3132             return 1 if $handler{$addr}
3133                                     != \&main::process_generic_property_file;
3134
3135             my ($range, $property_name, $map, @remainder)
3136                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3137
3138             if (@remainder
3139                 || ! defined $property_name
3140                 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3141             {
3142                 Carp::my_carp_bug("Unrecognized input line '$_'.  Ignored");
3143             }
3144
3145             my $low = hex $1;
3146             my $high = (defined $2) ? hex $2 : $low;
3147
3148             # If the input maps the range to another code point, remap the
3149             # target if it is between 0 and 255.
3150             my $tail;
3151             if (defined $map) {
3152                 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3153                 $tail = "$property_name; $map";
3154                 $_ = "$range; $tail";
3155             }
3156             else {
3157                 $tail = $property_name;
3158             }
3159
3160             # If entire range is above 255, just return it, unchanged (except
3161             # any mapped-to code point, already changed above)
3162             return 1 if $low > 255;
3163
3164             # Cache an entry for every code point < 255.  For those in the
3165             # range above 255, return a dummy entry for just that portion of
3166             # the range.  Note that this will be out-of-order, but that is not
3167             # a problem.
3168             foreach my $code_point ($low .. $high) {
3169                 if ($code_point > 255) {
3170                     $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3171                     return 1;
3172                 }
3173                 push @{$remapped_lines{$addr}}, "$code_point; $tail";
3174             }
3175         } # End of looping through lines.
3176
3177         # NOTREACHED
3178     }
3179
3180 #   Not currently used, not fully tested.
3181 #    sub peek {
3182 #        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3183 #        # record.  Not callable from an each_line_handler(), nor does it call
3184 #        # an each_line_handler() on the line.
3185 #
3186 #        my $self = shift;
3187 #        my $addr = do { no overloading; pack 'J', $self; };
3188 #
3189 #        foreach my $inserted_ref (@{$added_lines{$addr}}) {
3190 #            my ($adjusted, $line) = @{$inserted_ref};
3191 #            next if $adjusted;
3192 #
3193 #            # Remove comments and trailing space, and return a non-empty
3194 #            # resulting line
3195 #            $line =~ s/#.*//;
3196 #            $line =~ s/\s+$//;
3197 #            return $line if $line ne "";
3198 #        }
3199 #
3200 #        return if ! ref $handle{$addr}; # Don't read unless is real file
3201 #        while (1) { # Loop until find non-comment, non-empty line
3202 #            local $to_trace = 1 if main::DEBUG;
3203 #            trace $_ if main::DEBUG && $to_trace;
3204 #            return if ! defined (my $line = readline $handle{$addr});
3205 #            chomp $line;
3206 #            push @{$added_lines{$addr}}, [ 0, $line ];
3207 #
3208 #            $line =~ s/#.*//;
3209 #            $line =~ s/\s+$//;
3210 #            return $line if $line ne "";
3211 #        }
3212 #
3213 #        return;
3214 #    }
3215
3216
3217     sub insert_lines {
3218         # Lines can be inserted so that it looks like they were in the input
3219         # file at the place it was when this routine is called.  See also
3220         # insert_adjusted_lines().  Lines inserted via this routine go through
3221         # any each_line_handler()
3222
3223         my $self = shift;
3224
3225         # Each inserted line is an array, with the first element being 0 to
3226         # indicate that this line hasn't been adjusted, and needs to be
3227         # processed.
3228         no overloading;
3229         push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
3230         return;
3231     }
3232
3233     sub insert_adjusted_lines {
3234         # Lines can be inserted so that it looks like they were in the input
3235         # file at the place it was when this routine is called.  See also
3236         # insert_lines().  Lines inserted via this routine are already fully
3237         # adjusted, ready to be processed; each_line_handler()s handlers will
3238         # not be called.  This means this is not a completely general
3239         # facility, as only the last each_line_handler on the stack should
3240         # call this.  It could be made more general, by passing to each of the
3241         # line_handlers their position on the stack, which they would pass on
3242         # to this routine, and that would replace the boolean first element in
3243         # the anonymous array pushed here, so that the next_line routine could
3244         # use that to call only those handlers whose index is after it on the
3245         # stack.  But this is overkill for what is needed now.
3246
3247         my $self = shift;
3248         trace $_[0] if main::DEBUG && $to_trace;
3249
3250         # Each inserted line is an array, with the first element being 1 to
3251         # indicate that this line has been adjusted
3252         no overloading;
3253         push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
3254         return;
3255     }
3256
3257     sub get_missings {
3258         # Returns the stored up @missings lines' values, and clears the list.
3259         # The values are in an array, consisting of the default in the first
3260         # element, and the property in the 2nd.  However, since these lines
3261         # can be stacked up, the return is an array of all these arrays.
3262
3263         my $self = shift;
3264         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3265
3266         my $addr = do { no overloading; pack 'J', $self; };
3267
3268         # If not accepting a list return, just return the first one.
3269         return shift @{$missings{$addr}} unless wantarray;
3270
3271         my @return = @{$missings{$addr}};
3272         undef @{$missings{$addr}};
3273         return @return;
3274     }
3275
3276     sub _exclude_unassigned {
3277
3278         # Takes the range in $_ and excludes code points that aren't assigned
3279         # in this release
3280
3281         state $skip_inserted_count = 0;
3282
3283         # Ignore recursive calls.
3284         if ($skip_inserted_count) {
3285             $skip_inserted_count--;
3286             return;
3287         }
3288
3289         # Find what code points are assigned in this release
3290         main::calculate_Assigned() if ! defined $Assigned;
3291
3292         my $self = shift;
3293         my $addr = do { no overloading; pack 'J', $self; };
3294         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3295
3296         my ($range, @remainder)
3297             = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3298
3299         # Examine the range.
3300         if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3301         {
3302             my $low = hex $1;
3303             my $high = (defined $2) ? hex $2 : $low;
3304
3305             # Split the range into subranges of just those code points in it
3306             # that are assigned.
3307             my @ranges = (Range_List->new(Initialize
3308                               => Range->new($low, $high)) & $Assigned)->ranges;
3309
3310             # Do nothing if nothing in the original range is assigned in this
3311             # release; handle normally if everything is in this release.
3312             if (! @ranges) {
3313                 $_ = "";
3314             }
3315             elsif (@ranges != 1) {
3316
3317                 # Here, some code points in the original range aren't in this
3318                 # release; @ranges gives the ones that are.  Create fake input
3319                 # lines for each of the ranges, and set things up so that when
3320                 # this routine is called on that fake input, it will do
3321                 # nothing.
3322                 $skip_inserted_count = @ranges;
3323                 my $remainder = join ";", @remainder;
3324                 for my $range (@ranges) {
3325                     $self->insert_lines(sprintf("%04X..%04X;%s",
3326                                     $range->start, $range->end, $remainder));
3327                 }
3328                 $_ = "";    # The original range is now defunct.
3329             }
3330         }
3331
3332         return;
3333     }
3334
3335     sub _fixup_obsolete_hanguls {
3336
3337         # This is called only when compiling Unicode version 1.  All Unicode
3338         # data for subsequent releases assumes that the code points that were
3339         # Hangul syllables in this release only are something else, so if
3340         # using such data, we have to override it
3341
3342         my $self = shift;
3343         my $addr = do { no overloading; pack 'J', $self; };
3344         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3345
3346         my $object = main::property_ref($property{$addr});
3347         $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3348                          $FINAL_REMOVED_HANGUL_SYLLABLE,
3349                          $early{$addr}[3],  # Passed-in value for these
3350                          Replace => $UNCONDITIONALLY);
3351     }
3352
3353     sub _insert_property_into_line {
3354         # Add a property field to $_, if this file requires it.
3355
3356         my $self = shift;
3357         my $addr = do { no overloading; pack 'J', $self; };
3358         my $property = $property{$addr};
3359         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3360
3361         $_ =~ s/(;|$)/; $property$1/;
3362         return;
3363     }
3364
3365     sub carp_bad_line {
3366         # Output consistent error messages, using either a generic one, or the
3367         # one given by the optional parameter.  To avoid gazillions of the
3368         # same message in case the syntax of a  file is way off, this routine
3369         # only outputs the first instance of each message, incrementing a
3370         # count so the totals can be output at the end of the file.
3371
3372         my $self = shift;
3373         my $message = shift;
3374         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3375
3376         my $addr = do { no overloading; pack 'J', $self; };
3377
3378         $message = 'Unexpected line' unless $message;
3379
3380         # No trailing punctuation so as to fit with our addenda.
3381         $message =~ s/[.:;,]$//;
3382
3383         # If haven't seen this exact message before, output it now.  Otherwise
3384         # increment the count of how many times it has occurred
3385         unless ($errors{$addr}->{$message}) {
3386             Carp::my_carp("$message in '$_' in "
3387                             . $file{$addr}
3388                             . " at line $..  Skipping this line;");
3389             $errors{$addr}->{$message} = 1;
3390         }
3391         else {
3392             $errors{$addr}->{$message}++;
3393         }
3394
3395         # Clear the line to prevent any further (meaningful) processing of it.
3396         $_ = "";
3397
3398         return;
3399     }
3400 } # End closure
3401
3402 package Multi_Default;
3403
3404 # Certain properties in early versions of Unicode had more than one possible
3405 # default for code points missing from the files.  In these cases, one
3406 # default applies to everything left over after all the others are applied,
3407 # and for each of the others, there is a description of which class of code
3408 # points applies to it.  This object helps implement this by storing the
3409 # defaults, and for all but that final default, an eval string that generates
3410 # the class that it applies to.
3411
3412
3413 {   # Closure
3414
3415     main::setup_package();
3416
3417     my %class_defaults;
3418     # The defaults structure for the classes
3419     main::set_access('class_defaults', \%class_defaults);
3420
3421     my %other_default;
3422     # The default that applies to everything left over.
3423     main::set_access('other_default', \%other_default, 'r');
3424
3425
3426     sub new {
3427         # The constructor is called with default => eval pairs, terminated by
3428         # the left-over default. e.g.
3429         # Multi_Default->new(
3430         #        'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3431         #               -  0x200D',
3432         #        'R' => 'some other expression that evaluates to code points',
3433         #        .
3434         #        .
3435         #        .
3436         #        'U'));
3437         # It is best to leave the final value be the one that matches the
3438         # above-Unicode code points.
3439
3440         my $class = shift;
3441
3442         my $self = bless \do{my $anonymous_scalar}, $class;
3443         my $addr = do { no overloading; pack 'J', $self; };
3444
3445         while (@_ > 1) {
3446             my $default = shift;
3447             my $eval = shift;
3448             $class_defaults{$addr}->{$default} = $eval;
3449         }
3450
3451         $other_default{$addr} = shift;
3452
3453         return $self;
3454     }
3455
3456     sub get_next_defaults {
3457         # Iterates and returns the next class of defaults.
3458         my $self = shift;
3459         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3460
3461         my $addr = do { no overloading; pack 'J', $self; };
3462
3463         return each %{$class_defaults{$addr}};
3464     }
3465 }
3466
3467 package Alias;
3468
3469 # An alias is one of the names that a table goes by.  This class defines them
3470 # including some attributes.  Everything is currently setup in the
3471 # constructor.
3472
3473
3474 {   # Closure
3475
3476     main::setup_package();
3477
3478     my %name;
3479     main::set_access('name', \%name, 'r');
3480
3481     my %loose_match;
3482     # Should this name match loosely or not.
3483     main::set_access('loose_match', \%loose_match, 'r');
3484
3485     my %make_re_pod_entry;
3486     # Some aliases should not get their own entries in the re section of the
3487     # pod, because they are covered by a wild-card, and some we want to
3488     # discourage use of.  Binary
3489     main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3490
3491     my %ucd;
3492     # Is this documented to be accessible via Unicode::UCD
3493     main::set_access('ucd', \%ucd, 'r', 's');
3494
3495     my %status;
3496     # Aliases have a status, like deprecated, or even suppressed (which means
3497     # they don't appear in documentation).  Enum
3498     main::set_access('status', \%status, 'r');
3499
3500     my %ok_as_filename;
3501     # Similarly, some aliases should not be considered as usable ones for
3502     # external use, such as file names, or we don't want documentation to
3503     # recommend them.  Boolean
3504     main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3505
3506     sub new {
3507         my $class = shift;
3508
3509         my $self = bless \do { my $anonymous_scalar }, $class;
3510         my $addr = do { no overloading; pack 'J', $self; };
3511
3512         $name{$addr} = shift;
3513         $loose_match{$addr} = shift;
3514         $make_re_pod_entry{$addr} = shift;
3515         $ok_as_filename{$addr} = shift;
3516         $status{$addr} = shift;
3517         $ucd{$addr} = shift;
3518
3519         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3520
3521         # Null names are never ok externally
3522         $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3523
3524         return $self;
3525     }
3526 }
3527
3528 package Range;
3529
3530 # A range is the basic unit for storing code points, and is described in the
3531 # comments at the beginning of the program.  Each range has a starting code
3532 # point; an ending code point (not less than the starting one); a value
3533 # that applies to every code point in between the two end-points, inclusive;
3534 # and an enum type that applies to the value.  The type is for the user's
3535 # convenience, and has no meaning here, except that a non-zero type is
3536 # considered to not obey the normal Unicode rules for having standard forms.
3537 #
3538 # The same structure is used for both map and match tables, even though in the
3539 # latter, the value (and hence type) is irrelevant and could be used as a
3540 # comment.  In map tables, the value is what all the code points in the range
3541 # map to.  Type 0 values have the standardized version of the value stored as
3542 # well, so as to not have to recalculate it a lot.
3543
3544 sub trace { return main::trace(@_); }
3545
3546 {   # Closure
3547
3548     main::setup_package();
3549
3550     my %start;
3551     main::set_access('start', \%start, 'r', 's');
3552
3553     my %end;
3554     main::set_access('end', \%end, 'r', 's');
3555
3556     my %value;
3557     main::set_access('value', \%value, 'r', 's');
3558
3559     my %type;
3560     main::set_access('type', \%type, 'r');
3561
3562     my %standard_form;
3563     # The value in internal standard form.  Defined only if the type is 0.
3564     main::set_access('standard_form', \%standard_form);
3565
3566     # Note that if these fields change, the dump() method should as well
3567
3568     sub new {
3569         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3570         my $class = shift;
3571
3572         my $self = bless \do { my $anonymous_scalar }, $class;
3573         my $addr = do { no overloading; pack 'J', $self; };
3574
3575         $start{$addr} = shift;
3576         $end{$addr} = shift;
3577
3578         my %args = @_;
3579
3580         my $value = delete $args{'Value'};  # Can be 0
3581         $value = "" unless defined $value;
3582         $value{$addr} = $value;
3583
3584         $type{$addr} = delete $args{'Type'} || 0;
3585
3586         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3587
3588         return $self;
3589     }
3590
3591     use overload
3592         fallback => 0,
3593         qw("") => "_operator_stringify",
3594         "." => \&main::_operator_dot,
3595         ".=" => \&main::_operator_dot_equal,
3596     ;
3597
3598     sub _operator_stringify {
3599         my $self = shift;
3600         my $addr = do { no overloading; pack 'J', $self; };
3601
3602         # Output it like '0041..0065 (value)'
3603         my $return = sprintf("%04X", $start{$addr})
3604                         .  '..'
3605                         . sprintf("%04X", $end{$addr});
3606         my $value = $value{$addr};
3607         my $type = $type{$addr};
3608         $return .= ' (';
3609         $return .= "$value";
3610         $return .= ", Type=$type" if $type != 0;
3611         $return .= ')';
3612
3613         return $return;
3614     }
3615
3616     sub standard_form {
3617         # Calculate the standard form only if needed, and cache the result.
3618         # The standard form is the value itself if the type is special.
3619         # This represents a considerable CPU and memory saving - at the time
3620         # of writing there are 368676 non-special objects, but the standard
3621         # form is only requested for 22047 of them - ie about 6%.
3622
3623         my $self = shift;
3624         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3625
3626         my $addr = do { no overloading; pack 'J', $self; };
3627
3628         return $standard_form{$addr} if defined $standard_form{$addr};
3629
3630         my $value = $value{$addr};
3631         return $value if $type{$addr};
3632         return $standard_form{$addr} = main::standardize($value);
3633     }
3634
3635     sub dump {
3636         # Human, not machine readable.  For machine readable, comment out this
3637         # entire routine and let the standard one take effect.
3638         my $self = shift;
3639         my $indent = shift;
3640         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3641
3642         my $addr = do { no overloading; pack 'J', $self; };
3643
3644         my $return = $indent
3645                     . sprintf("%04X", $start{$addr})
3646                     . '..'
3647                     . sprintf("%04X", $end{$addr})
3648                     . " '$value{$addr}';";
3649         if (! defined $standard_form{$addr}) {
3650             $return .= "(type=$type{$addr})";
3651         }
3652         elsif ($standard_form{$addr} ne $value{$addr}) {
3653             $return .= "(standard '$standard_form{$addr}')";
3654         }
3655         return $return;
3656     }
3657 } # End closure
3658
3659 package _Range_List_Base;
3660
3661 # Base class for range lists.  A range list is simply an ordered list of
3662 # ranges, so that the ranges with the lowest starting numbers are first in it.
3663 #
3664 # When a new range is added that is adjacent to an existing range that has the
3665 # same value and type, it merges with it to form a larger range.
3666 #
3667 # Ranges generally do not overlap, except that there can be multiple entries
3668 # of single code point ranges.  This is because of NameAliases.txt.
3669 #
3670 # In this program, there is a standard value such that if two different
3671 # values, have the same standard value, they are considered equivalent.  This
3672 # value was chosen so that it gives correct results on Unicode data
3673
3674 # There are a number of methods to manipulate range lists, and some operators
3675 # are overloaded to handle them.
3676
3677 sub trace { return main::trace(@_); }
3678
3679 { # Closure
3680
3681     our $addr;
3682
3683     # Max is initialized to a negative value that isn't adjacent to 0, for
3684     # simpler tests
3685     my $max_init = -2;
3686
3687     main::setup_package();
3688
3689     my %ranges;
3690     # The list of ranges
3691     main::set_access('ranges', \%ranges, 'readable_array');
3692
3693     my %max;
3694     # The highest code point in the list.  This was originally a method, but
3695     # actual measurements said it was used a lot.
3696     main::set_access('max', \%max, 'r');
3697
3698     my %each_range_iterator;
3699     # Iterator position for each_range()
3700     main::set_access('each_range_iterator', \%each_range_iterator);
3701
3702     my %owner_name_of;
3703     # Name of parent this is attached to, if any.  Solely for better error
3704     # messages.
3705     main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3706
3707     my %_search_ranges_cache;
3708     # A cache of the previous result from _search_ranges(), for better
3709     # performance
3710     main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3711
3712     sub new {
3713         my $class = shift;
3714         my %args = @_;
3715
3716         # Optional initialization data for the range list.
3717         my $initialize = delete $args{'Initialize'};
3718
3719         my $self;
3720
3721         # Use _union() to initialize.  _union() returns an object of this
3722         # class, which means that it will call this constructor recursively.
3723         # But it won't have this $initialize parameter so that it won't
3724         # infinitely loop on this.
3725         return _union($class, $initialize, %args) if defined $initialize;
3726
3727         $self = bless \do { my $anonymous_scalar }, $class;
3728         my $addr = do { no overloading; pack 'J', $self; };
3729
3730         # Optional parent object, only for debug info.
3731         $owner_name_of{$addr} = delete $args{'Owner'};
3732         $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3733
3734         # Stringify, in case it is an object.
3735         $owner_name_of{$addr} = "$owner_name_of{$addr}";
3736
3737         # This is used only for error messages, and so a colon is added
3738         $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3739
3740         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3741
3742         $max{$addr} = $max_init;
3743
3744         $_search_ranges_cache{$addr} = 0;
3745         $ranges{$addr} = [];
3746
3747         return $self;
3748     }
3749
3750     use overload
3751         fallback => 0,
3752         qw("") => "_operator_stringify",
3753         "." => \&main::_operator_dot,
3754         ".=" => \&main::_operator_dot_equal,
3755     ;
3756
3757     sub _operator_stringify {
3758         my $self = shift;
3759         my $addr = do { no overloading; pack 'J', $self; };
3760
3761         return "Range_List attached to '$owner_name_of{$addr}'"
3762                                                 if $owner_name_of{$addr};
3763         return "anonymous Range_List " . \$self;
3764     }
3765
3766     sub _union {
3767         # Returns the union of the input code points.  It can be called as
3768         # either a constructor or a method.  If called as a method, the result
3769         # will be a new() instance of the calling object, containing the union
3770         # of that object with the other parameter's code points;  if called as
3771         # a constructor, the first parameter gives the class that the new object
3772         # should be, and the second parameter gives the code points to go into
3773         # it.
3774         # In either case, there are two parameters looked at by this routine;
3775         # any additional parameters are passed to the new() constructor.
3776         #
3777         # The code points can come in the form of some object that contains
3778         # ranges, and has a conventionally named method to access them; or
3779         # they can be an array of individual code points (as integers); or
3780         # just a single code point.
3781         #
3782         # If they are ranges, this routine doesn't make any effort to preserve
3783         # the range values and types of one input over the other.  Therefore
3784         # this base class should not allow _union to be called from other than
3785         # initialization code, so as to prevent two tables from being added
3786         # together where the range values matter.  The general form of this
3787         # routine therefore belongs in a derived class, but it was moved here
3788         # to avoid duplication of code.  The failure to overload this in this
3789         # class keeps it safe.
3790         #
3791         # It does make the effort during initialization to accept tables with
3792         # multiple values for the same code point, and to preserve the order
3793         # of these.  If there is only one input range or range set, it doesn't
3794         # sort (as it should already be sorted to the desired order), and will
3795         # accept multiple values per code point.  Otherwise it will merge
3796         # multiple values into a single one.
3797
3798         my $self;
3799         my @args;   # Arguments to pass to the constructor
3800
3801         my $class = shift;
3802
3803         # If a method call, will start the union with the object itself, and
3804         # the class of the new object will be the same as self.
3805         if (ref $class) {
3806             $self = $class;
3807             $class = ref $self;
3808             push @args, $self;
3809         }
3810
3811         # Add the other required parameter.
3812         push @args, shift;
3813         # Rest of parameters are passed on to the constructor
3814
3815         # Accumulate all records from both lists.
3816         my @records;
3817         my $input_count = 0;
3818         for my $arg (@args) {
3819             #local $to_trace = 0 if main::DEBUG;
3820             trace "argument = $arg" if main::DEBUG && $to_trace;
3821             if (! defined $arg) {
3822                 my $message = "";
3823                 if (defined $self) {
3824                     no overloading;
3825                     $message .= $owner_name_of{pack 'J', $self};
3826                 }
3827                 Carp::my_carp_bug($message . "Undefined argument to _union.  No union done.");
3828                 return;
3829             }
3830
3831             $arg = [ $arg ] if ! ref $arg;
3832             my $type = ref $arg;
3833             if ($type eq 'ARRAY') {
3834                 foreach my $element (@$arg) {
3835                     push @records, Range->new($element, $element);
3836                     $input_count++;
3837                 }
3838             }
3839             elsif ($arg->isa('Range')) {
3840                 push @records, $arg;
3841                 $input_count++;
3842             }
3843             elsif ($arg->can('ranges')) {
3844                 push @records, $arg->ranges;
3845                 $input_count++;
3846             }
3847             else {
3848                 my $message = "";
3849                 if (defined $self) {
3850                     no overloading;
3851                     $message .= $owner_name_of{pack 'J', $self};
3852                 }
3853                 Carp::my_carp_bug($message . "Cannot take the union of a $type.  No union done.");
3854                 return;
3855             }
3856         }
3857
3858         # Sort with the range containing the lowest ordinal first, but if
3859         # two ranges start at the same code point, sort with the bigger range
3860         # of the two first, because it takes fewer cycles.
3861         if ($input_count > 1) {
3862             @records = sort { ($a->start <=> $b->start)
3863                                       or
3864                                     # if b is shorter than a, b->end will be
3865                                     # less than a->end, and we want to select
3866                                     # a, so want to return -1
3867                                     ($b->end <=> $a->end)
3868                                    } @records;
3869         }
3870
3871         my $new = $class->new(@_);
3872
3873         # Fold in records so long as they add new information.
3874         for my $set (@records) {
3875             my $start = $set->start;
3876             my $end   = $set->end;
3877             my $value = $set->value;
3878             my $type  = $set->type;
3879             if ($start > $new->max) {
3880                 $new->_add_delete('+', $start, $end, $value, Type => $type);
3881             }
3882             elsif ($end > $new->max) {
3883                 $new->_add_delete('+', $new->max +1, $end, $value,
3884                                                                 Type => $type);
3885             }
3886             elsif ($input_count == 1) {
3887                 # Here, overlaps existing range, but is from a single input,
3888                 # so preserve the multiple values from that input.
3889                 $new->_add_delete('+', $start, $end, $value, Type => $type,
3890                                                 Replace => $MULTIPLE_AFTER);
3891             }
3892         }
3893
3894         return $new;
3895     }
3896
3897     sub range_count {        # Return the number of ranges in the range list
3898         my $self = shift;
3899         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3900
3901         no overloading;
3902         return scalar @{$ranges{pack 'J', $self}};
3903     }
3904
3905     sub min {
3906         # Returns the minimum code point currently in the range list, or if
3907         # the range list is empty, 2 beyond the max possible.  This is a
3908         # method because used so rarely, that not worth saving between calls,
3909         # and having to worry about changing it as ranges are added and
3910         # deleted.
3911
3912         my $self = shift;
3913         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3914
3915         my $addr = do { no overloading; pack 'J', $self; };
3916
3917         # If the range list is empty, return a large value that isn't adjacent
3918         # to any that could be in the range list, for simpler tests
3919         return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3920         return $ranges{$addr}->[0]->start;
3921     }
3922
3923     sub contains {
3924         # Boolean: Is argument in the range list?  If so returns $i such that:
3925         #   range[$i]->end < $codepoint <= range[$i+1]->end
3926         # which is one beyond what you want; this is so that the 0th range
3927         # doesn't return false
3928         my $self = shift;
3929         my $codepoint = shift;
3930         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3931
3932         my $i = $self->_search_ranges($codepoint);
3933         return 0 unless defined $i;
3934
3935         # The search returns $i, such that
3936         #   range[$i-1]->end < $codepoint <= range[$i]->end
3937         # So is in the table if and only iff it is at least the start position
3938         # of range $i.
3939         no overloading;
3940         return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3941         return $i + 1;
3942     }
3943
3944     sub containing_range {
3945         # Returns the range object that contains the code point, undef if none
3946
3947         my $self = shift;
3948         my $codepoint = shift;
3949         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3950
3951         my $i = $self->contains($codepoint);
3952         return unless $i;
3953
3954         # contains() returns 1 beyond where we should look
3955         no overloading;
3956         return $ranges{pack 'J', $self}->[$i-1];
3957     }
3958
3959     sub value_of {
3960         # Returns the value associated with the code point, undef if none
3961
3962         my $self = shift;
3963         my $codepoint = shift;
3964         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3965
3966         my $range = $self->containing_range($codepoint);
3967         return unless defined $range;
3968
3969         return $range->value;
3970     }
3971
3972     sub type_of {
3973         # Returns the type of the range containing the code point, undef if
3974         # the code point is not in the table
3975
3976         my $self = shift;
3977         my $codepoint = shift;
3978         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3979
3980         my $range = $self->containing_range($codepoint);
3981         return unless defined $range;
3982
3983         return $range->type;
3984     }
3985
3986     sub _search_ranges {
3987         # Find the range in the list which contains a code point, or where it
3988         # should go if were to add it.  That is, it returns $i, such that:
3989         #   range[$i-1]->end < $codepoint <= range[$i]->end
3990         # Returns undef if no such $i is possible (e.g. at end of table), or
3991         # if there is an error.
3992
3993         my $self = shift;
3994         my $code_point = shift;
3995         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3996
3997         my $addr = do { no overloading; pack 'J', $self; };
3998
3999         return if $code_point > $max{$addr};
4000         my $r = $ranges{$addr};                # The current list of ranges
4001         my $range_list_size = scalar @$r;
4002         my $i;
4003
4004         use integer;        # want integer division
4005
4006         # Use the cached result as the starting guess for this one, because,
4007         # an experiment on 5.1 showed that 90% of the time the cache was the
4008         # same as the result on the next call (and 7% it was one less).
4009         $i = $_search_ranges_cache{$addr};
4010         $i = 0 if $i >= $range_list_size;   # Reset if no longer valid (prob.
4011                                             # from an intervening deletion
4012         #local $to_trace = 1 if main::DEBUG;
4013         trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
4014         return $i if $code_point <= $r->[$i]->end
4015                      && ($i == 0 || $r->[$i-1]->end < $code_point);
4016
4017         # Here the cache doesn't yield the correct $i.  Try adding 1.
4018         if ($i < $range_list_size - 1
4019             && $r->[$i]->end < $code_point &&
4020             $code_point <= $r->[$i+1]->end)
4021         {
4022             $i++;
4023             trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
4024             $_search_ranges_cache{$addr} = $i;
4025             return $i;
4026         }
4027
4028         # Here, adding 1 also didn't work.  We do a binary search to
4029         # find the correct position, starting with current $i
4030         my $lower = 0;
4031         my $upper = $range_list_size - 1;
4032         while (1) {
4033             trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
4034
4035             if ($code_point <= $r->[$i]->end) {
4036
4037                 # Here we have met the upper constraint.  We can quit if we
4038                 # also meet the lower one.
4039                 last if $i == 0 || $r->[$i-1]->end < $code_point;
4040
4041                 $upper = $i;        # Still too high.
4042
4043             }
4044             else {
4045
4046                 # Here, $r[$i]->end < $code_point, so look higher up.
4047                 $lower = $i;
4048             }
4049
4050             # Split search domain in half to try again.
4051             my $temp = ($upper + $lower) / 2;
4052
4053             # No point in continuing unless $i changes for next time
4054             # in the loop.
4055             if ($temp == $i) {
4056
4057                 # We can't reach the highest element because of the averaging.
4058                 # So if one below the upper edge, force it there and try one
4059                 # more time.
4060                 if ($i == $range_list_size - 2) {
4061
4062                     trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4063                     $i = $range_list_size - 1;
4064
4065                     # Change $lower as well so if fails next time through,
4066                     # taking the average will yield the same $i, and we will
4067                     # quit with the error message just below.
4068                     $lower = $i;
4069                     next;
4070                 }
4071                 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go.  No action taken.");
4072                 return;
4073             }
4074             $i = $temp;
4075         } # End of while loop
4076
4077         if (main::DEBUG && $to_trace) {
4078             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4079             trace "i=  [ $i ]", $r->[$i];
4080             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4081         }
4082
4083         # Here we have found the offset.  Cache it as a starting point for the
4084         # next call.
4085         $_search_ranges_cache{$addr} = $i;
4086         return $i;
4087     }
4088
4089     sub _add_delete {
4090         # Add, replace or delete ranges to or from a list.  The $type
4091         # parameter gives which:
4092         #   '+' => insert or replace a range, returning a list of any changed
4093         #          ranges.
4094         #   '-' => delete a range, returning a list of any deleted ranges.
4095         #
4096         # The next three parameters give respectively the start, end, and
4097         # value associated with the range.  'value' should be null unless the
4098         # operation is '+';
4099         #
4100         # The range list is kept sorted so that the range with the lowest
4101         # starting position is first in the list, and generally, adjacent
4102         # ranges with the same values are merged into a single larger one (see
4103         # exceptions below).
4104         #
4105         # There are more parameters; all are key => value pairs:
4106         #   Type    gives the type of the value.  It is only valid for '+'.
4107         #           All ranges have types; if this parameter is omitted, 0 is
4108         #           assumed.  Ranges with type 0 are assumed to obey the
4109         #           Unicode rules for casing, etc; ranges with other types are
4110         #           not.  Otherwise, the type is arbitrary, for the caller's
4111         #           convenience, and looked at only by this routine to keep
4112         #           adjacent ranges of different types from being merged into
4113         #           a single larger range, and when Replace =>
4114         #           $IF_NOT_EQUIVALENT is specified (see just below).
4115         #   Replace  determines what to do if the range list already contains
4116         #            ranges which coincide with all or portions of the input
4117         #            range.  It is only valid for '+':
4118         #       => $NO            means that the new value is not to replace
4119         #                         any existing ones, but any empty gaps of the
4120         #                         range list coinciding with the input range
4121         #                         will be filled in with the new value.
4122         #       => $UNCONDITIONALLY  means to replace the existing values with
4123         #                         this one unconditionally.  However, if the
4124         #                         new and old values are identical, the
4125         #                         replacement is skipped to save cycles
4126         #       => $IF_NOT_EQUIVALENT means to replace the existing values
4127         #          (the default)  with this one if they are not equivalent.
4128         #                         Ranges are equivalent if their types are the
4129         #                         same, and they are the same string; or if
4130         #                         both are type 0 ranges, if their Unicode
4131         #                         standard forms are identical.  In this last
4132         #                         case, the routine chooses the more "modern"
4133         #                         one to use.  This is because some of the
4134         #                         older files are formatted with values that
4135         #                         are, for example, ALL CAPs, whereas the
4136         #                         derived files have a more modern style,
4137         #                         which looks better.  By looking for this
4138         #                         style when the pre-existing and replacement
4139         #                         standard forms are the same, we can move to
4140         #                         the modern style
4141         #       => $MULTIPLE_BEFORE means that if this range duplicates an
4142         #                         existing one, but has a different value,
4143         #                         don't replace the existing one, but insert
4144         #                         this one so that the same range can occur
4145         #                         multiple times.  They are stored LIFO, so
4146         #                         that the final one inserted is the first one
4147         #                         returned in an ordered search of the table.
4148         #                         If this is an exact duplicate, including the
4149         #                         value, the original will be moved to be
4150         #                         first, before any other duplicate ranges
4151         #                         with different values.
4152         #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4153         #                         FIFO, so that this one is inserted after all
4154         #                         others that currently exist.  If this is an
4155         #                         exact duplicate, including value, of an
4156         #                         existing range, this one is discarded
4157         #                         (leaving the existing one in its original,
4158         #                         higher priority position
4159         #       => $CROAK         Die with an error if is already there
4160         #       => anything else  is the same as => $IF_NOT_EQUIVALENT
4161         #
4162         # "same value" means identical for non-type-0 ranges, and it means
4163         # having the same standard forms for type-0 ranges.
4164
4165         return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4166
4167         my $self = shift;
4168         my $operation = shift;   # '+' for add/replace; '-' for delete;
4169         my $start = shift;
4170         my $end   = shift;
4171         my $value = shift;
4172
4173         my %args = @_;
4174
4175         $value = "" if not defined $value;        # warning: $value can be "0"
4176
4177         my $replace = delete $args{'Replace'};
4178         $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4179
4180         my $type = delete $args{'Type'};
4181         $type = 0 unless defined $type;
4182
4183         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4184
4185         my $addr = do { no overloading; pack 'J', $self; };
4186
4187         if ($operation ne '+' && $operation ne '-') {
4188             Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'.  No action taken.");
4189             return;
4190         }
4191         unless (defined $start && defined $end) {
4192             Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete.  No action taken.");
4193             return;
4194         }
4195         unless ($end >= $start) {
4196             Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
4197             return;
4198         }
4199         #local $to_trace = 1 if main::DEBUG;
4200
4201         if ($operation eq '-') {
4202             if ($replace != $IF_NOT_EQUIVALENT) {
4203                 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list.  Assuming Replace => \$IF_NOT_EQUIVALENT.");
4204                 $replace = $IF_NOT_EQUIVALENT;
4205             }
4206             if ($type) {
4207                 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list.  Assuming Type => 0.");
4208                 $type = 0;
4209             }
4210             if ($value ne "") {
4211                 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list.  Assuming Value => \"\".");
4212                 $value = "";
4213             }
4214         }
4215
4216         my $r = $ranges{$addr};               # The current list of ranges
4217         my $range_list_size = scalar @$r;     # And its size
4218         my $max = $max{$addr};                # The current high code point in
4219                                               # the list of ranges
4220
4221         # Do a special case requiring fewer machine cycles when the new range
4222         # starts after the current highest point.  The Unicode input data is
4223         # structured so this is common.
4224         if ($start > $max) {
4225
4226             trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4227             return if $operation eq '-'; # Deleting a non-existing range is a
4228                                          # no-op
4229
4230             # If the new range doesn't logically extend the current final one
4231             # in the range list, create a new range at the end of the range
4232             # list.  (max cleverly is initialized to a negative number not
4233             # adjacent to 0 if the range list is empty, so even adding a range
4234             # to an empty range list starting at 0 will have this 'if'
4235             # succeed.)
4236             if ($start > $max + 1        # non-adjacent means can't extend.
4237                 || @{$r}[-1]->value ne $value # values differ, can't extend.
4238                 || @{$r}[-1]->type != $type # types differ, can't extend.
4239             ) {
4240                 push @$r, Range->new($start, $end,
4241                                      Value => $value,
4242                                      Type => $type);
4243             }
4244             else {
4245
4246                 # Here, the new range starts just after the current highest in
4247                 # the range list, and they have the same type and value.
4248                 # Extend the existing range to incorporate the new one.
4249                 @{$r}[-1]->set_end($end);
4250             }
4251
4252             # This becomes the new maximum.
4253             $max{$addr} = $end;
4254
4255             return;
4256         }
4257         #local $to_trace = 0 if main::DEBUG;
4258
4259         trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4260
4261         # Here, the input range isn't after the whole rest of the range list.
4262         # Most likely 'splice' will be needed.  The rest of the routine finds
4263         # the needed splice parameters, and if necessary, does the splice.
4264         # First, find the offset parameter needed by the splice function for
4265         # the input range.  Note that the input range may span multiple
4266         # existing ones, but we'll worry about that later.  For now, just find
4267         # the beginning.  If the input range is to be inserted starting in a
4268         # position not currently in the range list, it must (obviously) come
4269         # just after the range below it, and just before the range above it.
4270         # Slightly less obviously, it will occupy the position currently
4271         # occupied by the range that is to come after it.  More formally, we
4272         # are looking for the position, $i, in the array of ranges, such that:
4273         #
4274         # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4275         #
4276         # (The ordered relationships within existing ranges are also shown in
4277         # the equation above).  However, if the start of the input range is
4278         # within an existing range, the splice offset should point to that
4279         # existing range's position in the list; that is $i satisfies a
4280         # somewhat different equation, namely:
4281         #
4282         #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4283         #
4284         # More briefly, $start can come before or after r[$i]->start, and at
4285         # this point, we don't know which it will be.  However, these
4286         # two equations share these constraints:
4287         #
4288         #   r[$i-1]->end < $start <= r[$i]->end
4289         #
4290         # And that is good enough to find $i.
4291
4292         my $i = $self->_search_ranges($start);
4293         if (! defined $i) {
4294             Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined.  Operation '$operation' not performed");
4295             return;
4296         }
4297
4298         # The search function returns $i such that:
4299         #
4300         # r[$i-1]->end < $start <= r[$i]->end
4301         #
4302         # That means that $i points to the first range in the range list
4303         # that could possibly be affected by this operation.  We still don't
4304         # know if the start of the input range is within r[$i], or if it
4305         # points to empty space between r[$i-1] and r[$i].
4306         trace "[$i] is the beginning splice point.  Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4307
4308         # Special case the insertion of data that is not to replace any
4309         # existing data.
4310         if ($replace == $NO) {  # If $NO, has to be operation '+'
4311             #local $to_trace = 1 if main::DEBUG;
4312             trace "Doesn't replace" if main::DEBUG && $to_trace;
4313
4314             # Here, the new range is to take effect only on those code points
4315             # that aren't already in an existing range.  This can be done by
4316             # looking through the existing range list and finding the gaps in
4317             # the ranges that this new range affects, and then calling this
4318             # function recursively on each of those gaps, leaving untouched
4319             # anything already in the list.  Gather up a list of the changed
4320             # gaps first so that changes to the internal state as new ranges
4321             # are added won't be a problem.
4322             my @gap_list;
4323
4324             # First, if the starting point of the input range is outside an
4325             # existing one, there is a gap from there to the beginning of the
4326             # existing range -- add a span to fill the part that this new
4327             # range occupies
4328             if ($start < $r->[$i]->start) {
4329                 push @gap_list, Range->new($start,
4330                                            main::min($end,
4331                                                      $r->[$i]->start - 1),
4332                                            Type => $type);
4333                 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4334             }
4335
4336             # Then look through the range list for other gaps until we reach
4337             # the highest range affected by the input one.
4338             my $j;
4339             for ($j = $i+1; $j < $range_list_size; $j++) {
4340                 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4341                 last if $end < $r->[$j]->start;
4342
4343                 # If there is a gap between when this range starts and the
4344                 # previous one ends, add a span to fill it.  Note that just
4345                 # because there are two ranges doesn't mean there is a
4346                 # non-zero gap between them.  It could be that they have
4347                 # different values or types
4348                 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4349                     push @gap_list,
4350                         Range->new($r->[$j-1]->end + 1,
4351                                    $r->[$j]->start - 1,
4352                                    Type => $type);
4353                     trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4354                 }
4355             }
4356
4357             # Here, we have either found an existing range in the range list,
4358             # beyond the area affected by the input one, or we fell off the
4359             # end of the loop because the input range affects the whole rest
4360             # of the range list.  In either case, $j is 1 higher than the
4361             # highest affected range.  If $j == $i, it means that there are no
4362             # affected ranges, that the entire insertion is in the gap between
4363             # r[$i-1], and r[$i], which we already have taken care of before
4364             # the loop.
4365             # On the other hand, if there are affected ranges, it might be
4366             # that there is a gap that needs filling after the final such
4367             # range to the end of the input range
4368             if ($r->[$j-1]->end < $end) {
4369                     push @gap_list, Range->new(main::max($start,
4370                                                          $r->[$j-1]->end + 1),
4371                                                $end,
4372                                                Type => $type);
4373                     trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4374             }
4375
4376             # Call recursively to fill in all the gaps.
4377             foreach my $gap (@gap_list) {
4378                 $self->_add_delete($operation,
4379                                    $gap->start,
4380                                    $gap->end,
4381                                    $value,
4382                                    Type => $type);
4383             }
4384
4385             return;
4386         }
4387
4388         # Here, we have taken care of the case where $replace is $NO.
4389         # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4390         # If inserting a multiple record, this is where it goes, before the
4391         # first (if any) existing one if inserting LIFO.  (If this is to go
4392         # afterwards, FIFO, we below move the pointer to there.)  These imply
4393         # an insertion, and no change to any existing ranges.  Note that $i
4394         # can be -1 if this new range doesn't actually duplicate any existing,
4395         # and comes at the beginning of the list.
4396         if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4397
4398             if ($start != $end) {
4399                 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point.  No action taken.");
4400                 return;
4401             }
4402
4403             # If the new code point is within a current range ...
4404             if ($end >= $r->[$i]->start) {
4405
4406                 # Don't add an exact duplicate, as it isn't really a multiple
4407                 my $existing_value = $r->[$i]->value;
4408                 my $existing_type = $r->[$i]->type;
4409                 return if $value eq $existing_value && $type eq $existing_type;
4410
4411                 # If the multiple value is part of an existing range, we want
4412                 # to split up that range, so that only the single code point
4413                 # is affected.  To do this, we first call ourselves
4414                 # recursively to delete that code point from the table, having
4415                 # preserved its current data above.  Then we call ourselves
4416                 # recursively again to add the new multiple, which we know by
4417                 # the test just above is different than the current code
4418                 # point's value, so it will become a range containing a single
4419                 # code point: just itself.  Finally, we add back in the
4420                 # pre-existing code point, which will again be a single code
4421                 # point range.  Because 'i' likely will have changed as a
4422                 # result of these operations, we can't just continue on, but
4423                 # do this operation recursively as well.  If we are inserting
4424                 # LIFO, the pre-existing code point needs to go after the new
4425                 # one, so use MULTIPLE_AFTER; and vice versa.
4426                 if ($r->[$i]->start != $r->[$i]->end) {
4427                     $self->_add_delete('-', $start, $end, "");
4428                     $self->_add_delete('+', $start, $end, $value, Type => $type);
4429                     return $self->_add_delete('+',
4430                             $start, $end,
4431                             $existing_value,
4432                             Type => $existing_type,
4433                             Replace => ($replace == $MULTIPLE_BEFORE)
4434                                        ? $MULTIPLE_AFTER
4435                                        : $MULTIPLE_BEFORE);
4436                 }
4437             }
4438
4439             # If to place this new record after, move to beyond all existing
4440             # ones; but don't add this one if identical to any of them, as it
4441             # isn't really a multiple.  This leaves the original order, so
4442             # that the current request is ignored.  The reasoning is that the
4443             # previous request that wanted this record to have high priority
4444             # should have precedence.
4445             if ($replace == $MULTIPLE_AFTER) {
4446                 while ($i < @$r && $r->[$i]->start == $start) {
4447                     return if $value eq $r->[$i]->value
4448                               && $type eq $r->[$i]->type;
4449                     $i++;
4450                 }
4451             }
4452             else {
4453                 # If instead we are to place this new record before any
4454                 # existing ones, remove any identical ones that come after it.
4455                 # This changes the existing order so that the new one is
4456                 # first, as is being requested.
4457                 for (my $j = $i + 1;
4458                      $j < @$r && $r->[$j]->start == $start;
4459                      $j++)
4460                 {
4461                     if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4462                         splice @$r, $j, 1;
4463                         last;   # There should only be one instance, so no
4464                                 # need to keep looking
4465                     }
4466                 }
4467             }
4468
4469             trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4470             my @return = splice @$r,
4471                                 $i,
4472                                 0,
4473                                 Range->new($start,
4474                                            $end,
4475                                            Value => $value,
4476                                            Type => $type);
4477             if (main::DEBUG && $to_trace) {
4478                 trace "After splice:";
4479                 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4480                 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4481                 trace "i  =[", $i, "]", $r->[$i] if $i >= 0;
4482                 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4483                 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4484                 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4485             }
4486             return @return;
4487         }
4488
4489         # Here, we have taken care of $NO and $MULTIPLE_foo replaces.  This
4490         # leaves delete, insert, and replace either unconditionally or if not
4491         # equivalent.  $i still points to the first potential affected range.
4492         # Now find the highest range affected, which will determine the length
4493         # parameter to splice.  (The input range can span multiple existing
4494         # ones.)  If this isn't a deletion, while we are looking through the
4495         # range list, see also if this is a replacement rather than a clean
4496         # insertion; that is if it will change the values of at least one
4497         # existing range.  Start off assuming it is an insert, until find it
4498         # isn't.
4499         my $clean_insert = $operation eq '+';
4500         my $j;        # This will point to the highest affected range
4501
4502         # For non-zero types, the standard form is the value itself;
4503         my $standard_form = ($type) ? $value : main::standardize($value);
4504
4505         for ($j = $i; $j < $range_list_size; $j++) {
4506             trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4507
4508             # If find a range that it doesn't overlap into, we can stop
4509             # searching
4510             last if $end < $r->[$j]->start;
4511
4512             # Here, overlaps the range at $j.  If the values don't match,
4513             # and so far we think this is a clean insertion, it becomes a
4514             # non-clean insertion, i.e., a 'change' or 'replace' instead.
4515             if ($clean_insert) {
4516                 if ($r->[$j]->standard_form ne $standard_form) {
4517                     $clean_insert = 0;
4518                     if ($replace == $CROAK) {
4519                         main::croak("The range to add "
4520                         . sprintf("%04X", $start)
4521                         . '-'
4522                         . sprintf("%04X", $end)
4523                         . " with value '$value' overlaps an existing range $r->[$j]");
4524                     }
4525                 }
4526                 else {
4527
4528                     # Here, the two values are essentially the same.  If the
4529                     # two are actually identical, replacing wouldn't change
4530                     # anything so skip it.
4531                     my $pre_existing = $r->[$j]->value;
4532                     if ($pre_existing ne $value) {
4533
4534                         # Here the new and old standardized values are the
4535                         # same, but the non-standardized values aren't.  If
4536                         # replacing unconditionally, then replace
4537                         if( $replace == $UNCONDITIONALLY) {
4538                             $clean_insert = 0;
4539                         }
4540                         else {
4541
4542                             # Here, are replacing conditionally.  Decide to
4543                             # replace or not based on which appears to look
4544                             # the "nicest".  If one is mixed case and the
4545                             # other isn't, choose the mixed case one.
4546                             my $new_mixed = $value =~ /[A-Z]/
4547                                             && $value =~ /[a-z]/;
4548                             my $old_mixed = $pre_existing =~ /[A-Z]/
4549                                             && $pre_existing =~ /[a-z]/;
4550
4551                             if ($old_mixed != $new_mixed) {
4552                                 $clean_insert = 0 if $new_mixed;
4553                                 if (main::DEBUG && $to_trace) {
4554                                     if ($clean_insert) {
4555                                         trace "Retaining $pre_existing over $value";
4556                                     }
4557                                     else {
4558                                         trace "Replacing $pre_existing with $value";
4559                                     }
4560                                 }
4561                             }
4562                             else {
4563
4564                                 # Here casing wasn't different between the two.
4565                                 # If one has hyphens or underscores and the
4566                                 # other doesn't, choose the one with the
4567                                 # punctuation.
4568                                 my $new_punct = $value =~ /[-_]/;
4569                                 my $old_punct = $pre_existing =~ /[-_]/;
4570
4571                                 if ($old_punct != $new_punct) {
4572                                     $clean_insert = 0 if $new_punct;
4573                                     if (main::DEBUG && $to_trace) {
4574                                         if ($clean_insert) {
4575                                             trace "Retaining $pre_existing over $value";
4576                                         }
4577                                         else {
4578                                             trace "Replacing $pre_existing with $value";
4579                                         }
4580                                     }
4581                                 }   # else existing one is just as "good";
4582                                     # retain it to save cycles.
4583                             }
4584                         }
4585                     }
4586                 }
4587             }
4588         } # End of loop looking for highest affected range.
4589
4590         # Here, $j points to one beyond the highest range that this insertion
4591         # affects (hence to beyond the range list if that range is the final
4592         # one in the range list).
4593
4594         # The splice length is all the affected ranges.  Get it before
4595         # subtracting, for efficiency, so we don't have to later add 1.
4596         my $length = $j - $i;
4597
4598         $j--;        # $j now points to the highest affected range.
4599         trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4600
4601         # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4602         # $j points to the highest affected range.  But it can be < $i or even
4603         # -1.  These happen only if the insertion is entirely in the gap
4604         # between r[$i-1] and r[$i].  Here's why: j < i means that the j loop
4605         # above exited first time through with $end < $r->[$i]->start.  (And
4606         # then we subtracted one from j)  This implies also that $start <
4607         # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4608         # $start, so the entire input range is in the gap.
4609         if ($j < $i) {
4610
4611             # Here the entire input range is in the gap before $i.
4612
4613             if (main::DEBUG && $to_trace) {
4614                 if ($i) {
4615                     trace "Entire range is between $r->[$i-1] and $r->[$i]";
4616                 }
4617                 else {
4618                     trace "Entire range is before $r->[$i]";
4619                 }
4620             }
4621             return if $operation ne '+'; # Deletion of a non-existent range is
4622                                          # a no-op
4623         }
4624         else {
4625
4626             # Here part of the input range is not in the gap before $i.  Thus,
4627             # there is at least one affected one, and $j points to the highest
4628             # such one.
4629
4630             # At this point, here is the situation:
4631             # This is not an insertion of a multiple, nor of tentative ($NO)
4632             # data.
4633             #   $i  points to the first element in the current range list that
4634             #            may be affected by this operation.  In fact, we know
4635             #            that the range at $i is affected because we are in
4636             #            the else branch of this 'if'
4637             #   $j  points to the highest affected range.
4638             # In other words,
4639             #   r[$i-1]->end < $start <= r[$i]->end
4640             # And:
4641             #   r[$i-1]->end < $start <= $end < r[$j+1]->start
4642             #
4643             # Also:
4644             #   $clean_insert is a boolean which is set true if and only if
4645             #        this is a "clean insertion", i.e., not a change nor a
4646             #        deletion (multiple was handled above).
4647
4648             # We now have enough information to decide if this call is a no-op
4649             # or not.  It is a no-op if this is an insertion of already
4650             # existing data.  To be so, it must be contained entirely in one
4651             # range.
4652
4653             if (main::DEBUG && $to_trace && $clean_insert
4654                                          && $start >= $r->[$i]->start
4655                                          && $end   <= $r->[$i]->end)
4656             {
4657                     trace "no-op";
4658             }
4659             return if $clean_insert
4660                       && $start >= $r->[$i]->start
4661                       && $end   <= $r->[$i]->end;
4662         }
4663
4664         # Here, we know that some action will have to be taken.  We have
4665         # calculated the offset and length (though adjustments may be needed)
4666         # for the splice.  Now start constructing the replacement list.
4667         my @replacement;
4668         my $splice_start = $i;
4669
4670         my $extends_below;
4671         my $extends_above;
4672
4673         # See if should extend any adjacent ranges.
4674         if ($operation eq '-') { # Don't extend deletions
4675             $extends_below = $extends_above = 0;
4676         }
4677         else {  # Here, should extend any adjacent ranges.  See if there are
4678                 # any.
4679             $extends_below = ($i > 0
4680                             # can't extend unless adjacent
4681                             && $r->[$i-1]->end == $start -1
4682                             # can't extend unless are same standard value
4683                             && $r->[$i-1]->standard_form eq $standard_form
4684                             # can't extend unless share type
4685                             && $r->[$i-1]->type == $type);
4686             $extends_above = ($j+1 < $range_list_size
4687                             && $r->[$j+1]->start == $end +1
4688                             && $r->[$j+1]->standard_form eq $standard_form
4689                             && $r->[$j+1]->type == $type);
4690         }
4691         if ($extends_below && $extends_above) { # Adds to both
4692             $splice_start--;     # start replace at element below
4693             $length += 2;        # will replace on both sides
4694             trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4695
4696             # The result will fill in any gap, replacing both sides, and
4697             # create one large range.
4698             @replacement = Range->new($r->[$i-1]->start,
4699                                       $r->[$j+1]->end,
4700                                       Value => $value,
4701                                       Type => $type);
4702         }
4703         else {
4704
4705             # Here we know that the result won't just be the conglomeration of
4706             # a new range with both its adjacent neighbors.  But it could
4707             # extend one of them.
4708
4709             if ($extends_below) {
4710
4711                 # Here the new element adds to the one below, but not to the
4712                 # one above.  If inserting, and only to that one range,  can
4713                 # just change its ending to include the new one.
4714                 if ($length == 0 && $clean_insert) {
4715                     $r->[$i-1]->set_end($end);
4716                     trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4717                     return;
4718                 }
4719                 else {
4720                     trace "Changing inserted range to start at ", sprintf("%04X",  $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4721                     $splice_start--;        # start replace at element below
4722                     $length++;              # will replace the element below
4723                     $start = $r->[$i-1]->start;
4724                 }
4725             }
4726             elsif ($extends_above) {
4727
4728                 # Here the new element adds to the one above, but not below.
4729                 # Mirror the code above
4730                 if ($length == 0 && $clean_insert) {
4731                     $r->[$j+1]->set_start($start);
4732                     trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4733                     return;
4734                 }
4735                 else {
4736                     trace "Changing inserted range to end at ", sprintf("%04X",  $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4737                     $length++;        # will replace the element above
4738                     $end = $r->[$j+1]->end;
4739                 }
4740             }
4741
4742             trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4743
4744             # Finally, here we know there will have to be a splice.
4745             # If the change or delete affects only the highest portion of the
4746             # first affected range, the range will have to be split.  The
4747             # splice will remove the whole range, but will replace it by a new
4748             # range containing just the unaffected part.  So, in this case,
4749             # add to the replacement list just this unaffected portion.
4750             if (! $extends_below
4751                 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4752             {
4753                 push @replacement,
4754                     Range->new($r->[$i]->start,
4755                                $start - 1,
4756                                Value => $r->[$i]->value,
4757                                Type => $r->[$i]->type);
4758             }
4759
4760             # In the case of an insert or change, but not a delete, we have to
4761             # put in the new stuff;  this comes next.
4762             if ($operation eq '+') {
4763                 push @replacement, Range->new($start,
4764                                               $end,
4765                                               Value => $value,
4766                                               Type => $type);
4767             }
4768
4769             trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4770             #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4771
4772             # And finally, if we're changing or deleting only a portion of the
4773             # highest affected range, it must be split, as the lowest one was.
4774             if (! $extends_above
4775                 && $j >= 0  # Remember that j can be -1 if before first
4776                             # current element
4777                 && $end >= $r->[$j]->start
4778                 && $end < $r->[$j]->end)
4779             {
4780                 push @replacement,
4781                     Range->new($end + 1,
4782                                $r->[$j]->end,
4783                                Value => $r->[$j]->value,
4784                                Type => $r->[$j]->type);
4785             }
4786         }
4787
4788         # And do the splice, as calculated above
4789         if (main::DEBUG && $to_trace) {
4790             trace "replacing $length element(s) at $i with ";
4791             foreach my $replacement (@replacement) {
4792                 trace "    $replacement";
4793             }
4794             trace "Before splice:";
4795             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4796             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4797             trace "i  =[", $i, "]", $r->[$i];
4798             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4799             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4800         }
4801
4802         my @return = splice @$r, $splice_start, $length, @replacement;
4803
4804         if (main::DEBUG && $to_trace) {
4805             trace "After splice:";
4806             trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4807             trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4808             trace "i  =[", $i, "]", $r->[$i];
4809             trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4810             trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4811             trace "removed ", @return if @return;
4812         }
4813
4814         # An actual deletion could have changed the maximum in the list.
4815         # There was no deletion if the splice didn't return something, but
4816         # otherwise recalculate it.  This is done too rarely to worry about
4817         # performance.
4818         if ($operation eq '-' && @return) {
4819             if (@$r) {
4820                 $max{$addr} = $r->[-1]->end;
4821             }
4822             else {  # Now empty
4823                 $max{$addr} = $max_init;
4824             }
4825         }
4826         return @return;
4827     }
4828
4829     sub reset_each_range {  # reset the iterator for each_range();
4830         my $self = shift;
4831         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4832
4833         no overloading;
4834         undef $each_range_iterator{pack 'J', $self};
4835         return;
4836     }
4837
4838     sub each_range {
4839         # Iterate over each range in a range list.  Results are undefined if
4840         # the range list is changed during the iteration.
4841
4842         my $self = shift;
4843         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4844
4845         my $addr = do { no overloading; pack 'J', $self; };
4846
4847         return if $self->is_empty;
4848
4849         $each_range_iterator{$addr} = -1
4850                                 if ! defined $each_range_iterator{$addr};
4851         $each_range_iterator{$addr}++;
4852         return $ranges{$addr}->[$each_range_iterator{$addr}]
4853                         if $each_range_iterator{$addr} < @{$ranges{$addr}};
4854         undef $each_range_iterator{$addr};
4855         return;
4856     }
4857
4858     sub count {        # Returns count of code points in range list
4859         my $self = shift;
4860         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4861
4862         my $addr = do { no overloading; pack 'J', $self; };
4863
4864         my $count = 0;
4865         foreach my $range (@{$ranges{$addr}}) {
4866             $count += $range->end - $range->start + 1;
4867         }
4868         return $count;
4869     }
4870
4871     sub delete_range {    # Delete a range
4872         my $self = shift;
4873         my $start = shift;
4874         my $end = shift;
4875
4876         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4877
4878         return $self->_add_delete('-', $start, $end, "");
4879     }
4880
4881     sub is_empty { # Returns boolean as to if a range list is empty
4882         my $self = shift;
4883         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4884
4885         no overloading;
4886         return scalar @{$ranges{pack 'J', $self}} == 0;
4887     }
4888
4889     sub hash {
4890         # Quickly returns a scalar suitable for separating tables into
4891         # buckets, i.e. it is a hash function of the contents of a table, so
4892         # there are relatively few conflicts.
4893
4894         my $self = shift;
4895         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4896
4897         my $addr = do { no overloading; pack 'J', $self; };
4898
4899         # These are quickly computable.  Return looks like 'min..max;count'
4900         return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4901     }
4902 } # End closure for _Range_List_Base
4903
4904 package Range_List;
4905 use parent '-norequire', '_Range_List_Base';
4906
4907 # A Range_List is a range list for match tables; i.e. the range values are
4908 # not significant.  Thus a number of operations can be safely added to it,
4909 # such as inversion, intersection.  Note that union is also an unsafe
4910 # operation when range values are cared about, and that method is in the base
4911 # class, not here.  But things are set up so that that method is callable only
4912 # during initialization.  Only in this derived class, is there an operation
4913 # that combines two tables.  A Range_Map can thus be used to initialize a
4914 # Range_List, and its mappings will be in the list, but are not significant to
4915 # this class.
4916
4917 sub trace { return main::trace(@_); }
4918
4919 { # Closure
4920
4921     use overload
4922         fallback => 0,
4923         '+' => sub { my $self = shift;
4924                     my $other = shift;
4925
4926                     return $self->_union($other)
4927                 },
4928         '+=' => sub { my $self = shift;
4929                     my $other = shift;
4930                     my $reversed = shift;
4931
4932                     if ($reversed) {
4933                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4934                         . ref($other)
4935                         . ' += '
4936                         . ref($self)
4937                         . "'.  undef returned.");
4938                         return;
4939                     }
4940
4941                     return $self->_union($other)
4942                 },
4943         '&' => sub { my $self = shift;
4944                     my $other = shift;
4945
4946                     return $self->_intersect($other, 0);
4947                 },
4948         '&=' => sub { my $self = shift;
4949                     my $other = shift;
4950                     my $reversed = shift;
4951
4952                     if ($reversed) {
4953                         Carp::my_carp_bug("Bad news.  Can't cope with '"
4954                         . ref($other)
4955                         . ' &= '
4956                         . ref($self)
4957                         . "'.  undef returned.");
4958                         return;
4959                     }
4960
4961                     return $self->_intersect($other, 0);
4962                 },
4963         '~' => "_invert",
4964         '-' => "_subtract",
4965     ;
4966
4967     sub _invert {
4968         # Returns a new Range_List that gives all code points not in $self.
4969
4970         my $self = shift;
4971
4972         my $new = Range_List->new;
4973
4974         # Go through each range in the table, finding the gaps between them
4975         my $max = -1;   # Set so no gap before range beginning at 0
4976         for my $range ($self->ranges) {
4977             my $start = $range->start;
4978             my $end   = $range->end;
4979
4980             # If there is a gap before this range, the inverse will contain
4981             # that gap.
4982             if ($start > $max + 1) {
4983                 $new->add_range($max + 1, $start - 1);
4984             }
4985             $max = $end;
4986         }
4987
4988         # And finally, add the gap from the end of the table to the max
4989         # possible code point
4990         if ($max < $MAX_WORKING_CODEPOINT) {
4991             $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4992         }
4993         return $new;
4994     }
4995
4996     sub _subtract {
4997         # Returns a new Range_List with the argument deleted from it.  The
4998         # argument can be a single code point, a range, or something that has
4999         # a range, with the _range_list() method on it returning them
5000
5001         my $self = shift;
5002         my $other = shift;
5003         my $reversed = shift;
5004         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5005
5006         if ($reversed) {
5007             Carp::my_carp_bug("Bad news.  Can't cope with '"
5008             . ref($other)
5009             . ' - '
5010             . ref($self)
5011             . "'.  undef returned.");
5012             return;
5013         }
5014
5015         my $new = Range_List->new(Initialize => $self);
5016
5017         if (! ref $other) { # Single code point
5018             $new->delete_range($other, $other);
5019         }
5020         elsif ($other->isa('Range')) {
5021             $new->delete_range($other->start, $other->end);
5022         }
5023         elsif ($other->can('_range_list')) {
5024             foreach my $range ($other->_range_list->ranges) {
5025                 $new->delete_range($range->start, $range->end);
5026             }
5027         }
5028         else {
5029             Carp::my_carp_bug("Can't cope with a "
5030                         . ref($other)
5031                         . " argument to '-'.  Subtraction ignored."
5032                         );
5033             return $self;
5034         }
5035
5036         return $new;
5037     }
5038
5039     sub _intersect {
5040         # Returns either a boolean giving whether the two inputs' range lists
5041         # intersect (overlap), or a new Range_List containing the intersection
5042         # of the two lists.  The optional final parameter being true indicates
5043         # to do the check instead of the intersection.
5044
5045         my $a_object = shift;
5046         my $b_object = shift;
5047         my $check_if_overlapping = shift;
5048         $check_if_overlapping = 0 unless defined $check_if_overlapping;
5049         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5050
5051         if (! defined $b_object) {
5052             my $message = "";
5053             $message .= $a_object->_owner_name_of if defined $a_object;
5054             Carp::my_carp_bug($message .= "Called with undefined value.  Intersection not done.");
5055             return;
5056         }
5057
5058         # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
5059         # Thus the intersection could be much more simply be written:
5060         #   return ~(~$a_object + ~$b_object);
5061         # But, this is slower, and when taking the inverse of a large
5062         # range_size_1 table, back when such tables were always stored that
5063         # way, it became prohibitively slow, hence the code was changed to the
5064         # below
5065
5066         if ($b_object->isa('Range')) {
5067             $b_object = Range_List->new(Initialize => $b_object,
5068                                         Owner => $a_object->_owner_name_of);
5069         }
5070         $b_object = $b_object->_range_list if $b_object->can('_range_list');
5071
5072         my @a_ranges = $a_object->ranges;
5073         my @b_ranges = $b_object->ranges;
5074
5075         #local $to_trace = 1 if main::DEBUG;
5076         trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
5077
5078         # Start with the first range in each list
5079         my $a_i = 0;
5080         my $range_a = $a_ranges[$a_i];
5081         my $b_i = 0;
5082         my $range_b = $b_ranges[$b_i];
5083
5084         my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
5085                                                 if ! $check_if_overlapping;
5086
5087         # If either list is empty, there is no intersection and no overlap
5088         if (! defined $range_a || ! defined $range_b) {
5089             return $check_if_overlapping ? 0 : $new;
5090         }
5091         trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5092
5093         # Otherwise, must calculate the intersection/overlap.  Start with the
5094         # very first code point in each list
5095         my $a = $range_a->start;
5096         my $b = $range_b->start;
5097
5098         # Loop through all the ranges of each list; in each iteration, $a and
5099         # $b are the current code points in their respective lists
5100         while (1) {
5101
5102             # If $a and $b are the same code point, ...
5103             if ($a == $b) {
5104
5105                 # it means the lists overlap.  If just checking for overlap
5106                 # know the answer now,
5107                 return 1 if $check_if_overlapping;
5108
5109                 # The intersection includes this code point plus anything else
5110                 # common to both current ranges.
5111                 my $start = $a;
5112                 my $end = main::min($range_a->end, $range_b->end);
5113                 if (! $check_if_overlapping) {
5114                     trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5115                     $new->add_range($start, $end);
5116                 }
5117
5118                 # Skip ahead to the end of the current intersect
5119                 $a = $b = $end;
5120
5121                 # If the current intersect ends at the end of either range (as
5122                 # it must for at least one of them), the next possible one
5123                 # will be the beginning code point in it's list's next range.
5124                 if ($a == $range_a->end) {
5125                     $range_a = $a_ranges[++$a_i];
5126                     last unless defined $range_a;
5127                     $a = $range_a->start;
5128                 }
5129                 if ($b == $range_b->end) {
5130                     $range_b = $b_ranges[++$b_i];
5131                     last unless defined $range_b;
5132                     $b = $range_b->start;
5133                 }
5134
5135                 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5136             }
5137             elsif ($a < $b) {
5138
5139                 # Not equal, but if the range containing $a encompasses $b,
5140                 # change $a to be the middle of the range where it does equal
5141                 # $b, so the next iteration will get the intersection
5142                 if ($range_a->end >= $b) {
5143                     $a = $b;
5144                 }
5145                 else {
5146
5147                     # Here, the current range containing $a is entirely below
5148                     # $b.  Go try to find a range that could contain $b.
5149                     $a_i = $a_object->_search_ranges($b);
5150
5151                     # If no range found, quit.
5152                     last unless defined $a_i;
5153
5154                     # The search returns $a_i, such that
5155                     #   range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5156                     # Set $a to the beginning of this new range, and repeat.
5157                     $range_a = $a_ranges[$a_i];
5158                     $a = $range_a->start;
5159                 }
5160             }
5161             else { # Here, $b < $a.
5162
5163                 # Mirror image code to the leg just above
5164                 if ($range_b->end >= $a) {
5165                     $b = $a;
5166                 }
5167                 else {
5168                     $b_i = $b_object->_search_ranges($a);
5169                     last unless defined $b_i;
5170                     $range_b = $b_ranges[$b_i];
5171                     $b = $range_b->start;
5172                 }
5173             }
5174         } # End of looping through ranges.
5175
5176         # Intersection fully computed, or now know that there is no overlap
5177         return $check_if_overlapping ? 0 : $new;
5178     }
5179
5180     sub overlaps {
5181         # Returns boolean giving whether the two arguments overlap somewhere
5182
5183         my $self = shift;
5184         my $other = shift;
5185         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5186
5187         return $self->_intersect($other, 1);
5188     }
5189
5190     sub add_range {
5191         # Add a range to the list.
5192
5193         my $self = shift;
5194         my $start = shift;
5195         my $end = shift;
5196         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5197
5198         return $self->_add_delete('+', $start, $end, "");
5199     }
5200
5201     sub matches_identically_to {
5202         # Return a boolean as to whether or not two Range_Lists match identical
5203         # sets of code points.
5204
5205         my $self = shift;
5206         my $other = shift;
5207         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5208
5209         # These are ordered in increasing real time to figure out (at least
5210         # until a patch changes that and doesn't change this)
5211         return 0 if $self->max != $other->max;
5212         return 0 if $self->min != $other->min;
5213         return 0 if $self->range_count != $other->range_count;
5214         return 0 if $self->count != $other->count;
5215
5216         # Here they could be identical because all the tests above passed.
5217         # The loop below is somewhat simpler since we know they have the same
5218         # number of elements.  Compare range by range, until reach the end or
5219         # find something that differs.
5220         my @a_ranges = $self->ranges;
5221         my @b_ranges = $other->ranges;
5222         for my $i (0 .. @a_ranges - 1) {
5223             my $a = $a_ranges[$i];
5224             my $b = $b_ranges[$i];
5225             trace "self $a; other $b" if main::DEBUG && $to_trace;
5226             return 0 if ! defined $b
5227                         || $a->start != $b->start
5228                         || $a->end != $b->end;
5229         }
5230         return 1;
5231     }
5232
5233     sub is_code_point_usable {
5234         # This used only for making the test script.  See if the input
5235         # proposed trial code point is one that Perl will handle.  If second
5236         # parameter is 0, it won't select some code points for various
5237         # reasons, noted below.
5238
5239         my $code = shift;
5240         my $try_hard = shift;
5241         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5242
5243         return 0 if $code < 0;                # Never use a negative
5244
5245         # shun null.  I'm (khw) not sure why this was done, but NULL would be
5246         # the character very frequently used.
5247         return $try_hard if $code == 0x0000;
5248
5249         # shun non-character code points.
5250         return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5251         return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5252
5253         return $try_hard if $code > $MAX_UNICODE_CODEPOINT;   # keep in range
5254         return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5255
5256         return 1;
5257     }
5258
5259     sub get_valid_code_point {
5260         # Return a code point that's part of the range list.  Returns nothing
5261         # if the table is empty or we can't find a suitable code point.  This
5262         # used only for making the test script.
5263
5264         my $self = shift;
5265         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5266
5267         my $addr = do { no overloading; pack 'J', $self; };
5268
5269         # On first pass, don't choose less desirable code points; if no good
5270         # one is found, repeat, allowing a less desirable one to be selected.
5271         for my $try_hard (0, 1) {
5272
5273             # Look through all the ranges for a usable code point.
5274             for my $set (reverse $self->ranges) {
5275
5276                 # Try the edge cases first, starting with the end point of the
5277                 # range.
5278                 my $end = $set->end;
5279                 return $end if is_code_point_usable($end, $try_hard);
5280                 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5281
5282                 # End point didn't, work.  Start at the beginning and try
5283                 # every one until find one that does work.
5284                 for my $trial ($set->start .. $end - 1) {
5285                     return $trial if is_code_point_usable($trial, $try_hard);
5286                 }
5287             }
5288         }
5289         return ();  # If none found, give up.
5290     }
5291
5292     sub get_invalid_code_point {
5293         # Return a code point that's not part of the table.  Returns nothing
5294         # if the table covers all code points or a suitable code point can't
5295         # be found.  This used only for making the test script.
5296
5297         my $self = shift;
5298         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5299
5300         # Just find a valid code point of the inverse, if any.
5301         return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5302     }
5303 } # end closure for Range_List
5304
5305 package Range_Map;
5306 use parent '-norequire', '_Range_List_Base';
5307
5308 # A Range_Map is a range list in which the range values (called maps) are
5309 # significant, and hence shouldn't be manipulated by our other code, which
5310 # could be ambiguous or lose things.  For example, in taking the union of two
5311 # lists, which share code points, but which have differing values, which one
5312 # has precedence in the union?
5313 # It turns out that these operations aren't really necessary for map tables,
5314 # and so this class was created to make sure they aren't accidentally
5315 # applied to them.
5316
5317 { # Closure
5318
5319     sub add_map {
5320         # Add a range containing a mapping value to the list
5321
5322         my $self = shift;
5323         # Rest of parameters passed on
5324
5325         return $self->_add_delete('+', @_);
5326     }
5327
5328     sub replace_map {
5329         # Replace a range
5330
5331         my $self = shift;
5332
5333         return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
5334     }
5335
5336     sub add_duplicate {
5337         # Adds entry to a range list which can duplicate an existing entry
5338
5339         my $self = shift;
5340         my $code_point = shift;
5341         my $value = shift;
5342         my %args = @_;
5343         my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5344         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5345
5346         return $self->add_map($code_point, $code_point,
5347                                 $value, Replace => $replace);
5348     }
5349 } # End of closure for package Range_Map
5350
5351 package _Base_Table;
5352
5353 # A table is the basic data structure that gets written out into a file for
5354 # use by the Perl core.  This is the abstract base class implementing the
5355 # common elements from the derived ones.  A list of the methods to be
5356 # furnished by an implementing class is just after the constructor.
5357
5358 sub standardize { return main::standardize($_[0]); }
5359 sub trace { return main::trace(@_); }
5360
5361 { # Closure
5362
5363     main::setup_package();
5364
5365     my %range_list;
5366     # Object containing the ranges of the table.
5367     main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5368
5369     my %full_name;
5370     # The full table name.
5371     main::set_access('full_name', \%full_name, 'r');
5372
5373     my %name;
5374     # The table name, almost always shorter
5375     main::set_access('name', \%name, 'r');
5376
5377     my %short_name;
5378     # The shortest of all the aliases for this table, with underscores removed
5379     main::set_access('short_name', \%short_name);
5380
5381     my %nominal_short_name_length;
5382     # The length of short_name before removing underscores
5383     main::set_access('nominal_short_name_length',
5384                     \%nominal_short_name_length);
5385
5386     my %complete_name;
5387     # The complete name, including property.
5388     main::set_access('complete_name', \%complete_name, 'r');
5389
5390     my %property;
5391     # Parent property this table is attached to.
5392     main::set_access('property', \%property, 'r');
5393
5394     my %aliases;
5395     # Ordered list of alias objects of the table's name.  The first ones in
5396     # the list are output first in comments
5397     main::set_access('aliases', \%aliases, 'readable_array');
5398
5399     my %comment;
5400     # A comment associated with the table for human readers of the files
5401     main::set_access('comment', \%comment, 's');
5402
5403     my %description;
5404     # A comment giving a short description of the table's meaning for human
5405     # readers of the files.
5406     main::set_access('description', \%description, 'readable_array');
5407
5408     my %note;
5409     # A comment giving a short note about the table for human readers of the
5410     # files.
5411     main::set_access('note', \%note, 'readable_array');
5412
5413     my %fate;
5414     # Enum; there are a number of possibilities for what happens to this
5415     # table: it could be normal, or suppressed, or not for external use.  See
5416     # values at definition for $SUPPRESSED.
5417     main::set_access('fate', \%fate, 'r');
5418
5419     my %find_table_from_alias;
5420     # The parent property passes this pointer to a hash which this class adds
5421     # all its aliases to, so that the parent can quickly take an alias and
5422     # find this table.
5423     main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5424
5425     my %locked;
5426     # After this table is made equivalent to another one; we shouldn't go
5427     # changing the contents because that could mean it's no longer equivalent
5428     main::set_access('locked', \%locked, 'r');
5429
5430     my %file_path;
5431     # This gives the final path to the file containing the table.  Each
5432     # directory in the path is an element in the array
5433     main::set_access('file_path', \%file_path, 'readable_array');
5434
5435     my %status;
5436     # What is the table's status, normal, $OBSOLETE, etc.  Enum
5437     main::set_access('status', \%status, 'r');
5438
5439     my %status_info;
5440     # A comment about its being obsolete, or whatever non normal status it has
5441     main::set_access('status_info', \%status_info, 'r');
5442
5443     my %caseless_equivalent;
5444     # The table this is equivalent to under /i matching, if any.
5445     main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5446
5447     my %range_size_1;
5448     # Is the table to be output with each range only a single code point?
5449     # This is done to avoid breaking existing code that may have come to rely
5450     # on this behavior in previous versions of this program.)
5451     main::set_access('range_size_1', \%range_size_1, 'r', 's');
5452
5453     my %perl_extension;
5454     # A boolean set iff this table is a Perl extension to the Unicode
5455     # standard.
5456     main::set_access('perl_extension', \%perl_extension, 'r');
5457
5458     my %output_range_counts;
5459     # A boolean set iff this table is to have comments written in the
5460     # output file that contain the number of code points in the range.
5461     # The constructor can override the global flag of the same name.
5462     main::set_access('output_range_counts', \%output_range_counts, 'r');
5463
5464     my %write_as_invlist;
5465     # A boolean set iff the output file for this table is to be in the form of
5466     # an inversion list/map.
5467     main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5468
5469     my %format;
5470     # The format of the entries of the table.  This is calculated from the
5471     # data in the table (or passed in the constructor).  This is an enum e.g.,
5472     # $STRING_FORMAT.  It is marked protected as it should not be generally
5473     # used to override calculations.
5474     main::set_access('format', \%format, 'r', 'p_s');
5475
5476     my %has_dependency;
5477     # A boolean that gives whether some other table in this property is
5478     # defined as the complement of this table.  This is a crude, but currently
5479     # sufficient, mechanism to make this table not get destroyed before what
5480     # is dependent on it is.  Other dependencies could be added, so the name
5481     # was chosen to reflect a more general situation than actually is
5482     # currently the case.
5483     main::set_access('has_dependency', \%has_dependency, 'r', 's');
5484
5485     sub new {
5486         # All arguments are key => value pairs, which you can see below, most
5487         # of which match fields documented above.  Otherwise: Re_Pod_Entry,
5488         # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5489         # documented in the Alias package
5490
5491         return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5492
5493         my $class = shift;
5494
5495         my $self = bless \do { my $anonymous_scalar }, $class;
5496         my $addr = do { no overloading; pack 'J', $self; };
5497
5498         my %args = @_;
5499
5500         $name{$addr} = delete $args{'Name'};
5501         $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5502         $full_name{$addr} = delete $args{'Full_Name'};
5503         my $complete_name = $complete_name{$addr}
5504                           = delete $args{'Complete_Name'};
5505         $format{$addr} = delete $args{'Format'};
5506         $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5507         $property{$addr} = delete $args{'_Property'};
5508         $range_list{$addr} = delete $args{'_Range_List'};
5509         $status{$addr} = delete $args{'Status'} || $NORMAL;
5510         $status_info{$addr} = delete $args{'_Status_Info'} || "";
5511         $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5512         $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5513         $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5514         $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5515         my $ucd = delete $args{'UCD'};
5516
5517         my $description = delete $args{'Description'};
5518         my $ok_as_filename = delete $args{'OK_as_Filename'};
5519         my $loose_match = delete $args{'Fuzzy'};
5520         my $note = delete $args{'Note'};
5521         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5522         my $perl_extension = delete $args{'Perl_Extension'};
5523         my $suppression_reason = delete $args{'Suppression_Reason'};
5524
5525         # Shouldn't have any left over
5526         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5527
5528         # Can't use || above because conceivably the name could be 0, and
5529         # can't use // operator in case this program gets used in Perl 5.8
5530         $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5531         $output_range_counts{$addr} = $output_range_counts if
5532                                         ! defined $output_range_counts{$addr};
5533
5534         $aliases{$addr} = [ ];
5535         $comment{$addr} = [ ];
5536         $description{$addr} = [ ];
5537         $note{$addr} = [ ];
5538         $file_path{$addr} = [ ];
5539         $locked{$addr} = "";
5540         $has_dependency{$addr} = 0;
5541
5542         push @{$description{$addr}}, $description if $description;
5543         push @{$note{$addr}}, $note if $note;
5544
5545         if ($fate{$addr} == $PLACEHOLDER) {
5546
5547             # A placeholder table doesn't get documented, is a perl extension,
5548             # and quite likely will be empty
5549             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5550             $perl_extension = 1 if ! defined $perl_extension;
5551             $ucd = 0 if ! defined $ucd;
5552             push @tables_that_may_be_empty, $complete_name{$addr};
5553             $self->add_comment(<<END);
5554 This is a placeholder because it is not in Version $string_version of Unicode,
5555 but is needed by the Perl core to work gracefully.  Because it is not in this
5556 version of Unicode, it will not be listed in $pod_file.pod
5557 END
5558         }
5559         elsif (exists $why_suppressed{$complete_name}
5560                 # Don't suppress if overridden
5561                 && ! grep { $_ eq $complete_name{$addr} }
5562                                                     @output_mapped_properties)
5563         {
5564             $fate{$addr} = $SUPPRESSED;
5565         }
5566         elsif ($fate{$addr} == $SUPPRESSED) {
5567             Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5568             # Though currently unused
5569         }
5570         elsif ($suppression_reason) {
5571             Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5572         }
5573
5574         # If hasn't set its status already, see if it is on one of the
5575         # lists of properties or tables that have particular statuses; if
5576         # not, is normal.  The lists are prioritized so the most serious
5577         # ones are checked first
5578         if (! $status{$addr}) {
5579             if (exists $why_deprecated{$complete_name}) {
5580                 $status{$addr} = $DEPRECATED;
5581             }
5582             elsif (exists $why_stabilized{$complete_name}) {
5583                 $status{$addr} = $STABILIZED;
5584             }
5585             elsif (exists $why_obsolete{$complete_name}) {
5586                 $status{$addr} = $OBSOLETE;
5587             }
5588
5589             # Existence above doesn't necessarily mean there is a message
5590             # associated with it.  Use the most serious message.
5591             if ($status{$addr}) {
5592                 if ($why_deprecated{$complete_name}) {
5593                     $status_info{$addr}
5594                                 = $why_deprecated{$complete_name};
5595                 }
5596                 elsif ($why_stabilized{$complete_name}) {
5597                     $status_info{$addr}
5598                                 = $why_stabilized{$complete_name};
5599                 }
5600                 elsif ($why_obsolete{$complete_name}) {
5601                     $status_info{$addr}
5602                                 = $why_obsolete{$complete_name};
5603                 }
5604             }
5605         }
5606
5607         $perl_extension{$addr} = $perl_extension || 0;
5608
5609         # Don't list a property by default that is internal only
5610         if ($fate{$addr} > $MAP_PROXIED) {
5611             $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5612             $ucd = 0 if ! defined $ucd;
5613         }
5614         else {
5615             $ucd = 1 if ! defined $ucd;
5616         }
5617
5618         # By convention what typically gets printed only or first is what's
5619         # first in the list, so put the full name there for good output
5620         # clarity.  Other routines rely on the full name being first on the
5621         # list
5622         $self->add_alias($full_name{$addr},
5623                             OK_as_Filename => $ok_as_filename,
5624                             Fuzzy => $loose_match,
5625                             Re_Pod_Entry => $make_re_pod_entry,
5626                             Status => $status{$addr},
5627                             UCD => $ucd,
5628                             );
5629
5630         # Then comes the other name, if meaningfully different.
5631         if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5632             $self->add_alias($name{$addr},
5633                             OK_as_Filename => $ok_as_filename,
5634                             Fuzzy => $loose_match,
5635                             Re_Pod_Entry => $make_re_pod_entry,
5636                             Status => $status{$addr},
5637                             UCD => $ucd,
5638                             );
5639         }
5640
5641         return $self;
5642     }
5643
5644     # Here are the methods that are required to be defined by any derived
5645     # class
5646     for my $sub (qw(
5647                     handle_special_range
5648                     append_to_body
5649                     pre_body
5650                 ))
5651                 # write() knows how to write out normal ranges, but it calls
5652                 # handle_special_range() when it encounters a non-normal one.
5653                 # append_to_body() is called by it after it has handled all
5654                 # ranges to add anything after the main portion of the table.
5655                 # And finally, pre_body() is called after all this to build up
5656                 # anything that should appear before the main portion of the
5657                 # table.  Doing it this way allows things in the middle to
5658                 # affect what should appear before the main portion of the
5659                 # table.
5660     {
5661         no strict "refs";
5662         *$sub = sub {
5663             Carp::my_carp_bug( __LINE__
5664                               . ": Must create method '$sub()' for "
5665                               . ref shift);
5666             return;
5667         }
5668     }
5669
5670     use overload
5671         fallback => 0,
5672         "." => \&main::_operator_dot,
5673         ".=" => \&main::_operator_dot_equal,
5674         '!=' => \&main::_operator_not_equal,
5675         '==' => \&main::_operator_equal,
5676     ;
5677
5678     sub ranges {
5679         # Returns the array of ranges associated with this table.
5680
5681         no overloading;
5682         return $range_list{pack 'J', shift}->ranges;
5683     }
5684
5685     sub add_alias {
5686         # Add a synonym for this table.
5687
5688         return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5689
5690         my $self = shift;
5691         my $name = shift;       # The name to add.
5692         my $pointer = shift;    # What the alias hash should point to.  For
5693                                 # map tables, this is the parent property;
5694                                 # for match tables, it is the table itself.
5695
5696         my %args = @_;
5697         my $loose_match = delete $args{'Fuzzy'};
5698
5699         my $ok_as_filename = delete $args{'OK_as_Filename'};
5700         $ok_as_filename = 1 unless defined $ok_as_filename;
5701
5702         # An internal name does not get documented, unless overridden by the
5703         # input; same for making tests for it.
5704         my $status = delete $args{'Status'} || (($name =~ /^_/)
5705                                                 ? $INTERNAL_ALIAS
5706                                                 : $NORMAL);
5707         my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5708                                             // (($status ne $INTERNAL_ALIAS)
5709                                                ? (($name =~ /^_/) ? $NO : $YES)
5710                                                : $NO);
5711         my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5712
5713         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5714
5715         # Capitalize the first letter of the alias unless it is one of the CJK
5716         # ones which specifically begins with a lower 'k'.  Do this because
5717         # Unicode has varied whether they capitalize first letters or not, and
5718         # have later changed their minds and capitalized them, but not the
5719         # other way around.  So do it always and avoid changes from release to
5720         # release
5721         $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5722
5723         my $addr = do { no overloading; pack 'J', $self; };
5724
5725         # Figure out if should be loosely matched if not already specified.
5726         if (! defined $loose_match) {
5727
5728             # Is a loose_match if isn't null, and doesn't begin with an
5729             # underscore and isn't just a number
5730             if ($name ne ""
5731                 && substr($name, 0, 1) ne '_'
5732                 && $name !~ qr{^[0-9_.+-/]+$})
5733             {
5734                 $loose_match = 1;
5735             }
5736             else {
5737                 $loose_match = 0;
5738             }
5739         }
5740
5741         # If this alias has already been defined, do nothing.
5742         return if defined $find_table_from_alias{$addr}->{$name};
5743
5744         # That includes if it is standardly equivalent to an existing alias,
5745         # in which case, add this name to the list, so won't have to search
5746         # for it again.
5747         my $standard_name = main::standardize($name);
5748         if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5749             $find_table_from_alias{$addr}->{$name}
5750                         = $find_table_from_alias{$addr}->{$standard_name};
5751             return;
5752         }
5753
5754         # Set the index hash for this alias for future quick reference.
5755         $find_table_from_alias{$addr}->{$name} = $pointer;
5756         $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5757         local $to_trace = 0 if main::DEBUG;
5758         trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5759         trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5760
5761
5762         # Put the new alias at the end of the list of aliases unless the final
5763         # element begins with an underscore (meaning it is for internal perl
5764         # use) or is all numeric, in which case, put the new one before that
5765         # one.  This floats any all-numeric or underscore-beginning aliases to
5766         # the end.  This is done so that they are listed last in output lists,
5767         # to encourage the user to use a better name (either more descriptive
5768         # or not an internal-only one) instead.  This ordering is relied on
5769         # implicitly elsewhere in this program, like in short_name()
5770         my $list = $aliases{$addr};
5771         my $insert_position = (@$list == 0
5772                                 || (substr($list->[-1]->name, 0, 1) ne '_'
5773                                     && $list->[-1]->name =~ /\D/))
5774                             ? @$list
5775                             : @$list - 1;
5776         splice @$list,
5777                 $insert_position,
5778                 0,
5779                 Alias->new($name, $loose_match, $make_re_pod_entry,
5780                            $ok_as_filename, $status, $ucd);
5781
5782         # This name may be shorter than any existing ones, so clear the cache
5783         # of the shortest, so will have to be recalculated.
5784         no overloading;
5785         undef $short_name{pack 'J', $self};
5786         return;
5787     }
5788
5789     sub short_name {
5790         # Returns a name suitable for use as the base part of a file name.
5791         # That is, shorter wins.  It can return undef if there is no suitable
5792         # name.  The name has all non-essential underscores removed.
5793
5794         # The optional second parameter is a reference to a scalar in which
5795         # this routine will store the length the returned name had before the
5796         # underscores were removed, or undef if the return is undef.
5797
5798         # The shortest name can change if new aliases are added.  So using
5799         # this should be deferred until after all these are added.  The code
5800         # that does that should clear this one's cache.
5801         # Any name with alphabetics is preferred over an all numeric one, even
5802         # if longer.
5803
5804         my $self = shift;
5805         my $nominal_length_ptr = shift;
5806         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5807
5808         my $addr = do { no overloading; pack 'J', $self; };
5809
5810         # For efficiency, don't recalculate, but this means that adding new
5811         # aliases could change what the shortest is, so the code that does
5812         # that needs to undef this.
5813         if (defined $short_name{$addr}) {
5814             if ($nominal_length_ptr) {
5815                 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5816             }
5817             return $short_name{$addr};
5818         }
5819
5820         # Look at each alias
5821         my $is_last_resort = 0;
5822         my $deprecated_or_discouraged
5823                                 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5824         foreach my $alias ($self->aliases()) {
5825
5826             # Don't use an alias that isn't ok to use for an external name.
5827             next if ! $alias->ok_as_filename;
5828
5829             my $name = main::Standardize($alias->name);
5830             trace $self, $name if main::DEBUG && $to_trace;
5831
5832             # Take the first one, or any non-deprecated non-discouraged one
5833             # over one that is, or a shorter one that isn't numeric.  This
5834             # relies on numeric aliases always being last in the array
5835             # returned by aliases().  Any alpha one will have precedence.
5836             if (   ! defined $short_name{$addr}
5837                 || (   $is_last_resort
5838                     && $alias->status !~ $deprecated_or_discouraged)
5839                 || ($name =~ /\D/
5840                     && length($name) < length($short_name{$addr})))
5841             {
5842                 # Remove interior underscores.
5843                 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5844
5845                 $nominal_short_name_length{$addr} = length $name;
5846                 $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5847             }
5848         }
5849
5850         # If the short name isn't a nice one, perhaps an equivalent table has
5851         # a better one.
5852         if (   $self->can('children')
5853             && (   ! defined $short_name{$addr}
5854                 || $short_name{$addr} eq ""
5855                 || $short_name{$addr} eq "_"))
5856         {
5857             my $return;
5858             foreach my $follower ($self->children) {    # All equivalents
5859                 my $follower_name = $follower->short_name;
5860                 next unless defined $follower_name;
5861
5862                 # Anything (except undefined) is better than underscore or
5863                 # empty
5864                 if (! defined $return || $return eq "_") {
5865                     $return = $follower_name;
5866                     next;
5867                 }
5868
5869                 # If the new follower name isn't "_" and is shorter than the
5870                 # current best one, prefer the new one.
5871                 next if $follower_name eq "_";
5872                 next if length $follower_name > length $return;
5873                 $return = $follower_name;
5874             }
5875             $short_name{$addr} = $return if defined $return;
5876         }
5877
5878         # If no suitable external name return undef
5879         if (! defined $short_name{$addr}) {
5880             $$nominal_length_ptr = undef if $nominal_length_ptr;
5881             return;
5882         }
5883
5884         # Don't allow a null short name.
5885         if ($short_name{$addr} eq "") {
5886             $short_name{$addr} = '_';
5887             $nominal_short_name_length{$addr} = 1;
5888         }
5889
5890         trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5891
5892         if ($nominal_length_ptr) {
5893             $$nominal_length_ptr = $nominal_short_name_length{$addr};
5894         }
5895         return $short_name{$addr};
5896     }
5897
5898     sub external_name {
5899         # Returns the external name that this table should be known by.  This
5900         # is usually the short_name, but not if the short_name is undefined,
5901         # in which case the external_name is arbitrarily set to the
5902         # underscore.
5903
5904         my $self = shift;
5905         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5906
5907         my $short = $self->short_name;
5908         return $short if defined $short;
5909
5910         return '_';
5911     }
5912
5913     sub add_description { # Adds the parameter as a short description.
5914
5915         my $self = shift;
5916         my $description = shift;
5917         chomp $description;
5918         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5919
5920         no overloading;
5921         push @{$description{pack 'J', $self}}, $description;
5922
5923         return;
5924     }
5925
5926     sub add_note { # Adds the parameter as a short note.
5927
5928         my $self = shift;
5929         my $note = shift;
5930         chomp $note;
5931         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5932
5933         no overloading;
5934         push @{$note{pack 'J', $self}}, $note;
5935
5936         return;
5937     }
5938
5939     sub add_comment { # Adds the parameter as a comment.
5940
5941         return unless $debugging_build;
5942
5943         my $self = shift;
5944         my $comment = shift;
5945         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5946
5947         chomp $comment;
5948
5949         no overloading;
5950         push @{$comment{pack 'J', $self}}, $comment;
5951
5952         return;
5953     }
5954
5955     sub comment {
5956         # Return the current comment for this table.  If called in list
5957         # context, returns the array of comments.  In scalar, returns a string
5958         # of each element joined together with a period ending each.
5959
5960         my $self = shift;
5961         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5962
5963         my $addr = do { no overloading; pack 'J', $self; };
5964         my @list = @{$comment{$addr}};
5965         return @list if wantarray;
5966         my $return = "";
5967         foreach my $sentence (@list) {
5968             $return .= '.  ' if $return;
5969             $return .= $sentence;
5970             $return =~ s/\.$//;
5971         }
5972         $return .= '.' if $return;
5973         return $return;
5974     }
5975
5976     sub initialize {
5977         # Initialize the table with the argument which is any valid
5978         # initialization for range lists.
5979
5980         my $self = shift;
5981         my $addr = do { no overloading; pack 'J', $self; };
5982         my $initialization = shift;
5983         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5984
5985         # Replace the current range list with a new one of the same exact
5986         # type.
5987         my $class = ref $range_list{$addr};
5988         $range_list{$addr} = $class->new(Owner => $self,
5989                                         Initialize => $initialization);
5990         return;
5991
5992     }
5993
5994     sub header {
5995         # The header that is output for the table in the file it is written
5996         # in.
5997
5998         my $self = shift;
5999         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6000
6001         my $return = "";
6002         $return .= $DEVELOPMENT_ONLY if $compare_versions;
6003         $return .= $HEADER;
6004         return $return;
6005     }
6006
6007     sub merge_single_annotation_line ($$$) {
6008         my ($output, $annotation, $annotation_column) = @_;
6009
6010         # This appends an annotation comment, $annotation, to $output,
6011         # starting in or after column $annotation_column, removing any
6012         # pre-existing comment from $output.
6013
6014         $annotation =~ s/^ \s* \# \  //x;
6015         $output =~ s/ \s* ( \# \N* )? \n //x;
6016         $output = Text::Tabs::expand($output);
6017
6018         my $spaces = $annotation_column - length $output;
6019         $spaces = 2 if $spaces < 0;  # Have 2 blanks before the comment
6020
6021         $output = sprintf "%s%*s# %s",
6022                             $output,
6023                             $spaces,
6024                             " ",
6025                             $annotation;
6026         return Text::Tabs::unexpand $output;
6027     }
6028
6029     sub write {
6030         # Write a representation of the table to its file.  It calls several
6031         # functions furnished by sub-classes of this abstract base class to
6032         # handle non-normal ranges, to add stuff before the table, and at its
6033         # end.  If the table is to be written so that adjustments are
6034         # required, this does that conversion.
6035
6036         my $self = shift;
6037         my $use_adjustments = shift; # ? output in adjusted format or not
6038         my $suppress_value = shift;  # Optional, if the value associated with
6039                                      # a range equals this one, don't write
6040                                      # the range
6041         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6042
6043         my $addr = do { no overloading; pack 'J', $self; };
6044         my $write_as_invlist = $write_as_invlist{$addr};
6045
6046         # Start with the header
6047         my @HEADER = $self->header;
6048
6049         # Then the comments
6050         push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
6051                                                         if $comment{$addr};
6052
6053         # Things discovered processing the main body of the document may
6054         # affect what gets output before it, therefore pre_body() isn't called
6055         # until after all other processing of the table is done.
6056
6057         # The main body looks like a 'here' document.  If there are comments,
6058         # get rid of them when processing it.
6059         my @OUT;
6060         if ($annotate || $output_range_counts) {
6061             # Use the line below in Perls that don't have /r
6062             #push @OUT, 'return join "\n",  map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
6063             push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
6064         } else {
6065             push @OUT, "return <<'END';\n";
6066         }
6067
6068         if ($range_list{$addr}->is_empty) {
6069
6070             # This is a kludge for empty tables to silence a warning in
6071             # utf8.c, which can't really deal with empty tables, but it can
6072             # deal with a table that matches nothing, as the inverse of 'All'
6073             # does.
6074             push @OUT, "!utf8::All\n";
6075         }
6076         elsif ($self->name eq 'N'
6077
6078                # To save disk space and table cache space, avoid putting out
6079                # binary N tables, but instead create a file which just inverts
6080                # the Y table.  Since the file will still exist and occupy a
6081                # certain number of blocks, might as well output the whole
6082                # thing if it all will fit in one block.   The number of
6083                # ranges below is an approximate number for that.
6084                && ($self->property->type == $BINARY
6085                    || $self->property->type == $FORCED_BINARY)
6086                # && $self->property->tables == 2  Can't do this because the
6087                #        non-binary properties, like NFDQC aren't specifiable
6088                #        by the notation
6089                && $range_list{$addr}->ranges > 15
6090                && ! $annotate)  # Under --annotate, want to see everything
6091         {
6092             push @OUT, "!utf8::" . $self->property->name . "\n";
6093         }
6094         else {
6095             my $range_size_1 = $range_size_1{$addr};
6096
6097             # To make it more readable, use a minimum indentation
6098             my $comment_indent;
6099
6100             # These are used only in $annotate option
6101             my $format;         # e.g. $HEX_ADJUST_FORMAT
6102             my $include_name;   # ? Include the character's name in the
6103                                 # annotation?
6104             my $include_cp;     # ? Include its code point
6105
6106             if (! $annotate) {
6107                 $comment_indent = ($self->isa('Map_Table'))
6108                                   ? 24
6109                                   : ($write_as_invlist)
6110                                     ? 8
6111                                     : 16;
6112             }
6113             else {
6114                 $format = $self->format;
6115
6116                 # The name of the character is output only for tables that
6117                 # don't already include the name in the output.
6118                 my $property = $self->property;
6119                 $include_name =
6120                     !  ($property == $perl_charname
6121                         || $property == main::property_ref('Unicode_1_Name')
6122                         || $property == main::property_ref('Name')
6123                         || $property == main::property_ref('Name_Alias')
6124                        );
6125
6126                 # Don't include the code point in the annotation where all
6127                 # lines are a single code point, so it can be easily found in
6128                 # the first column
6129                 $include_cp = ! $range_size_1;
6130
6131                 if (! $self->isa('Map_Table')) {
6132                     $comment_indent = ($write_as_invlist) ? 8 : 16;
6133                 }
6134                 else {
6135                     $comment_indent = 16;
6136
6137                     # There are just a few short ranges in this table, so no
6138                     # need to include the code point in the annotation.
6139                     $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
6140
6141                     # We're trying to get this to look good, as the whole
6142                     # point is to make human-readable tables.  It is easier to
6143                     # read if almost all the annotation comments begin in the
6144                     # same column.  Map tables have varying width maps, so can
6145                     # create a jagged comment appearance.  This code does a
6146                     # preliminary pass through these tables looking for the
6147                     # maximum width map in each, and causing the comments to
6148                     # begin just to the right of that.  However, if the
6149                     # comments begin too far to the right of most lines, it's
6150                     # hard to line them up horizontally with their real data.
6151                     # Therefore we ignore the longest outliers
6152                     my $ignore_longest_X_percent = 2;  # Discard longest X%
6153
6154                     # Each key in this hash is a width of at least one of the
6155                     # maps in the table.  Its value is how many lines have
6156                     # that width.
6157                     my %widths;
6158
6159                     # We won't space things further left than one tab stop
6160                     # after the rest of the line; initializing it to that
6161                     # number saves some work.
6162                     my $max_map_width = 8;
6163
6164                     # Fill in the %widths hash
6165                     my $total = 0;
6166                     for my $set ($range_list{$addr}->ranges) {
6167                         my $value = $set->value;
6168
6169                         # These range types don't appear in the main table
6170                         next if $set->type == 0
6171                                 && defined $suppress_value
6172                                 && $value eq $suppress_value;
6173                         next if $set->type == $MULTI_CP
6174                                 || $set->type == $NULL;
6175
6176                         # Include 2 spaces before the beginning of the
6177                         # comment
6178                         my $this_width = length($value) + 2;
6179
6180                         # Ranges of the remaining non-zero types usually
6181                         # occupy just one line (maybe occasionally two, but
6182                         # this doesn't have to be dead accurate).  This is
6183                         # because these ranges are like "unassigned code
6184                         # points"
6185                         my $count = ($set->type != 0)
6186                                     ? 1
6187                                     : $set->end - $set->start + 1;
6188                         $widths{$this_width} += $count;
6189                         $total += $count;
6190                         $max_map_width = $this_width
6191                                             if $max_map_width < $this_width;
6192                     }
6193
6194                     # If the widest map gives us less than two tab stops
6195                     # worth, just take it as-is.
6196                     if ($max_map_width > 16) {
6197
6198                         # Otherwise go through %widths until we have included
6199                         # the desired percentage of lines in the whole table.
6200                         my $running_total = 0;
6201                         foreach my $width (sort { $a <=> $b } keys %widths)
6202                         {
6203                             $running_total += $widths{$width};
6204                             use integer;
6205                             if ($running_total * 100 / $total
6206                                             >= 100 - $ignore_longest_X_percent)
6207                             {
6208                                 $max_map_width = $width;
6209                                 last;
6210                             }
6211                         }
6212                     }
6213                     $comment_indent += $max_map_width;
6214                 }
6215             }
6216
6217             # Values for previous time through the loop.  Initialize to
6218             # something that won't be adjacent to the first iteration;
6219             # only $previous_end matters for that.
6220             my $previous_start;
6221             my $previous_end = -2;
6222             my $previous_value;
6223
6224             # Values for next time through the portion of the loop that splits
6225             # the range.  0 in $next_start means there is no remaining portion
6226             # to deal with.
6227             my $next_start = 0;
6228             my $next_end;
6229             my $next_value;
6230             my $offset = 0;
6231             my $invlist_count = 0;
6232
6233             my $output_value_in_hex = $self->isa('Map_Table')
6234                                 && ($self->format eq $HEX_ADJUST_FORMAT
6235                                     || $self->to_output_map == $EXTERNAL_MAP);
6236             # Use leading zeroes just for files whose format should not be
6237             # changed from what it has been.  Otherwise, they just take up
6238             # space and time to process.
6239             my $hex_format = ($self->isa('Map_Table')
6240                               && $self->to_output_map == $EXTERNAL_MAP)
6241                              ? "%04X"
6242                              : "%X";
6243
6244             # The values for some of these tables are stored in mktables as
6245             # hex strings.  Normally, these are just output as strings without
6246             # change, but when we are doing adjustments, we have to operate on
6247             # these numerically, so we convert those to decimal to do that,
6248             # and back to hex for output
6249             my $convert_map_to_from_hex = 0;
6250             my $output_map_in_hex = 0;
6251             if ($self->isa('Map_Table')) {
6252                 $convert_map_to_from_hex
6253                    = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6254                       || ($annotate && $self->format eq $HEX_FORMAT);
6255                 $output_map_in_hex = $convert_map_to_from_hex
6256                                  || $self->format eq $HEX_FORMAT;
6257             }
6258
6259             # To store any annotations about the characters.
6260             my @annotation;
6261
6262             # Output each range as part of the here document.
6263             RANGE:
6264             for my $set ($range_list{$addr}->ranges) {
6265                 if ($set->type != 0) {
6266                     $self->handle_special_range($set);
6267                     next RANGE;
6268                 }
6269                 my $start = $set->start;
6270                 my $end   = $set->end;
6271                 my $value  = $set->value;
6272
6273                 # Don't output ranges whose value is the one to suppress
6274                 next RANGE if defined $suppress_value
6275                               && $value eq $suppress_value;
6276
6277                 $value = CORE::hex $value if $convert_map_to_from_hex;
6278
6279
6280                 {   # This bare block encloses the scope where we may need to
6281                     # 'redo' to.  Consider a table that is to be written out
6282                     # using single item ranges.  This is given in the
6283                     # $range_size_1 boolean.  To accomplish this, we split the
6284                     # range each time through the loop into two portions, the
6285                     # first item, and the rest.  We handle that first item
6286                     # this time in the loop, and 'redo' to repeat the process
6287                     # for the rest of the range.
6288                     #
6289                     # We may also have to do it, with other special handling,
6290                     # if the table has adjustments.  Consider the table that
6291                     # contains the lowercasing maps.  mktables stores the
6292                     # ASCII range ones as 26 ranges:
6293                     #       ord('A') => ord('a'), .. ord('Z') => ord('z')
6294                     # For compactness, the table that gets written has this as
6295                     # just one range
6296                     #       ( ord('A') .. ord('Z') ) => ord('a')
6297                     # and the software that reads the tables is smart enough
6298                     # to "connect the dots".  This change is accomplished in
6299                     # this loop by looking to see if the current iteration
6300                     # fits the paradigm of the previous iteration, and if so,
6301                     # we merge them by replacing the final output item with
6302                     # the merged data.  Repeated 25 times, this gets A-Z.  But
6303                     # we also have to make sure we don't screw up cases where
6304                     # we have internally stored
6305                     #       ( 0x1C4 .. 0x1C6 ) => 0x1C5
6306                     # This single internal range has to be output as 3 ranges,
6307                     # which is done by splitting, like we do for $range_size_1
6308                     # tables.  (There are very few of such ranges that need to
6309                     # be split, so the gain of doing the combining of other
6310                     # ranges far outweighs the splitting of these.)  The
6311                     # values to use for the redo at the end of this block are
6312                     # set up just below in the scalars whose names begin with
6313                     # '$next_'.
6314
6315                     if (($use_adjustments || $range_size_1) && $end != $start)
6316                     {
6317                         $next_start = $start + 1;
6318                         $next_end = $end;
6319                         $next_value = $value;
6320                         $end = $start;
6321                     }
6322
6323                     if ($use_adjustments && ! $range_size_1) {
6324
6325                         # If this range is adjacent to the previous one, and
6326                         # the values in each are integers that are also
6327                         # adjacent (differ by 1), then this range really
6328                         # extends the previous one that is already in element
6329                         # $OUT[-1].  So we pop that element, and pretend that
6330                         # the range starts with whatever it started with.
6331                         # $offset is incremented by 1 each time so that it
6332                         # gives the current offset from the first element in
6333                         # the accumulating range, and we keep in $value the
6334                         # value of that first element.
6335                         if ($start == $previous_end + 1
6336                             && $value =~ /^ -? \d+ $/xa
6337                             && $previous_value =~ /^ -? \d+ $/xa
6338                             && ($value == ($previous_value + ++$offset)))
6339                         {
6340                             pop @OUT;
6341                             $start = $previous_start;
6342                             $value = $previous_value;
6343                         }
6344                         else {
6345                             $offset = 0;
6346                             if (@annotation == 1) {
6347                                 $OUT[-1] = merge_single_annotation_line(
6348                                     $OUT[-1], $annotation[0], $comment_indent);
6349                             }
6350                             else {
6351                                 push @OUT, @annotation;
6352                             }
6353                         }
6354                         undef @annotation;
6355
6356                         # Save the current values for the next time through
6357                         # the loop.
6358                         $previous_start = $start;
6359                         $previous_end = $end;
6360                         $previous_value = $value;
6361                     }
6362
6363                     if ($write_as_invlist) {
6364                         if (   $previous_end > 0
6365                             && $output_range_counts{$addr})
6366                         {
6367                             my $complement_count = $start - $previous_end - 1;
6368                             if ($complement_count > 1) {
6369                                 $OUT[-1] = merge_single_annotation_line(
6370                                     $OUT[-1],
6371                                        "#"
6372                                      . (" " x 17)
6373                                      . "["
6374                                      .  main::clarify_code_point_count(
6375                                                             $complement_count)
6376                                       . "] in complement\n",
6377                                     $comment_indent);
6378                             }
6379                         }
6380
6381                         # Inversion list format has a single number per line,
6382                         # the starting code point of a range that matches the
6383                         # property
6384                         push @OUT, $start, "\n";
6385                         $invlist_count++;
6386
6387                         # Add a comment with the size of the range, if
6388                         # requested.
6389                         if ($output_range_counts{$addr}) {
6390                             $OUT[-1] = merge_single_annotation_line(
6391                                     $OUT[-1],
6392                                     "# ["
6393                                       . main::clarify_code_point_count($end - $start + 1)
6394                                       . "]\n",
6395                                     $comment_indent);
6396                         }
6397                     }
6398                     elsif ($start != $end) { # If there is a range
6399                         if ($end == $MAX_WORKING_CODEPOINT) {
6400                             push @OUT, sprintf "$hex_format\t$hex_format",
6401                                                 $start,
6402                                                 $MAX_PLATFORM_CODEPOINT;
6403                         }
6404                         else {
6405                             push @OUT, sprintf "$hex_format\t$hex_format",
6406                                                 $start,       $end;
6407                         }
6408                         if (length $value) {
6409                             if ($convert_map_to_from_hex) {
6410                                 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6411                             }
6412                             else {
6413                                 $OUT[-1] .= "\t$value\n";
6414                             }
6415                         }
6416
6417                         # Add a comment with the size of the range, if
6418                         # requested.
6419                         if ($output_range_counts{$addr}) {
6420                             $OUT[-1] = merge_single_annotation_line(
6421                                     $OUT[-1],
6422                                     "# ["
6423                                       . main::clarify_code_point_count($end - $start + 1)
6424                                       . "]\n",
6425                                     $comment_indent);
6426                         }
6427                     }
6428                     else { # Here to output a single code point per line.
6429
6430                         # Use any passed in subroutine to output.
6431                         if (ref $range_size_1 eq 'CODE') {
6432                             for my $i ($start .. $end) {
6433                                 push @OUT, &{$range_size_1}($i, $value);
6434                             }
6435                         }
6436                         else {
6437
6438                             # Here, caller is ok with default output.
6439                             for (my $i = $start; $i <= $end; $i++) {
6440                                 if ($convert_map_to_from_hex) {
6441                                     push @OUT,
6442                                         sprintf "$hex_format\t\t$hex_format\n",
6443                                                  $i,            $value;
6444                                 }
6445                                 else {
6446                                     push @OUT, sprintf $hex_format, $i;
6447                                     $OUT[-1] .= "\t\t$value" if $value ne "";
6448                                     $OUT[-1] .= "\n";
6449                                 }
6450                             }
6451                         }
6452                     }
6453
6454                     if ($annotate) {
6455                         for (my $i = $start; $i <= $end; $i++) {
6456                             my $annotation = "";
6457
6458                             # Get character information if don't have it already
6459                             main::populate_char_info($i)
6460                                                      if ! defined $viacode[$i];
6461                             my $type = $annotate_char_type[$i];
6462
6463                             # Figure out if should output the next code points
6464                             # as part of a range or not.  If this is not in an
6465                             # annotation range, then won't output as a range,
6466                             # so returns $i.  Otherwise use the end of the
6467                             # annotation range, but no further than the
6468                             # maximum possible end point of the loop.
6469                             my $range_end =
6470                                         $range_size_1
6471                                         ? $start
6472                                         : main::min(
6473                                           $annotate_ranges->value_of($i) || $i,
6474                                           $end);
6475
6476                             # Use a range if it is a range, and either is one
6477                             # of the special annotation ranges, or the range
6478                             # is at most 3 long.  This last case causes the
6479                             # algorithmically named code points to be output
6480                             # individually in spans of at most 3, as they are
6481                             # the ones whose $type is > 0.
6482                             if ($range_end != $i
6483                                 && ( $type < 0 || $range_end - $i > 2))
6484                             {
6485                                 # Here is to output a range.  We don't allow a
6486                                 # caller-specified output format--just use the
6487                                 # standard one.
6488                                 my $range_name = $viacode[$i];
6489
6490                                 # For the code points which end in their hex
6491                                 # value, we eliminate that from the output
6492                                 # annotation, and capitalize only the first
6493                                 # letter of each word.
6494                                 if ($type == $CP_IN_NAME) {
6495                                     my $hex = sprintf $hex_format, $i;
6496                                     $range_name =~ s/-$hex$//;
6497                                     my @words = split " ", $range_name;
6498                                     for my $word (@words) {
6499                                         $word =
6500                                           ucfirst(lc($word)) if $word ne 'CJK';
6501                                     }
6502                                     $range_name = join " ", @words;
6503                                 }
6504                                 elsif ($type == $HANGUL_SYLLABLE) {
6505                                     $range_name = "Hangul Syllable";
6506                                 }
6507
6508                                 # If the annotation would just repeat what's
6509                                 # already being output as the range, skip it.
6510                                 # (When an inversion list is being written, it
6511                                 # isn't a repeat, as that always is in
6512                                 # decimal)
6513                                 if (   $write_as_invlist
6514                                     || $i != $start
6515                                     || $range_end < $end)
6516                                 {
6517                                     if ($range_end < $MAX_WORKING_CODEPOINT)
6518                                     {
6519                                         $annotation = sprintf "%04X..%04X",
6520                                                               $i,   $range_end;
6521                                     }
6522                                     else {
6523                                         $annotation = sprintf "%04X..INFINITY",
6524                                                                $i;
6525                                     }
6526                                 }
6527                                 else { # Indent if not displaying code points
6528                                     $annotation = " " x 4;
6529                                 }
6530
6531                                 if ($range_name) {
6532                                     $annotation .= " $age[$i]" if $age[$i];
6533                                     $annotation .= " $range_name";
6534                                 }
6535
6536                                 # Include the number of code points in the
6537                                 # range
6538                                 my $count =
6539                                     main::clarify_code_point_count($range_end - $i + 1);
6540                                 $annotation .= " [$count]\n";
6541
6542                                 # Skip to the end of the range
6543                                 $i = $range_end;
6544                             }
6545                             else { # Not in a range.
6546                                 my $comment = "";
6547
6548                                 # When outputting the names of each character,
6549                                 # use the character itself if printable
6550                                 $comment .= "'" . main::display_chr($i) . "' "
6551                                                             if $printable[$i];
6552
6553                                 my $output_value = $value;
6554
6555                                 # Determine the annotation
6556                                 if ($format eq $DECOMP_STRING_FORMAT) {
6557
6558                                     # This is very specialized, with the type
6559                                     # of decomposition beginning the line
6560                                     # enclosed in <...>, and the code points
6561                                     # that the code point decomposes to
6562                                     # separated by blanks.  Create two
6563                                     # strings, one of the printable
6564                                     # characters, and one of their official
6565                                     # names.
6566                                     (my $map = $output_value)
6567                                                     =~ s/ \ * < .*? > \ +//x;
6568                                     my $tostr = "";
6569                                     my $to_name = "";
6570                                     my $to_chr = "";
6571                                     foreach my $to (split " ", $map) {
6572                                         $to = CORE::hex $to;
6573                                         $to_name .= " + " if $to_name;
6574                                         $to_chr .= main::display_chr($to);
6575                                         main::populate_char_info($to)
6576                                                     if ! defined $viacode[$to];
6577                                         $to_name .=  $viacode[$to];
6578                                     }
6579
6580                                     $comment .=
6581                                     "=> '$to_chr'; $viacode[$i] => $to_name";
6582                                 }
6583                                 else {
6584                                     $output_value += $i - $start
6585                                                    if $use_adjustments
6586                                                       # Don't try to adjust a
6587                                                       # non-integer
6588                                                    && $output_value !~ /[-\D]/;
6589
6590                                     if ($output_map_in_hex) {
6591                                         main::populate_char_info($output_value)
6592                                           if ! defined $viacode[$output_value];
6593                                         $comment .= " => '"
6594                                         . main::display_chr($output_value)
6595                                         . "'; " if $printable[$output_value];
6596                                     }
6597                                     if ($include_name && $viacode[$i]) {
6598                                         $comment .= " " if $comment;
6599                                         $comment .= $viacode[$i];
6600                                     }
6601                                     if ($output_map_in_hex) {
6602                                         $comment .=
6603                                                 " => $viacode[$output_value]"
6604                                                     if $viacode[$output_value];
6605                                         $output_value = sprintf($hex_format,
6606                                                                 $output_value);
6607                                     }
6608                                 }
6609
6610                                 if ($include_cp) {
6611                                     $annotation = sprintf "%04X %s", $i, $age[$i];
6612                                     if ($use_adjustments) {
6613                                         $annotation .= " => $output_value";
6614                                     }
6615                                 }
6616
6617                                 if ($comment ne "") {
6618                                     $annotation .= " " if $annotation ne "";
6619                                     $annotation .= $comment;
6620                                 }
6621                                 $annotation .= "\n" if $annotation ne "";
6622                             }
6623
6624                             if ($annotation ne "") {
6625                                 push @annotation, (" " x $comment_indent)
6626                                                   .  "# $annotation";
6627                             }
6628                         }
6629
6630                         # If not adjusting, we don't have to go through the
6631                         # loop again to know that the annotation comes next
6632                         # in the output.
6633                         if (! $use_adjustments) {
6634                             if (@annotation == 1) {
6635                                 $OUT[-1] = merge_single_annotation_line(
6636                                     $OUT[-1], $annotation[0], $comment_indent);
6637                             }
6638                             else {
6639                                 push @OUT, map { Text::Tabs::unexpand $_ }
6640                                                @annotation;
6641                             }
6642                             undef @annotation;
6643                         }
6644                     }
6645
6646                     # Add the beginning of the range that doesn't match the
6647                     # property, except if the just added match range extends
6648                     # to infinity.  We do this after any annotations for the
6649                     # match range.
6650                     if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6651                         push @OUT, $end + 1, "\n";
6652                         $invlist_count++;
6653                     }
6654
6655                     # If we split the range, set up so the next time through
6656                     # we get the remainder, and redo.
6657                     if ($next_start) {
6658                         $start = $next_start;
6659                         $end = $next_end;
6660                         $value = $next_value;
6661                         $next_start = 0;
6662                         redo;
6663                     }
6664                 }
6665             } # End of loop through all the table's ranges
6666
6667             push @OUT, @annotation; # Add orphaned annotation, if any
6668
6669             splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6670         }
6671
6672         # Add anything that goes after the main body, but within the here
6673         # document,
6674         my $append_to_body = $self->append_to_body;
6675         push @OUT, $append_to_body if $append_to_body;
6676
6677         # And finish the here document.
6678         push @OUT, "END\n";
6679
6680         # Done with the main portion of the body.  Can now figure out what
6681         # should appear before it in the file.
6682         my $pre_body = $self->pre_body;
6683         push @HEADER, $pre_body, "\n" if $pre_body;
6684
6685         # All these files should have a .pl suffix added to them.
6686         my @file_with_pl = @{$file_path{$addr}};
6687         $file_with_pl[-1] .= '.pl';
6688
6689         main::write(\@file_with_pl,
6690                     $annotate,      # utf8 iff annotating
6691                     \@HEADER,
6692                     \@OUT);
6693         return;
6694     }
6695
6696     sub set_status {    # Set the table's status
6697         my $self = shift;
6698         my $status = shift; # The status enum value
6699         my $info = shift;   # Any message associated with it.
6700         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6701
6702         my $addr = do { no overloading; pack 'J', $self; };
6703
6704         $status{$addr} = $status;
6705         $status_info{$addr} = $info;
6706         return;
6707     }
6708
6709     sub set_fate {  # Set the fate of a table
6710         my $self = shift;
6711         my $fate = shift;
6712         my $reason = shift;
6713         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6714
6715         my $addr = do { no overloading; pack 'J', $self; };
6716
6717         return if $fate{$addr} == $fate;    # If no-op
6718
6719         # Can only change the ordinary fate, except if going to $MAP_PROXIED
6720         return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6721
6722         $fate{$addr} = $fate;
6723
6724         # Don't document anything to do with a non-normal fated table
6725         if ($fate != $ORDINARY) {
6726             my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6727             foreach my $alias ($self->aliases) {
6728                 $alias->set_ucd($put_in_pod);
6729
6730                 # MAP_PROXIED doesn't affect the match tables
6731                 next if $fate == $MAP_PROXIED;
6732                 $alias->set_make_re_pod_entry($put_in_pod);
6733             }
6734         }
6735
6736         # Save the reason for suppression for output
6737         if ($fate >= $SUPPRESSED) {
6738             $reason = "" unless defined $reason;
6739             $why_suppressed{$complete_name{$addr}} = $reason;
6740         }
6741
6742         return;
6743     }
6744
6745     sub lock {
6746         # Don't allow changes to the table from now on.  This stores a stack
6747         # trace of where it was called, so that later attempts to modify it
6748         # can immediately show where it got locked.
6749
6750         my $self = shift;
6751         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6752
6753         my $addr = do { no overloading; pack 'J', $self; };
6754
6755         $locked{$addr} = "";
6756
6757         my $line = (caller(0))[2];
6758         my $i = 1;
6759
6760         # Accumulate the stack trace
6761         while (1) {
6762             my ($pkg, $file, $caller_line, $caller) = caller $i++;
6763
6764             last unless defined $caller;
6765
6766             $locked{$addr} .= "    called from $caller() at line $line\n";
6767             $line = $caller_line;
6768         }
6769         $locked{$addr} .= "    called from main at line $line\n";
6770
6771         return;
6772     }
6773
6774     sub carp_if_locked {
6775         # Return whether a table is locked or not, and, by the way, complain
6776         # if is locked
6777
6778         my $self = shift;
6779         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6780
6781         my $addr = do { no overloading; pack 'J', $self; };
6782
6783         return 0 if ! $locked{$addr};
6784         Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6785         return 1;
6786     }
6787
6788     sub set_file_path { # Set the final directory path for this table
6789         my $self = shift;
6790         # Rest of parameters passed on
6791
6792         no overloading;
6793         @{$file_path{pack 'J', $self}} = @_;
6794         return
6795     }
6796
6797     # Accessors for the range list stored in this table.  First for
6798     # unconditional
6799     for my $sub (qw(
6800                     containing_range
6801                     contains
6802                     count
6803                     each_range
6804                     hash
6805                     is_empty
6806                     matches_identically_to
6807                     max
6808                     min
6809                     range_count
6810                     reset_each_range
6811                     type_of
6812                     value_of
6813                 ))
6814     {
6815         no strict "refs";
6816         *$sub = sub {
6817             use strict "refs";
6818             my $self = shift;
6819             return $self->_range_list->$sub(@_);
6820         }
6821     }
6822
6823     # Then for ones that should fail if locked
6824     for my $sub (qw(
6825                     delete_range
6826                 ))
6827     {
6828         no strict "refs";
6829         *$sub = sub {
6830             use strict "refs";
6831             my $self = shift;
6832
6833             return if $self->carp_if_locked;
6834             no overloading;
6835             return $self->_range_list->$sub(@_);
6836         }
6837     }
6838
6839 } # End closure
6840
6841 package Map_Table;
6842 use parent '-norequire', '_Base_Table';
6843
6844 # A Map Table is a table that contains the mappings from code points to
6845 # values.  There are two weird cases:
6846 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6847 #    are written in the table's file at the end of the table nonetheless.  It
6848 #    requires specially constructed code to handle these; utf8.c can not read
6849 #    these in, so they should not go in $map_directory.  As of this writing,
6850 #    the only case that these happen is for named sequences used in
6851 #    charnames.pm.   But this code doesn't enforce any syntax on these, so
6852 #    something else could come along that uses it.
6853 # 2) Specials are anything that doesn't fit syntactically into the body of the
6854 #    table.  The ranges for these have a map type of non-zero.  The code below
6855 #    knows about and handles each possible type.   In most cases, these are
6856 #    written as part of the header.
6857 #
6858 # A map table deliberately can't be manipulated at will unlike match tables.
6859 # This is because of the ambiguities having to do with what to do with
6860 # overlapping code points.  And there just isn't a need for those things;
6861 # what one wants to do is just query, add, replace, or delete mappings, plus
6862 # write the final result.
6863 # However, there is a method to get the list of possible ranges that aren't in
6864 # this table to use for defaulting missing code point mappings.  And,
6865 # map_add_or_replace_non_nulls() does allow one to add another table to this
6866 # one, but it is clearly very specialized, and defined that the other's
6867 # non-null values replace this one's if there is any overlap.
6868
6869 sub trace { return main::trace(@_); }
6870
6871 { # Closure
6872
6873     main::setup_package();
6874
6875     my %default_map;
6876     # Many input files omit some entries; this gives what the mapping for the
6877     # missing entries should be
6878     main::set_access('default_map', \%default_map, 'r');
6879
6880     my %anomalous_entries;
6881     # Things that go in the body of the table which don't fit the normal
6882     # scheme of things, like having a range.  Not much can be done with these
6883     # once there except to output them.  This was created to handle named
6884     # sequences.
6885     main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6886     main::set_access('anomalous_entries',       # Append singular, read plural
6887                     \%anomalous_entries,
6888                     'readable_array');
6889
6890     my %replacement_property;
6891     # Certain files are unused by Perl itself, and are kept only for backwards
6892     # compatibility for programs that used them before Unicode::UCD existed.
6893     # These are termed legacy properties.  At some point they may be removed,
6894     # but for now mark them as legacy.  If non empty, this is the name of the
6895     # property to use instead (i.e., the modern equivalent).
6896     main::set_access('replacement_property', \%replacement_property, 'r');
6897
6898     my %to_output_map;
6899     # Enum as to whether or not to write out this map table, and how:
6900     #   0               don't output
6901     #   $EXTERNAL_MAP   means its existence is noted in the documentation, and
6902     #                   it should not be removed nor its format changed.  This
6903     #                   is done for those files that have traditionally been
6904     #                   output.  Maps of legacy-only properties default to
6905     #                   this.
6906     #   $INTERNAL_MAP   means Perl reserves the right to do anything it wants
6907     #                   with this file
6908     #   $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6909     #                   outputting the actual mappings as-is, we adjust things
6910     #                   to create a much more compact table. Only those few
6911     #                   tables where the mapping is convertible at least to an
6912     #                   integer and compacting makes a big difference should
6913     #                   have this.  Hence, the default is to not do this
6914     #                   unless the table's default mapping is to $CODE_POINT,
6915     #                   and the range size is not 1.
6916     main::set_access('to_output_map', \%to_output_map, 's');
6917
6918     sub new {
6919         my $class = shift;
6920         my $name = shift;
6921
6922         my %args = @_;
6923
6924         # Optional initialization data for the table.
6925         my $initialize = delete $args{'Initialize'};
6926
6927         my $default_map = delete $args{'Default_Map'};
6928         my $property = delete $args{'_Property'};
6929         my $full_name = delete $args{'Full_Name'};
6930         my $replacement_property = delete $args{'Replacement_Property'} // "";
6931         my $to_output_map = delete $args{'To_Output_Map'};
6932
6933         # Rest of parameters passed on; legacy properties have several common
6934         # other attributes
6935         if ($replacement_property) {
6936             $args{"Fate"} = $LEGACY_ONLY;
6937             $args{"Range_Size_1"} = 1;
6938             $args{"Perl_Extension"} = 1;
6939             $args{"UCD"} = 0;
6940         }
6941
6942         my $range_list = Range_Map->new(Owner => $property);
6943
6944         my $self = $class->SUPER::new(
6945                                     Name => $name,
6946                                     Complete_Name =>  $full_name,
6947                                     Full_Name => $full_name,
6948                                     _Property => $property,
6949                                     _Range_List => $range_list,
6950                                     Write_As_Invlist => 0,
6951                                     %args);
6952
6953         my $addr = do { no overloading; pack 'J', $self; };
6954
6955         $anomalous_entries{$addr} = [];
6956         $default_map{$addr} = $default_map;
6957         $replacement_property{$addr} = $replacement_property;
6958         $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6959                                           && $replacement_property;
6960         $to_output_map{$addr} = $to_output_map;
6961
6962         $self->initialize($initialize) if defined $initialize;
6963
6964         return $self;
6965     }
6966
6967     use overload
6968         fallback => 0,
6969         qw("") => "_operator_stringify",
6970     ;
6971
6972     sub _operator_stringify {
6973         my $self = shift;
6974
6975         my $name = $self->property->full_name;
6976         $name = '""' if $name eq "";
6977         return "Map table for Property '$name'";
6978     }
6979
6980     sub add_alias {
6981         # Add a synonym for this table (which means the property itself)
6982         my $self = shift;
6983         my $name = shift;
6984         # Rest of parameters passed on.
6985
6986         $self->SUPER::add_alias($name, $self->property, @_);
6987         return;
6988     }
6989
6990     sub add_map {
6991         # Add a range of code points to the list of specially-handled code
6992         # points.  $MULTI_CP is assumed if the type of special is not passed
6993         # in.
6994
6995         my $self = shift;
6996         my $lower = shift;
6997         my $upper = shift;
6998         my $string = shift;
6999         my %args = @_;
7000
7001         my $type = delete $args{'Type'} || 0;
7002         # Rest of parameters passed on
7003
7004         # Can't change the table if locked.
7005         return if $self->carp_if_locked;
7006
7007         my $addr = do { no overloading; pack 'J', $self; };
7008
7009         $self->_range_list->add_map($lower, $upper,
7010                                     $string,
7011                                     @_,
7012                                     Type => $type);
7013         return;
7014     }
7015
7016     sub append_to_body {
7017         # Adds to the written HERE document of the table's body any anomalous
7018         # entries in the table..
7019
7020         my $self = shift;
7021         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7022
7023         my $addr = do { no overloading; pack 'J', $self; };
7024
7025         return "" unless @{$anomalous_entries{$addr}};
7026         return join("\n", @{$anomalous_entries{$addr}}) . "\n";
7027     }
7028
7029     sub map_add_or_replace_non_nulls {
7030         # This adds the mappings in the table $other to $self.  Non-null
7031         # mappings from $other override those in $self.  It essentially merges
7032         # the two tables, with the second having priority except for null
7033         # mappings.
7034
7035         my $self = shift;
7036         my $other = shift;
7037         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7038
7039         return if $self->carp_if_locked;
7040
7041         if (! $other->isa(__PACKAGE__)) {
7042             Carp::my_carp_bug("$other should be a "
7043                         . __PACKAGE__
7044                         . ".  Not a '"
7045                         . ref($other)
7046                         . "'.  Not added;");
7047             return;
7048         }
7049
7050         my $addr = do { no overloading; pack 'J', $self; };
7051         my $other_addr = do { no overloading; pack 'J', $other; };
7052
7053         local $to_trace = 0 if main::DEBUG;
7054
7055         my $self_range_list = $self->_range_list;
7056         my $other_range_list = $other->_range_list;
7057         foreach my $range ($other_range_list->ranges) {
7058             my $value = $range->value;
7059             next if $value eq "";
7060             $self_range_list->_add_delete('+',
7061                                           $range->start,
7062                                           $range->end,
7063                                           $value,
7064                                           Type => $range->type,
7065                                           Replace => $UNCONDITIONALLY);
7066         }
7067
7068         return;
7069     }
7070
7071     sub set_default_map {
7072         # Define what code points that are missing from the input files should
7073         # map to.  The optional second parameter 'full_name' indicates to
7074         # force using the full name of the map instead of its standard name.
7075
7076         my $self = shift;
7077         my $map = shift;
7078         my $use_full_name = shift // 0;
7079         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7080
7081         if ($use_full_name && $use_full_name ne 'full_name') {
7082             Carp::my_carp_bug("Second parameter to set_default_map() if"
7083                             . " present, must be 'full_name'");
7084         }
7085
7086         my $addr = do { no overloading; pack 'J', $self; };
7087
7088         # Convert the input to the standard equivalent, if any (won't have any
7089         # for $STRING properties)
7090         my $standard = $self->property->table($map);
7091         if (defined $standard) {
7092             $map = ($use_full_name)
7093                    ? $standard->full_name
7094                    : $standard->name;
7095         }
7096
7097         # Warn if there already is a non-equivalent default map for this
7098         # property.  Note that a default map can be a ref, which means that
7099         # what it actually means is delayed until later in the program, and it
7100         # IS permissible to override it here without a message.
7101         my $default_map = $default_map{$addr};
7102         if (defined $default_map
7103             && ! ref($default_map)
7104             && $default_map ne $map
7105             && main::Standardize($map) ne $default_map)
7106         {
7107             my $property = $self->property;
7108             my $map_table = $property->table($map);
7109             my $default_table = $property->table($default_map);
7110             if (defined $map_table
7111                 && defined $default_table
7112                 && $map_table != $default_table)
7113             {
7114                 Carp::my_carp("Changing the default mapping for "
7115                             . $property
7116                             . " from $default_map to $map'");
7117             }
7118         }
7119
7120         $default_map{$addr} = $map;
7121
7122         # Don't also create any missing table for this map at this point,
7123         # because if we did, it could get done before the main table add is
7124         # done for PropValueAliases.txt; instead the caller will have to make
7125         # sure it exists, if desired.
7126         return;
7127     }
7128
7129     sub to_output_map {
7130         # Returns boolean: should we write this map table?
7131
7132         my $self = shift;
7133         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7134
7135         my $addr = do { no overloading; pack 'J', $self; };
7136
7137         # If overridden, use that
7138         return $to_output_map{$addr} if defined $to_output_map{$addr};
7139
7140         my $full_name = $self->full_name;
7141         return $global_to_output_map{$full_name}
7142                                 if defined $global_to_output_map{$full_name};
7143
7144         # If table says to output, do so; if says to suppress it, do so.
7145         my $fate = $self->fate;
7146         return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
7147         return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
7148         return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
7149
7150         my $type = $self->property->type;
7151
7152         # Don't want to output binary map tables even for debugging.
7153         return 0 if $type == $BINARY;
7154
7155         # But do want to output string ones.  All the ones that remain to
7156         # be dealt with (i.e. which haven't explicitly been set to external)
7157         # are for internal Perl use only.  The default for those that map to
7158         # $CODE_POINT and haven't been restricted to a single element range
7159         # is to use the adjusted form.
7160         if ($type == $STRING) {
7161             return $INTERNAL_MAP if $self->range_size_1
7162                                     || $default_map{$addr} ne $CODE_POINT;
7163             return $OUTPUT_ADJUSTED;
7164         }
7165
7166         # Otherwise is an $ENUM, do output it, for Perl's purposes
7167         return $INTERNAL_MAP;
7168     }
7169
7170     sub inverse_list {
7171         # Returns a Range_List that is gaps of the current table.  That is,
7172         # the inversion
7173
7174         my $self = shift;
7175         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7176
7177         my $current = Range_List->new(Initialize => $self->_range_list,
7178                                 Owner => $self->property);
7179         return ~ $current;
7180     }
7181
7182     sub header {
7183         my $self = shift;
7184         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7185
7186         my $return = $self->SUPER::header();
7187
7188         if ($self->to_output_map >= $INTERNAL_MAP) {
7189             $return .= $INTERNAL_ONLY_HEADER;
7190         }
7191         else {
7192             my $property_name = $self->property->replacement_property;
7193
7194             # The legacy-only properties were gotten above; but there are some
7195             # other properties whose files are in current use that have fixed
7196             # formats.
7197             $property_name = $self->property->full_name unless $property_name;
7198
7199             $return .= <<END;
7200
7201 # !!!!!!!   IT IS DEPRECATED TO USE THIS FILE   !!!!!!!
7202
7203 # This file is for internal use by core Perl only.  It is retained for
7204 # backwards compatibility with applications that may have come to rely on it,
7205 # but its format and even its name or existence are subject to change without
7206 # notice in a future Perl version.  Don't use it directly.  Instead, its
7207 # contents are now retrievable through a stable API in the Unicode::UCD
7208 # module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
7209 # code points can be retrieved via Unicode::UCD::charprop());
7210 END
7211         }
7212         return $return;
7213     }
7214
7215     sub set_final_comment {
7216         # Just before output, create the comment that heads the file
7217         # containing this table.
7218
7219         return unless $debugging_build;
7220
7221         my $self = shift;
7222         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7223
7224         # No sense generating a comment if aren't going to write it out.
7225         return if ! $self->to_output_map;
7226
7227         my $addr = do { no overloading; pack 'J', $self; };
7228
7229         my $property = $self->property;
7230
7231         # Get all the possible names for this property.  Don't use any that
7232         # aren't ok for use in a file name, etc.  This is perhaps causing that
7233         # flag to do double duty, and may have to be changed in the future to
7234         # have our own flag for just this purpose; but it works now to exclude
7235         # Perl generated synonyms from the lists for properties, where the
7236         # name is always the proper Unicode one.
7237         my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7238
7239         my $count = $self->count;
7240         my $default_map = $default_map{$addr};
7241
7242         # The ranges that map to the default aren't output, so subtract that
7243         # to get those actually output.  A property with matching tables
7244         # already has the information calculated.
7245         if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7246             $count -= $property->table($default_map)->count;
7247         }
7248         elsif (defined $default_map) {
7249
7250             # But for $STRING properties, must calculate now.  Subtract the
7251             # count from each range that maps to the default.
7252             foreach my $range ($self->_range_list->ranges) {
7253                 if ($range->value eq $default_map) {
7254                     $count -= $range->end +1 - $range->start;
7255                 }
7256             }
7257
7258         }
7259
7260         # Get a  string version of $count with underscores in large numbers,
7261         # for clarity.
7262         my $string_count = main::clarify_code_point_count($count);
7263
7264         my $code_points = ($count == 1)
7265                         ? 'single code point'
7266                         : "$string_count code points";
7267
7268         my $mapping;
7269         my $these_mappings;
7270         my $are;
7271         if (@property_aliases <= 1) {
7272             $mapping = 'mapping';
7273             $these_mappings = 'this mapping';
7274             $are = 'is'
7275         }
7276         else {
7277             $mapping = 'synonymous mappings';
7278             $these_mappings = 'these mappings';
7279             $are = 'are'
7280         }
7281         my $cp;
7282         if ($count >= $MAX_UNICODE_CODEPOINTS) {
7283             $cp = "any code point in Unicode Version $string_version";
7284         }
7285         else {
7286             my $map_to;
7287             if ($default_map eq "") {
7288                 $map_to = 'the null string';
7289             }
7290             elsif ($default_map eq $CODE_POINT) {
7291                 $map_to = "itself";
7292             }
7293             else {
7294                 $map_to = "'$default_map'";
7295             }
7296             if ($count == 1) {
7297                 $cp = "the single code point";
7298             }
7299             else {
7300                 $cp = "one of the $code_points";
7301             }
7302             $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7303         }
7304
7305         my $comment = "";
7306
7307         my $status = $self->status;
7308         if ($status ne $NORMAL) {
7309             my $warn = uc $status_past_participles{$status};
7310             $comment .= <<END;
7311
7312 !!!!!!!   $warn !!!!!!!!!!!!!!!!!!!
7313  All property or property=value combinations contained in this file are $warn.
7314  See $unicode_reference_url for what this means.
7315
7316 END
7317         }
7318         $comment .= "This file returns the $mapping:\n";
7319
7320         my $ucd_accessible_name = "";
7321         my $has_underscore_name = 0;
7322         my $full_name = $self->property->full_name;
7323         for my $i (0 .. @property_aliases - 1) {
7324             my $name = $property_aliases[$i]->name;
7325             $has_underscore_name = 1 if $name =~ /^_/;
7326             $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7327             if ($property_aliases[$i]->ucd) {
7328                 if ($name eq $full_name) {
7329                     $ucd_accessible_name = $full_name;
7330                 }
7331                 elsif (! $ucd_accessible_name) {
7332                     $ucd_accessible_name = $name;
7333                 }
7334             }
7335         }
7336         $comment .= "\nwhere 'cp' is $cp.";
7337         if ($ucd_accessible_name) {
7338             $comment .= "  Note that $these_mappings";
7339             if ($has_underscore_name) {
7340                 $comment .= " (except for the one(s) that begin with an underscore)";
7341             }
7342             $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7343
7344         }
7345
7346         # And append any commentary already set from the actual property.
7347         $comment .= "\n\n" . $self->comment if $self->comment;
7348         if ($self->description) {
7349             $comment .= "\n\n" . join " ", $self->description;
7350         }
7351         if ($self->note) {
7352             $comment .= "\n\n" . join " ", $self->note;
7353         }
7354         $comment .= "\n";
7355
7356         if (! $self->perl_extension) {
7357             $comment .= <<END;
7358
7359 For information about what this property really means, see:
7360 $unicode_reference_url
7361 END
7362         }
7363
7364         if ($count) {        # Format differs for empty table
7365                 $comment.= "\nThe format of the ";
7366             if ($self->range_size_1) {
7367                 $comment.= <<END;
7368 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7369 is in hex; MAPPING is what CODE_POINT maps to.
7370 END
7371             }
7372             else {
7373
7374                 # There are tables which end up only having one element per
7375                 # range, but it is not worth keeping track of for making just
7376                 # this comment a little better.
7377                 $comment .= <<END;
7378 non-comment portions of the main body of lines of this file is:
7379 START\\tSTOP\\tMAPPING where START is the starting code point of the
7380 range, in hex; STOP is the ending point, or if omitted, the range has just one
7381 code point; MAPPING is what each code point between START and STOP maps to.
7382 END
7383                 if ($self->output_range_counts) {
7384                     $comment .= <<END;
7385 Numbers in comments in [brackets] indicate how many code points are in the
7386 range (omitted when the range is a single code point or if the mapping is to
7387 the null string).
7388 END
7389                 }
7390             }
7391         }
7392         $self->set_comment(main::join_lines($comment));
7393         return;
7394     }
7395
7396     my %swash_keys; # Makes sure don't duplicate swash names.
7397
7398     # The remaining variables are temporaries used while writing each table,
7399     # to output special ranges.
7400     my @multi_code_point_maps;  # Map is to more than one code point.
7401
7402     sub handle_special_range {
7403         # Called in the middle of write when it finds a range it doesn't know
7404         # how to handle.
7405
7406         my $self = shift;
7407         my $range = shift;
7408         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7409
7410         my $addr = do { no overloading; pack 'J', $self; };
7411
7412         my $type = $range->type;
7413
7414         my $low = $range->start;
7415         my $high = $range->end;
7416         my $map = $range->value;
7417
7418         # No need to output the range if it maps to the default.
7419         return if $map eq $default_map{$addr};
7420
7421         my $property = $self->property;
7422
7423         # Switch based on the map type...
7424         if ($type == $HANGUL_SYLLABLE) {
7425
7426             # These are entirely algorithmically determinable based on
7427             # some constants furnished by Unicode; for now, just set a
7428             # flag to indicate that have them.  After everything is figured
7429             # out, we will output the code that does the algorithm.  (Don't
7430             # output them if not needed because we are suppressing this
7431             # property.)
7432             $has_hangul_syllables = 1 if $property->to_output_map;
7433         }
7434         elsif ($type == $CP_IN_NAME) {
7435
7436             # Code points whose name ends in their code point are also
7437             # algorithmically determinable, but need information about the map
7438             # to do so.  Both the map and its inverse are stored in data
7439             # structures output in the file.  They are stored in the mean time
7440             # in global lists The lists will be written out later into Name.pm,
7441             # which is created only if needed.  In order to prevent duplicates
7442             # in the list, only add to them for one property, should multiple
7443             # ones need them.
7444             if ($needing_code_points_ending_in_code_point == 0) {
7445                 $needing_code_points_ending_in_code_point = $property;
7446             }
7447             if ($property == $needing_code_points_ending_in_code_point) {
7448                 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7449                 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7450
7451                 my $squeezed = $map =~ s/[-\s]+//gr;
7452                 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7453                                                                           $low;
7454                 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7455                                                                          $high;
7456
7457                 push @code_points_ending_in_code_point, { low => $low,
7458                                                         high => $high,
7459                                                         name => $map
7460                                                         };
7461             }
7462         }
7463         elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7464
7465             # Multi-code point maps and null string maps have an entry
7466             # for each code point in the range.  They use the same
7467             # output format.
7468             for my $code_point ($low .. $high) {
7469
7470                 # The pack() below can't cope with surrogates.  XXX This may
7471                 # no longer be true
7472                 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7473                     Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
7474                     next;
7475                 }
7476
7477                 # Generate the hash entries for these in the form that
7478                 # utf8.c understands.
7479                 my $tostr = "";
7480                 my $to_name = "";
7481                 my $to_chr = "";
7482                 foreach my $to (split " ", $map) {
7483                     if ($to !~ /^$code_point_re$/) {
7484                         Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self.  No map created");
7485                         next;
7486                     }
7487                     $tostr .= sprintf "\\x{%s}", $to;
7488                     $to = CORE::hex $to;
7489                     if ($annotate) {
7490                         $to_name .= " + " if $to_name;
7491                         $to_chr .= main::display_chr($to);
7492                         main::populate_char_info($to)
7493                                             if ! defined $viacode[$to];
7494                         $to_name .=  $viacode[$to];
7495                     }
7496                 }
7497
7498                 # The unpack yields a list of the bytes that comprise the
7499                 # UTF-8 of $code_point, which are each placed in \xZZ format
7500                 # and output in the %s to map to $tostr, so the result looks
7501                 # like:
7502                 # "\xC4\xB0" => "\x{0069}\x{0307}",
7503                 my $utf8 = sprintf(qq["%s" => "$tostr",],
7504                         join("", map { sprintf "\\x%02X", $_ }
7505                             unpack("U0C*", chr $code_point)));
7506
7507                 # Add a comment so that a human reader can more easily
7508                 # see what's going on.
7509                 push @multi_code_point_maps,
7510                         sprintf("%-45s # U+%04X", $utf8, $code_point);
7511                 if (! $annotate) {
7512                     $multi_code_point_maps[-1] .= " => $map";
7513                 }
7514                 else {
7515                     main::populate_char_info($code_point)
7516                                     if ! defined $viacode[$code_point];
7517                     $multi_code_point_maps[-1] .= " '"
7518                         . main::display_chr($code_point)
7519                         . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7520                 }
7521             }
7522         }
7523         else {
7524             Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self.  Not written");
7525         }
7526
7527         return;
7528     }
7529
7530     sub pre_body {
7531         # Returns the string that should be output in the file before the main
7532         # body of this table.  It isn't called until the main body is
7533         # calculated, saving a pass.  The string includes some hash entries
7534         # identifying the format of the body, and what the single value should
7535         # be for all ranges missing from it.  It also includes any code points
7536         # which have map_types that don't go in the main table.
7537
7538         my $self = shift;
7539         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7540
7541         my $addr = do { no overloading; pack 'J', $self; };
7542
7543         my $name = $self->property->swash_name;
7544
7545         # Currently there is nothing in the pre_body unless a swash is being
7546         # generated.
7547         return unless defined $name;
7548
7549         if (defined $swash_keys{$name}) {
7550             Carp::my_carp(main::join_lines(<<END
7551 Already created a swash name '$name' for $swash_keys{$name}.  This means that
7552 the same name desired for $self shouldn't be used.  Bad News.  This must be
7553 fixed before production use, but proceeding anyway
7554 END
7555             ));
7556         }
7557         $swash_keys{$name} = "$self";
7558
7559         my $pre_body = "";
7560
7561         # Here we assume we were called after have gone through the whole
7562         # file.  If we actually generated anything for each map type, add its
7563         # respective header and trailer
7564         my $specials_name = "";
7565         if (@multi_code_point_maps) {
7566             $specials_name = "utf8::ToSpec$name";
7567             $pre_body .= <<END;
7568
7569 # Some code points require special handling because their mappings are each to
7570 # multiple code points.  These do not appear in the main body, but are defined
7571 # in the hash below.
7572
7573 # Each key is the string of N bytes that together make up the UTF-8 encoding
7574 # for the code point.  (i.e. the same as looking at the code point's UTF-8
7575 # under "use bytes").  Each value is the UTF-8 of the translation, for speed.
7576 \%$specials_name = (
7577 END
7578             $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7579         }
7580
7581         my $format = $self->format;
7582
7583         my $return = "";
7584
7585         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7586         if ($output_adjusted) {
7587             if ($specials_name) {
7588                 $return .= <<END;
7589 # The mappings in the non-hash portion of this file must be modified to get the
7590 # correct values by adding the code point ordinal number to each one that is
7591 # numeric.
7592 END
7593             }
7594             else {
7595                 $return .= <<END;
7596 # The mappings must be modified to get the correct values by adding the code
7597 # point ordinal number to each one that is numeric.
7598 END
7599             }
7600         }
7601
7602         $return .= <<END;
7603
7604 # The name this swash is to be known by, with the format of the mappings in
7605 # the main body of the table, and what all code points missing from this file
7606 # map to.
7607 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7608 END
7609         if ($specials_name) {
7610             $return .= <<END;
7611 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7612 END
7613         }
7614         my $default_map = $default_map{$addr};
7615
7616         # For $CODE_POINT default maps and using adjustments, instead the default
7617         # becomes zero.
7618         $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
7619                 .  (($output_adjusted && $default_map eq $CODE_POINT)
7620                    ? "0"
7621                    : $default_map)
7622                 . "';";
7623
7624         if ($default_map eq $CODE_POINT) {
7625             $return .= ' # code point maps to itself';
7626         }
7627         elsif ($default_map eq "") {
7628             $return .= ' # code point maps to the null string';
7629         }
7630         $return .= "\n";
7631
7632         $return .= $pre_body;
7633
7634         return $return;
7635     }
7636
7637     sub write {
7638         # Write the table to the file.
7639
7640         my $self = shift;
7641         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7642
7643         my $addr = do { no overloading; pack 'J', $self; };
7644
7645         # Clear the temporaries
7646         undef @multi_code_point_maps;
7647
7648         # Calculate the format of the table if not already done.
7649         my $format = $self->format;
7650         my $type = $self->property->type;
7651         my $default_map = $self->default_map;
7652         if (! defined $format) {
7653             if ($type == $BINARY) {
7654
7655                 # Don't bother checking the values, because we elsewhere
7656                 # verify that a binary table has only 2 values.
7657                 $format = $BINARY_FORMAT;
7658             }
7659             else {
7660                 my @ranges = $self->_range_list->ranges;
7661
7662                 # default an empty table based on its type and default map
7663                 if (! @ranges) {
7664
7665                     # But it turns out that the only one we can say is a
7666                     # non-string (besides binary, handled above) is when the
7667                     # table is a string and the default map is to a code point
7668                     if ($type == $STRING && $default_map eq $CODE_POINT) {
7669                         $format = $HEX_FORMAT;
7670                     }
7671                     else {
7672                         $format = $STRING_FORMAT;
7673                     }
7674                 }
7675                 else {
7676
7677                     # Start with the most restrictive format, and as we find
7678                     # something that doesn't fit with that, change to the next
7679                     # most restrictive, and so on.
7680                     $format = $DECIMAL_FORMAT;
7681                     foreach my $range (@ranges) {
7682                         next if $range->type != 0;  # Non-normal ranges don't
7683                                                     # affect the main body
7684                         my $map = $range->value;
7685                         if ($map ne $default_map) {
7686                             last if $format eq $STRING_FORMAT;  # already at
7687                                                                 # least
7688                                                                 # restrictive
7689                             $format = $INTEGER_FORMAT
7690                                                 if $format eq $DECIMAL_FORMAT
7691                                                     && $map !~ / ^ [0-9] $ /x;
7692                             $format = $FLOAT_FORMAT
7693                                             if $format eq $INTEGER_FORMAT
7694                                                 && $map !~ / ^ -? [0-9]+ $ /x;
7695                             $format = $RATIONAL_FORMAT
7696                                 if $format eq $FLOAT_FORMAT
7697                                     && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7698                             $format = $HEX_FORMAT
7699                                 if ($format eq $RATIONAL_FORMAT
7700                                        && $map !~
7701                                            m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7702                                         # Assume a leading zero means hex,
7703                                         # even if all digits are 0-9
7704                                     || ($format eq $INTEGER_FORMAT
7705                                         && $map =~ /^0[0-9A-F]/);
7706                             $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7707                                                        && $map =~ /[^0-9A-F]/;
7708                         }
7709                     }
7710                 }
7711             }
7712         } # end of calculating format
7713
7714         if ($default_map eq $CODE_POINT
7715             && $format ne $HEX_FORMAT
7716             && ! defined $self->format)    # manual settings are always
7717                                            # considered ok
7718         {
7719             Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7720         }
7721
7722         # If the output is to be adjusted, the format of the table that gets
7723         # output is actually 'a' or 'ax' instead of whatever it is stored
7724         # internally as.
7725         my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7726         if ($output_adjusted) {
7727             if ($default_map eq $CODE_POINT) {
7728                 $format = $HEX_ADJUST_FORMAT;
7729             }
7730             else {
7731                 $format = $ADJUST_FORMAT;
7732             }
7733         }
7734
7735         $self->_set_format($format);
7736
7737         return $self->SUPER::write(
7738             $output_adjusted,
7739             $default_map);   # don't write defaulteds
7740     }
7741
7742     # Accessors for the underlying list that should fail if locked.
7743     for my $sub (qw(
7744                     add_duplicate
7745                     replace_map
7746                 ))
7747     {
7748         no strict "refs";
7749         *$sub = sub {
7750             use strict "refs";
7751             my $self = shift;
7752
7753             return if $self->carp_if_locked;
7754             return $self->_range_list->$sub(@_);
7755         }
7756     }
7757 } # End closure for Map_Table
7758
7759 package Match_Table;
7760 use parent '-norequire', '_Base_Table';
7761
7762 # A Match table is one which is a list of all the code points that have
7763 # the same property and property value, for use in \p{property=value}
7764 # constructs in regular expressions.  It adds very little data to the base
7765 # structure, but many methods, as these lists can be combined in many ways to
7766 # form new ones.
7767 # There are only a few concepts added:
7768 # 1) Equivalents and Relatedness.
7769 #    Two tables can match the identical code points, but have different names.
7770 #    This always happens when there is a perl single form extension
7771 #    \p{IsProperty} for the Unicode compound form \P{Property=True}.  The two
7772 #    tables are set to be related, with the Perl extension being a child, and
7773 #    the Unicode property being the parent.
7774 #
7775 #    It may be that two tables match the identical code points and we don't
7776 #    know if they are related or not.  This happens most frequently when the
7777 #    Block and Script properties have the exact range.  But note that a
7778 #    revision to Unicode could add new code points to the script, which would
7779 #    now have to be in a different block (as the block was filled, or there
7780 #    would have been 'Unknown' script code points in it and they wouldn't have
7781 #    been identical).  So we can't rely on any two properties from Unicode
7782 #    always matching the same code points from release to release, and thus
7783 #    these tables are considered coincidentally equivalent--not related.  When
7784 #    two tables are unrelated but equivalent, one is arbitrarily chosen as the
7785 #    'leader', and the others are 'equivalents'.  This concept is useful
7786 #    to minimize the number of tables written out.  Only one file is used for
7787 #    any identical set of code points, with entries in Heavy.pl mapping all
7788 #    the involved tables to it.
7789 #
7790 #    Related tables will always be identical; we set them up to be so.  Thus
7791 #    if the Unicode one is deprecated, the Perl one will be too.  Not so for
7792 #    unrelated tables.  Relatedness makes generating the documentation easier.
7793 #
7794 # 2) Complement.
7795 #    Like equivalents, two tables may be the inverses of each other, the
7796 #    intersection between them is null, and the union is every Unicode code
7797 #    point.  The two tables that occupy a binary property are necessarily like
7798 #    this.  By specifying one table as the complement of another, we can avoid
7799 #    storing it on disk (using the other table and performing a fast
7800 #    transform), and some memory and calculations.
7801 #
7802 # 3) Conflicting.  It may be that there will eventually be name clashes, with
7803 #    the same name meaning different things.  For a while, there actually were
7804 #    conflicts, but they have so far been resolved by changing Perl's or
7805 #    Unicode's definitions to match the other, but when this code was written,
7806 #    it wasn't clear that that was what was going to happen.  (Unicode changed
7807 #    because of protests during their beta period.)  Name clashes are warned
7808 #    about during compilation, and the documentation.  The generated tables
7809 #    are sane, free of name clashes, because the code suppresses the Perl
7810 #    version.  But manual intervention to decide what the actual behavior
7811 #    should be may be required should this happen.  The introductory comments
7812 #    have more to say about this.
7813 #
7814 # 4) Definition.  This is a string for human consumption that specifies the
7815 #    code points that this table matches.  This is used only for the generated
7816 #    pod file.  It may be specified explicitly, or automatically computed.
7817 #    Only the first portion of complicated definitions is computed and
7818 #    displayed.
7819
7820 sub standardize { return main::standardize($_[0]); }
7821 sub trace { return main::trace(@_); }
7822
7823
7824 { # Closure
7825
7826     main::setup_package();
7827
7828     my %leader;
7829     # The leader table of this one; initially $self.
7830     main::set_access('leader', \%leader, 'r');
7831
7832     my %equivalents;
7833     # An array of any tables that have this one as their leader
7834     main::set_access('equivalents', \%equivalents, 'readable_array');
7835
7836     my %parent;
7837     # The parent table to this one, initially $self.  This allows us to
7838     # distinguish between equivalent tables that are related (for which this
7839     # is set to), and those which may not be, but share the same output file
7840     # because they match the exact same set of code points in the current
7841     # Unicode release.
7842     main::set_access('parent', \%parent, 'r');
7843
7844     my %children;
7845     # An array of any tables that have this one as their parent
7846     main::set_access('children', \%children, 'readable_array');
7847
7848     my %conflicting;
7849     # Array of any tables that would have the same name as this one with
7850     # a different meaning.  This is used for the generated documentation.
7851     main::set_access('conflicting', \%conflicting, 'readable_array');
7852
7853     my %matches_all;
7854     # Set in the constructor for tables that are expected to match all code
7855     # points.
7856     main::set_access('matches_all', \%matches_all, 'r');
7857
7858     my %complement;
7859     # Points to the complement that this table is expressed in terms of; 0 if
7860     # none.
7861     main::set_access('complement', \%complement, 'r');
7862
7863     my %definition;
7864     # Human readable string of the first few ranges of code points matched by
7865     # this table
7866     main::set_access('definition', \%definition, 'r', 's');
7867
7868     sub new {
7869         my $class = shift;
7870
7871         my %args = @_;
7872
7873         # The property for which this table is a listing of property values.
7874         my $property = delete $args{'_Property'};
7875
7876         my $name = delete $args{'Name'};
7877         my $full_name = delete $args{'Full_Name'};
7878         $full_name = $name if ! defined $full_name;
7879
7880         # Optional
7881         my $initialize = delete $args{'Initialize'};
7882         my $matches_all = delete $args{'Matches_All'} || 0;
7883         my $format = delete $args{'Format'};
7884         my $definition = delete $args{'Definition'} // "";
7885         # Rest of parameters passed on.
7886
7887         my $range_list = Range_List->new(Initialize => $initialize,
7888                                          Owner => $property);
7889
7890         my $complete = $full_name;
7891         $complete = '""' if $complete eq "";  # A null name shouldn't happen,
7892                                               # but this helps debug if it
7893                                               # does
7894         # The complete name for a match table includes it's property in a
7895         # compound form 'property=table', except if the property is the
7896         # pseudo-property, perl, in which case it is just the single form,
7897         # 'table' (If you change the '=' must also change the ':' in lots of
7898         # places in this program that assume an equal sign)
7899         $complete = $property->full_name . "=$complete" if $property != $perl;
7900
7901         my $self = $class->SUPER::new(%args,
7902                                       Name => $name,
7903                                       Complete_Name => $complete,
7904                                       Full_Name => $full_name,
7905                                       _Property => $property,
7906                                       _Range_List => $range_list,
7907                                       Format => $EMPTY_FORMAT,
7908                                       Write_As_Invlist => 1,
7909                                       );
7910         my $addr = do { no overloading; pack 'J', $self; };
7911
7912         $conflicting{$addr} = [ ];
7913         $equivalents{$addr} = [ ];
7914         $children{$addr} = [ ];
7915         $matches_all{$addr} = $matches_all;
7916         $leader{$addr} = $self;
7917         $parent{$addr} = $self;
7918         $complement{$addr} = 0;
7919         $definition{$addr} = $definition;
7920
7921         if (defined $format && $format ne $EMPTY_FORMAT) {
7922             Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
7923         }
7924
7925         return $self;
7926     }
7927
7928     # See this program's beginning comment block about overloading these.
7929     use overload
7930         fallback => 0,
7931         qw("") => "_operator_stringify",
7932         '=' => sub {
7933                     my $self = shift;
7934
7935                     return if $self->carp_if_locked;
7936                     return $self;
7937                 },
7938
7939         '+' => sub {
7940                         my $self = shift;
7941                         my $other = shift;
7942
7943                         return $self->_range_list + $other;
7944                     },
7945         '&' => sub {
7946                         my $self = shift;
7947                         my $other = shift;
7948
7949                         return $self->_range_list & $other;
7950                     },
7951         '+=' => sub {
7952                         my $self = shift;
7953                         my $other = shift;
7954                         my $reversed = shift;
7955
7956                         if ($reversed) {
7957                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7958                             . ref($other)
7959                             . ' += '
7960                             . ref($self)
7961                             . "'.  undef returned.");
7962                             return;
7963                         }
7964
7965                         return if $self->carp_if_locked;
7966
7967                         my $addr = do { no overloading; pack 'J', $self; };
7968
7969                         if (ref $other) {
7970
7971                             # Change the range list of this table to be the
7972                             # union of the two.
7973                             $self->_set_range_list($self->_range_list
7974                                                     + $other);
7975                         }
7976                         else {    # $other is just a simple value
7977                             $self->add_range($other, $other);
7978                         }
7979                         return $self;
7980                     },
7981         '&=' => sub {
7982                         my $self = shift;
7983                         my $other = shift;
7984                         my $reversed = shift;
7985
7986                         if ($reversed) {
7987                             Carp::my_carp_bug("Bad news.  Can't cope with '"
7988                             . ref($other)
7989                             . ' &= '
7990                             . ref($self)
7991                             . "'.  undef returned.");
7992                             return;
7993                         }
7994
7995                         return if $self->carp_if_locked;
7996                         $self->_set_range_list($self->_range_list & $other);
7997                         return $self;
7998                     },
7999         '-' => sub { my $self = shift;
8000                     my $other = shift;
8001                     my $reversed = shift;
8002                     if ($reversed) {
8003                         Carp::my_carp_bug("Bad news.  Can't cope with '"
8004                         . ref($other)
8005                         . ' - '
8006                         . ref($self)
8007                         . "'.  undef returned.");
8008                         return;
8009                     }
8010
8011                     return $self->_range_list - $other;
8012                 },
8013         '~' => sub { my $self = shift;
8014                     return ~ $self->_range_list;
8015                 },
8016     ;
8017
8018     sub _operator_stringify {
8019         my $self = shift;
8020
8021         my $name = $self->complete_name;
8022         return "Table '$name'";
8023     }
8024
8025     sub _range_list {
8026         # Returns the range list associated with this table, which will be the
8027         # complement's if it has one.
8028
8029         my $self = shift;
8030         my $complement = $self->complement;
8031
8032         # In order to avoid re-complementing on each access, only do the
8033         # complement the first time, and store the result in this table's
8034         # range list to use henceforth.  However, this wouldn't work if the
8035         # controlling (complement) table changed after we do this, so lock it.
8036         # Currently, the value of the complement isn't needed until after it
8037         # is fully constructed, so this works.  If this were to change, the
8038         # each_range iteration functionality would no longer work on this
8039         # complement.
8040         if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
8041             $self->_set_range_list($self->SUPER::_range_list
8042                                 + ~ $complement->_range_list);
8043             $complement->lock;
8044         }
8045
8046         return $self->SUPER::_range_list;
8047     }
8048
8049     sub add_alias {
8050         # Add a synonym for this table.  See the comments in the base class
8051
8052         my $self = shift;
8053         my $name = shift;
8054         # Rest of parameters passed on.
8055
8056         $self->SUPER::add_alias($name, $self, @_);
8057         return;
8058     }
8059
8060     sub add_conflicting {
8061         # Add the name of some other object to the list of ones that name
8062         # clash with this match table.
8063
8064         my $self = shift;
8065         my $conflicting_name = shift;   # The name of the conflicting object
8066         my $p = shift || 'p';           # Optional, is this a \p{} or \P{} ?
8067         my $conflicting_object = shift; # Optional, the conflicting object
8068                                         # itself.  This is used to
8069                                         # disambiguate the text if the input
8070                                         # name is identical to any of the
8071                                         # aliases $self is known by.
8072                                         # Sometimes the conflicting object is
8073                                         # merely hypothetical, so this has to
8074                                         # be an optional parameter.
8075         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8076
8077         my $addr = do { no overloading; pack 'J', $self; };
8078
8079         # Check if the conflicting name is exactly the same as any existing
8080         # alias in this table (as long as there is a real object there to
8081         # disambiguate with).
8082         if (defined $conflicting_object) {
8083             foreach my $alias ($self->aliases) {
8084                 if (standardize($alias->name) eq standardize($conflicting_name)) {
8085
8086                     # Here, there is an exact match.  This results in
8087                     # ambiguous comments, so disambiguate by changing the
8088                     # conflicting name to its object's complete equivalent.
8089                     $conflicting_name = $conflicting_object->complete_name;
8090                     last;
8091                 }
8092             }
8093         }
8094
8095         # Convert to the \p{...} final name
8096         $conflicting_name = "\\$p" . "{$conflicting_name}";
8097
8098         # Only add once
8099         return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
8100
8101         push @{$conflicting{$addr}}, $conflicting_name;
8102
8103         return;
8104     }
8105
8106     sub is_set_equivalent_to {
8107         # Return boolean of whether or not the other object is a table of this
8108         # type and has been marked equivalent to this one.
8109
8110         my $self = shift;
8111         my $other = shift;
8112         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8113
8114         return 0 if ! defined $other; # Can happen for incomplete early
8115                                       # releases
8116         unless ($other->isa(__PACKAGE__)) {
8117             my $ref_other = ref $other;
8118             my $ref_self = ref $self;
8119             Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'.  $other not set equivalent to $self.");
8120             return 0;
8121         }
8122
8123         # Two tables are equivalent if they have the same leader.
8124         no overloading;
8125         return $leader{pack 'J', $self} == $leader{pack 'J', $other};
8126         return;
8127     }
8128
8129     sub set_equivalent_to {
8130         # Set $self equivalent to the parameter table.
8131         # The required Related => 'x' parameter is a boolean indicating
8132         # whether these tables are related or not.  If related, $other becomes
8133         # the 'parent' of $self; if unrelated it becomes the 'leader'
8134         #
8135         # Related tables share all characteristics except names; equivalents
8136         # not quite so many.
8137         # If they are related, one must be a perl extension.  This is because
8138         # we can't guarantee that Unicode won't change one or the other in a
8139         # later release even if they are identical now.
8140
8141         my $self = shift;
8142         my $other = shift;
8143
8144         my %args = @_;
8145         my $related = delete $args{'Related'};
8146
8147         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8148
8149         return if ! defined $other;     # Keep on going; happens in some early
8150                                         # Unicode releases.
8151
8152         if (! defined $related) {
8153             Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter.  Assuming $self is not related to $other");
8154             $related = 0;
8155         }
8156
8157         # If already are equivalent, no need to re-do it;  if subroutine
8158         # returns null, it found an error, also do nothing
8159         my $are_equivalent = $self->is_set_equivalent_to($other);
8160         return if ! defined $are_equivalent || $are_equivalent;
8161
8162         my $addr = do { no overloading; pack 'J', $self; };
8163         my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
8164
8165         if ($related) {
8166             if ($current_leader->perl_extension) {
8167                 if ($other->perl_extension) {
8168                     Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
8169                     return;
8170                 }
8171             } elsif ($self->property != $other->property    # Depending on
8172                                                             # situation, might
8173                                                             # be better to use
8174                                                             # add_alias()
8175                                                             # instead for same
8176                                                             # property
8177                      && ! $other->perl_extension
8178
8179                          # We allow the sc and scx properties to be marked as
8180                          # related.  They are in fact related, and this allows
8181                          # the pod to show that better.  This test isn't valid
8182                          # if this is an early Unicode release without the scx
8183                          # property (having that also implies the sc property
8184                          # exists, so don't have to test for no 'sc')
8185                      && (   ! defined $scx
8186                          && ! (   (   $self->property == $script
8187                                    || $self->property == $scx)
8188                                && (   $self->property == $script
8189                                    || $self->property == $scx))))
8190             {
8191                 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
8192                 $related = 0;
8193             }
8194         }
8195
8196         if (! $self->is_empty && ! $self->matches_identically_to($other)) {
8197             Carp::my_carp_bug("$self should be empty or match identically to $other.  Not setting equivalent");
8198             return;
8199         }
8200
8201         my $leader = do { no overloading; pack 'J', $current_leader; };
8202         my $other_addr = do { no overloading; pack 'J', $other; };
8203
8204         # Any tables that are equivalent to or children of this table must now
8205         # instead be equivalent to or (children) to the new leader (parent),
8206         # still equivalent.  The equivalency includes their matches_all info,
8207         # and for related tables, their fate and status.
8208         # All related tables are of necessity equivalent, but the converse
8209         # isn't necessarily true
8210         my $status = $other->status;
8211         my $status_info = $other->status_info;
8212         my $fate = $other->fate;
8213         my $matches_all = $matches_all{other_addr};
8214         my $caseless_equivalent = $other->caseless_equivalent;
8215         foreach my $table ($current_leader, @{$equivalents{$leader}}) {
8216             next if $table == $other;
8217             trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
8218
8219             my $table_addr = do { no overloading; pack 'J', $table; };
8220             $leader{$table_addr} = $other;
8221             $matches_all{$table_addr} = $matches_all;
8222             $self->_set_range_list($other->_range_list);
8223             push @{$equivalents{$other_addr}}, $table;
8224             if ($related) {
8225                 $parent{$table_addr} = $other;
8226                 push @{$children{$other_addr}}, $table;
8227                 $table->set_status($status, $status_info);
8228
8229                 # This reason currently doesn't get exposed outside; otherwise
8230                 # would have to look up the parent's reason and use it instead.
8231                 $table->set_fate($fate, "Parent's fate");
8232
8233                 $self->set_caseless_equivalent($caseless_equivalent);
8234             }
8235         }
8236
8237         # Now that we've declared these to be equivalent, any changes to one
8238         # of the tables would invalidate that equivalency.
8239         $self->lock;
8240         $other->lock;
8241         return;
8242     }
8243
8244     sub set_complement {
8245         # Set $self to be the complement of the parameter table.  $self is
8246         # locked, as what it contains should all come from the other table.
8247
8248         my $self = shift;
8249         my $other = shift;
8250
8251         my %args = @_;
8252         Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8253
8254         if ($other->complement != 0) {
8255             Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8256             return;
8257         }
8258         my $addr = do { no overloading; pack 'J', $self; };
8259         $complement{$addr} = $other;
8260
8261         # Be sure the other property knows we are depending on them; or the
8262         # other table if it is one in the current property.
8263         if ($self->property != $other->property) {
8264             $other->property->set_has_dependency(1);
8265         }
8266         else {
8267             $other->set_has_dependency(1);
8268         }
8269         $self->lock;
8270         return;
8271     }
8272
8273     sub add_range { # Add a range to the list for this table.
8274         my $self = shift;
8275         # Rest of parameters passed on
8276
8277         return if $self->carp_if_locked;
8278         return $self->_range_list->add_range(@_);
8279     }
8280
8281     sub header {
8282         my $self = shift;
8283         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8284
8285         # All match tables are to be used only by the Perl core.
8286         return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8287     }
8288
8289     sub pre_body {  # Does nothing for match tables.
8290         return
8291     }
8292
8293     sub append_to_body {  # Does nothing for match tables.
8294         return
8295     }
8296
8297     sub set_fate {
8298         my $self = shift;
8299         my $fate = shift;
8300         my $reason = shift;
8301         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8302
8303         $self->SUPER::set_fate($fate, $reason);
8304
8305         # All children share this fate
8306         foreach my $child ($self->children) {
8307             $child->set_fate($fate, $reason);
8308         }
8309         return;
8310     }
8311
8312     sub calculate_table_definition
8313     {
8314         # Returns a human-readable string showing some or all of the code
8315         # points matched by this table.  The string will include a
8316         # bracketed-character class for all characters matched in the 00-FF
8317         # range, and the first few ranges matched beyond that.
8318         my $max_ranges = 6;
8319
8320         my $self = shift;
8321         my $definition = $self->definition || "";
8322
8323         # Skip this if already have a definition.
8324         return $definition if $definition;
8325
8326         my $lows_string = "";   # The string representation of the 0-FF
8327                                 # characters
8328         my $string_range = "";  # The string rep. of the above FF ranges
8329         my $range_count = 0;    # How many ranges in $string_rage
8330
8331         my @lows_invlist;       # The inversion list of the 0-FF code points
8332         my $first_non_control = ord(" ");   # Everything below this is a
8333                                             # control, on ASCII or EBCDIC
8334         my $max_table_code_point = $self->max;
8335
8336         # On ASCII platforms, the range 80-FF contains no printables.
8337         my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8338
8339
8340         # Look through the first few ranges matched by this table.
8341         $self->reset_each_range;    # Defensive programming
8342         while (defined (my $range = $self->each_range())) {
8343             my $start = $range->start;
8344             my $end = $range->end;
8345
8346             # Accumulate an inversion list of the 00-FF code points
8347             if ($start < 256 && ($start > 0 || $end < 256)) {
8348                 push @lows_invlist, $start;
8349                 push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8350
8351                 # Get next range if there are more ranges below 256
8352                 next if $end < 256 && $end < $max_table_code_point;
8353
8354                 # If the range straddles the 255/256 boundary, we split it
8355                 # there.  We already added above the low portion to the
8356                 # inversion list
8357                 $start = 256 if $end > 256;
8358             }
8359
8360             # Here, @lows_invlist contains the code points below 256, and
8361             # there is no other range, or the current one starts at or above
8362             # 256.  Generate the [char class] for the 0-255 ones.
8363             while (@lows_invlist) {
8364
8365                 # If this range (necessarily the first one, by the way) starts
8366                 # at 0 ...
8367                 if ($lows_invlist[0] == 0) {
8368
8369                     # If it ends within the block of controls, that means that
8370                     # some controls are in it and some aren't.  Since Unicode
8371                     # properties pretty much only know about a few of the
8372                     # controls, like \n, \t, this means that its one of them
8373                     # that isn't in the range.  Complement the inversion list
8374                     # which will likely cause these to be output using their
8375                     # mnemonics, hence being clearer.
8376                     if ($lows_invlist[1] < $first_non_control) {
8377                         $lows_string .= '^';
8378                         shift @lows_invlist;
8379                         push @lows_invlist, 256;
8380                     }
8381                     elsif ($lows_invlist[1] <= $highest_printable) {
8382
8383                         # Here, it extends into the printables block.  Split
8384                         # into two ranges so that the controls are separate.
8385                         $lows_string .= sprintf "\\x00-\\x%02x",
8386                                                     $first_non_control - 1;
8387                         $lows_invlist[0] = $first_non_control;
8388                     }
8389                 }
8390
8391                 # If the range completely contains the printables, don't
8392                 # individually spell out the printables.
8393                 if (    $lows_invlist[0] <= $first_non_control
8394                     && $lows_invlist[1] > $highest_printable)
8395                 {
8396                     $lows_string .= sprintf "\\x%02x-\\x%02x",
8397                                         $lows_invlist[0], $lows_invlist[1] - 1;
8398                     shift @lows_invlist;
8399                     shift @lows_invlist;
8400                     next;
8401                 }
8402
8403                 # Here, the range may include some but not all printables.
8404                 # Look at each one individually
8405                 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8406                     my $char = chr $ord;
8407
8408                     # If there is already something in the list, an
8409                     # alphanumeric char could be the next in sequence.  If so,
8410                     # we start or extend a range.  That is, we could have so
8411                     # far something like 'a-c', and the next char is a 'd', so
8412                     # we change it to 'a-d'.  We use native_to_unicode()
8413                     # because a-z on EBCDIC means 26 chars, and excludes the
8414                     # gap ones.
8415                     if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8416                         my $prev = substr($lows_string, -1);
8417                         if (   $prev !~ /[[:alnum:]]/
8418                             ||   utf8::native_to_unicode(ord $prev) + 1
8419                               != utf8::native_to_unicode(ord $char))
8420                         {
8421                             # Not extending the range
8422                             $lows_string .= $char;
8423                         }
8424                         elsif (   length $lows_string > 1
8425                                && substr($lows_string, -2, 1) eq '-')
8426                         {
8427                             # We had a sequence like '-c' and the current
8428                             # character is 'd'.  Extend the range.
8429                             substr($lows_string, -1, 1) = $char;
8430                         }
8431                         else {
8432                             # We had something like 'd' and this is 'e'.
8433                             # Start a range.
8434                             $lows_string .= "-$char";
8435                         }
8436                     }
8437                     elsif ($char =~ /[[:graph:]]/) {
8438
8439                         # We output a graphic char as-is, preceded by a
8440                         # backslash if it is a metacharacter
8441                         $lows_string .= '\\'
8442                                 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8443                         $lows_string .= $char;
8444                     } # Otherwise use mnemonic for any that have them
8445                     elsif ($char =~ /[\a]/) {
8446                         $lows_string .= '\a';
8447                     }
8448                     elsif ($char =~ /[\b]/) {
8449                         $lows_string .= '\b';
8450                     }
8451                     elsif ($char eq "\e") {
8452                         $lows_string .= '\e';
8453                     }
8454                     elsif ($char eq "\f") {
8455                         $lows_string .= '\f';
8456                     }
8457                     elsif ($char eq "\cK") {
8458                         $lows_string .= '\cK';
8459                     }
8460                     elsif ($char eq "\n") {
8461                         $lows_string .= '\n';
8462                     }
8463                     elsif ($char eq "\r") {
8464                         $lows_string .= '\r';
8465                     }
8466                     elsif ($char eq "\t") {
8467                         $lows_string .= '\t';
8468                     }
8469                     else {
8470
8471                         # Here is a non-graphic without a mnemonic.  We use \x
8472                         # notation.  But if the ordinal of this is one above
8473                         # the previous, create or extend the range
8474                         my $hex_representation = sprintf("%02x", ord $char);
8475                         if (   length $lows_string >= 4
8476                             && substr($lows_string, -4, 2) eq '\\x'
8477                             && hex(substr($lows_string, -2)) + 1 == ord $char)
8478                         {
8479                             if (       length $lows_string >= 5
8480                                 &&     substr($lows_string, -5, 1) eq '-'
8481                                 && (   length $lows_string == 5
8482                                     || substr($lows_string, -6, 1) ne '\\'))
8483                             {
8484                                 substr($lows_string, -2) = $hex_representation;
8485                             }
8486                             else {
8487                                 $lows_string .= '-\\x' . $hex_representation;
8488                             }
8489                         }
8490                         else {
8491                             $lows_string .= '\\x' . $hex_representation;
8492                         }
8493                     }
8494                 }
8495             }
8496
8497             # Done with assembling the string of all lows.  If there are only
8498             # lows in the property, are completely done.
8499             if ($max_table_code_point < 256) {
8500                 $self->reset_each_range;
8501                 last;
8502             }
8503
8504             # Otherwise, quit if reached max number of non-lows ranges.  If
8505             # there are lows, count them as one unit towards the maximum.
8506             $range_count++;
8507             if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8508                 $string_range .= " ...";
8509                 $self->reset_each_range;
8510                 last;
8511             }
8512
8513             # Otherwise add this range.
8514             $string_range .= ", " if $string_range ne "";
8515             if ($start == $end) {
8516                 $string_range .= sprintf("U+%04X", $start);
8517             }
8518             elsif ($end >= $MAX_WORKING_CODEPOINT)  {
8519                 $string_range .= sprintf("U+%04X..infinity", $start);
8520             }
8521             else  {
8522                 $string_range .= sprintf("U+%04X..%04X",
8523                                         $start, $end);
8524             }
8525         }
8526
8527         # Done with all the ranges we're going to look at.  Assemble the
8528         # definition from the lows + non-lows.
8529
8530         if ($lows_string ne "" || $string_range ne "") {
8531             if ($lows_string ne "") {
8532                 $definition .= "[$lows_string]";
8533                 $definition .= ", " if $string_range;
8534             }
8535             $definition .= $string_range;
8536         }
8537
8538         return $definition;
8539     }
8540
8541     sub write {
8542         my $self = shift;
8543         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8544
8545         return $self->SUPER::write(0); # No adjustments
8546     }
8547
8548     sub set_final_comment {
8549         # This creates a comment for the file that is to hold the match table
8550         # $self.  It is somewhat convoluted to make the English read nicely,
8551         # but, heh, it's just a comment.
8552         # This should be called only with the leader match table of all the
8553         # ones that share the same file.  It lists all such tables, ordered so
8554         # that related ones are together.
8555
8556         return unless $debugging_build;
8557
8558         my $leader = shift;   # Should only be called on the leader table of
8559                               # an equivalent group
8560         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8561
8562         my $addr = do { no overloading; pack 'J', $leader; };
8563
8564         if ($leader{$addr} != $leader) {
8565             Carp::my_carp_bug(<<END
8566 set_final_comment() must be called on a leader table, which $leader is not.
8567 It is equivalent to $leader{$addr}.  No comment created
8568 END
8569             );
8570             return;
8571         }
8572
8573         # Get the number of code points matched by each of the tables in this
8574         # file, and add underscores for clarity.
8575         my $count = $leader->count;
8576         my $unicode_count;
8577         my $non_unicode_string;
8578         if ($count > $MAX_UNICODE_CODEPOINTS) {
8579             $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8580                                        - $MAX_UNICODE_CODEPOINT);
8581             $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8582         }
8583         else {
8584             $unicode_count = $count;
8585             $non_unicode_string = "";
8586         }
8587         my $string_count = main::clarify_code_point_count($unicode_count);
8588
8589         my $loose_count = 0;        # how many aliases loosely matched
8590         my $compound_name = "";     # ? Are any names compound?, and if so, an
8591                                     # example
8592         my $properties_with_compound_names = 0;    # count of these
8593
8594
8595         my %flags;              # The status flags used in the file
8596         my $total_entries = 0;  # number of entries written in the comment
8597         my $matches_comment = ""; # The portion of the comment about the
8598                                   # \p{}'s
8599         my @global_comments;    # List of all the tables' comments that are
8600                                 # there before this routine was called.
8601         my $has_ucd_alias = 0;  # If there is an alias that is accessible via
8602                                 # Unicode::UCD.  If not, then don't say it is
8603                                 # in the comment
8604
8605         # Get list of all the parent tables that are equivalent to this one
8606         # (including itself).
8607         my @parents = grep { $parent{main::objaddr $_} == $_ }
8608                             main::uniques($leader, @{$equivalents{$addr}});
8609         my $has_unrelated = (@parents >= 2);  # boolean, ? are there unrelated
8610                                               # tables
8611         for my $parent (@parents) {
8612
8613             my $property = $parent->property;
8614
8615             # Special case 'N' tables in properties with two match tables when
8616             # the other is a 'Y' one.  These are likely to be binary tables,
8617             # but not necessarily.  In either case, \P{} will match the
8618             # complement of \p{}, and so if something is a synonym of \p, the
8619             # complement of that something will be the synonym of \P.  This
8620             # would be true of any property with just two match tables, not
8621             # just those whose values are Y and N; but that would require a
8622             # little extra work, and there are none such so far in Unicode.
8623             my $perl_p = 'p';        # which is it?  \p{} or \P{}
8624             my @yes_perl_synonyms;   # list of any synonyms for the 'Y' table
8625
8626             if (scalar $property->tables == 2
8627                 && $parent == $property->table('N')
8628                 && defined (my $yes = $property->table('Y')))
8629             {
8630                 my $yes_addr = do { no overloading; pack 'J', $yes; };
8631                 @yes_perl_synonyms
8632                     = grep { $_->property == $perl }
8633                                     main::uniques($yes,
8634                                                 $parent{$yes_addr},
8635                                                 $parent{$yes_addr}->children);
8636
8637                 # But these synonyms are \P{} ,not \p{}
8638                 $perl_p = 'P';
8639             }
8640
8641             my @description;        # Will hold the table description
8642             my @note;               # Will hold the table notes.
8643             my @conflicting;        # Will hold the table conflicts.
8644
8645             # Look at the parent, any yes synonyms, and all the children
8646             my $parent_addr = do { no overloading; pack 'J', $parent; };
8647             for my $table ($parent,
8648                            @yes_perl_synonyms,
8649                            @{$children{$parent_addr}})
8650             {
8651                 my $table_addr = do { no overloading; pack 'J', $table; };
8652                 my $table_property = $table->property;
8653
8654                 # Tables are separated by a blank line to create a grouping.
8655                 $matches_comment .= "\n" if $matches_comment;
8656
8657                 # The table is named based on the property and value
8658                 # combination it is for, like script=greek.  But there may be
8659                 # a number of synonyms for each side, like 'sc' for 'script',
8660                 # and 'grek' for 'greek'.  Any combination of these is a valid
8661                 # name for this table.  In this case, there are three more,
8662                 # 'sc=grek', 'sc=greek', and 'script='grek'.  Rather than
8663                 # listing all possible combinations in the comment, we make
8664                 # sure that each synonym occurs at least once, and add
8665                 # commentary that the other combinations are possible.
8666                 # Because regular expressions don't recognize things like
8667                 # \p{jsn=}, only look at non-null right-hand-sides
8668                 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8669                 my @table_aliases = grep { $_->name ne "" } $table->aliases;
8670
8671                 # The alias lists above are already ordered in the order we
8672                 # want to output them.  To ensure that each synonym is listed,
8673                 # we must use the max of the two numbers.  But if there are no
8674                 # legal synonyms (nothing in @table_aliases), then we don't
8675                 # list anything.
8676                 my $listed_combos = (@table_aliases)
8677                                     ?  main::max(scalar @table_aliases,
8678                                                  scalar @property_aliases)
8679                                     : 0;
8680                 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8681
8682                 my $property_had_compound_name = 0;
8683
8684                 for my $i (0 .. $listed_combos - 1) {
8685                     $total_entries++;
8686
8687                     # The current alias for the property is the next one on
8688                     # the list, or if beyond the end, start over.  Similarly
8689                     # for the table (\p{prop=table})
8690                     my $property_alias = $property_aliases
8691                                             [$i % @property_aliases]->name;
8692                     my $table_alias_object = $table_aliases
8693                                                         [$i % @table_aliases];
8694                     my $table_alias = $table_alias_object->name;
8695                     my $loose_match = $table_alias_object->loose_match;
8696                     $has_ucd_alias |= $table_alias_object->ucd;
8697
8698                     if ($table_alias !~ /\D/) { # Clarify large numbers.
8699                         $table_alias = main::clarify_number($table_alias)
8700                     }
8701
8702                     # Add a comment for this alias combination
8703                     my $current_match_comment;
8704                     if ($table_property == $perl) {
8705                         $current_match_comment = "\\$perl_p"
8706                                                     . "{$table_alias}";
8707                     }
8708                     else {
8709                         $current_match_comment
8710                                         = "\\p{$property_alias=$table_alias}";
8711                         $property_had_compound_name = 1;
8712                     }
8713
8714                     # Flag any abnormal status for this table.
8715                     my $flag = $property->status
8716                                 || $table->status
8717                                 || $table_alias_object->status;
8718                     if ($flag && $flag ne $PLACEHOLDER) {
8719                         $flags{$flag} = $status_past_participles{$flag};
8720                     }
8721
8722                     $loose_count++;
8723
8724                     # Pretty up the comment.  Note the \b; it says don't make
8725                     # this line a continuation.
8726                     $matches_comment .= sprintf("\b%-1s%-s%s\n",
8727                                         $flag,
8728                                         " " x 7,
8729                                         $current_match_comment);
8730                 } # End of generating the entries for this table.
8731
8732                 # Save these for output after this group of related tables.
8733                 push @description, $table->description;
8734                 push @note, $table->note;
8735                 push @conflicting, $table->conflicting;
8736
8737                 # And this for output after all the tables.
8738                 push @global_comments, $table->comment;
8739
8740                 # Compute an alternate compound name using the final property
8741                 # synonym and the first table synonym with a colon instead of
8742                 # the equal sign used elsewhere.
8743                 if ($property_had_compound_name) {
8744                     $properties_with_compound_names ++;
8745                     if (! $compound_name || @property_aliases > 1) {
8746                         $compound_name = $property_aliases[-1]->name
8747                                         . ': '
8748                                         . $table_aliases[0]->name;
8749                     }
8750                 }
8751             } # End of looping through all children of this table
8752
8753             # Here have assembled in $matches_comment all the related tables
8754             # to the current parent (preceded by the same info for all the
8755             # previous parents).  Put out information that applies to all of
8756             # the current family.
8757             if (@conflicting) {
8758
8759                 # But output the conflicting information now, as it applies to
8760                 # just this table.
8761                 my $conflicting = join ", ", @conflicting;
8762                 if ($conflicting) {
8763                     $matches_comment .= <<END;
8764
8765     Note that contrary to what you might expect, the above is NOT the same as
8766 END
8767                     $matches_comment .= "any of: " if @conflicting > 1;
8768                     $matches_comment .= "$conflicting\n";
8769                 }
8770             }
8771             if (@description) {
8772                 $matches_comment .= "\n    Meaning: "
8773                                     . join('; ', @description)
8774                                     . "\n";
8775             }
8776             if (@note) {
8777                 $matches_comment .= "\n    Note: "
8778                                     . join("\n    ", @note)
8779                                     . "\n";
8780             }
8781         } # End of looping through all tables
8782
8783         $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8784
8785
8786         my $code_points;
8787         my $match;
8788         my $any_of_these;
8789         if ($unicode_count == 1) {
8790             $match = 'matches';
8791             $code_points = 'single code point';
8792         }
8793         else {
8794             $match = 'match';
8795             $code_points = "$string_count code points";
8796         }
8797
8798         my $synonyms;
8799         my $entries;
8800         if ($total_entries == 1) {
8801             $synonyms = "";
8802             $entries = 'entry';
8803             $any_of_these = 'this'
8804         }
8805         else {
8806             $synonyms = " any of the following regular expression constructs";
8807             $entries = 'entries';
8808             $any_of_these = 'any of these'
8809         }
8810
8811         my $comment = "";
8812         if ($has_ucd_alias) {
8813             $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8814         }
8815         if ($has_unrelated) {
8816             $comment .= <<END;
8817 This file is for tables that are not necessarily related:  To conserve
8818 resources, every table that matches the identical set of code points in this
8819 version of Unicode uses this file.  Each one is listed in a separate group
8820 below.  It could be that the tables will match the same set of code points in
8821 other Unicode releases, or it could be purely coincidence that they happen to
8822 be the same in Unicode $unicode_version, and hence may not in other versions.
8823
8824 END
8825         }
8826
8827         if (%flags) {
8828             foreach my $flag (sort keys %flags) {
8829                 $comment .= <<END;
8830 '$flag' below means that this form is $flags{$flag}.
8831 END
8832                 if ($flag eq $INTERNAL_ALIAS) {
8833                     $comment .= "DO NOT USE!!!";
8834                 }
8835                 else {
8836                     $comment .= "Consult $pod_file.pod";
8837                 }
8838                 $comment .= "\n";
8839             }
8840             $comment .= "\n";
8841         }
8842
8843         if ($total_entries == 0) {
8844             Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string.  Creating file anyway.");
8845             $comment .= <<END;
8846 This file returns the $code_points in Unicode Version
8847 $unicode_version for
8848 $leader, but it is inaccessible through Perl regular expressions, as
8849 "\\p{prop=}" is not recognized.
8850 END
8851
8852         } else {
8853             $comment .= <<END;
8854 This file returns the $code_points in Unicode Version
8855 $unicode_version that
8856 $match$synonyms:
8857
8858 $matches_comment
8859 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8860 including if adding or subtracting white space, underscore, and hyphen
8861 characters matters or doesn't matter, and other permissible syntactic
8862 variants.  Upper/lower case distinctions never matter.
8863 END
8864
8865         }
8866         if ($compound_name) {
8867             $comment .= <<END;
8868
8869 A colon can be substituted for the equals sign, and
8870 END
8871             if ($properties_with_compound_names > 1) {
8872                 $comment .= <<END;
8873 within each group above,
8874 END
8875             }
8876             $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8877
8878             # Note the \b below, it says don't make that line a continuation.
8879             $comment .= <<END;
8880 anything to the left of the equals (or colon) can be combined with anything to
8881 the right.  Thus, for example,
8882 $compound_name
8883 \bis also valid.
8884 END
8885         }
8886
8887         # And append any comment(s) from the actual tables.  They are all
8888         # gathered here, so may not read all that well.
8889         if (@global_comments) {
8890             $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8891         }
8892
8893         if ($count) {   # The format differs if no code points, and needs no
8894                         # explanation in that case
8895             if ($leader->write_as_invlist) {
8896                 $comment.= <<END;
8897
8898 The first data line of this file begins with the letter V to indicate it is in
8899 inversion list format.  The number following the V gives the number of lines
8900 remaining.  Each of those remaining lines is a single number representing the
8901 starting code point of a range which goes up to but not including the number
8902 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8903 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8904 the property.  The final line's range extends to the platform's infinity.
8905 END
8906             }
8907             else {
8908                 $comment.= <<END;
8909 The format of the lines of this file is:
8910 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8911 STOP is the ending point, or if omitted, the range has just one code point.
8912 END
8913             }
8914             if ($leader->output_range_counts) {
8915                 $comment .= <<END;
8916 Numbers in comments in [brackets] indicate how many code points are in the
8917 range.
8918 END
8919             }
8920         }
8921
8922         $leader->set_comment(main::join_lines($comment));
8923         return;
8924     }
8925
8926     # Accessors for the underlying list
8927     for my $sub (qw(
8928                     get_valid_code_point
8929                     get_invalid_code_point
8930                 ))
8931     {
8932         no strict "refs";
8933         *$sub = sub {
8934             use strict "refs";
8935             my $self = shift;
8936
8937             return $self->_range_list->$sub(@_);
8938         }
8939     }
8940 } # End closure for Match_Table
8941
8942 package Property;
8943
8944 # The Property class represents a Unicode property, or the $perl
8945 # pseudo-property.  It contains a map table initialized empty at construction
8946 # time, and for properties accessible through regular expressions, various
8947 # match tables, created through the add_match_table() method, and referenced
8948 # by the table('NAME') or tables() methods, the latter returning a list of all
8949 # of the match tables.  Otherwise table operations implicitly are for the map
8950 # table.
8951 #
8952 # Most of the data in the property is actually about its map table, so it
8953 # mostly just uses that table's accessors for most methods.  The two could
8954 # have been combined into one object, but for clarity because of their
8955 # differing semantics, they have been kept separate.  It could be argued that
8956 # the 'file' and 'directory' fields should be kept with the map table.
8957 #
8958 # Each property has a type.  This can be set in the constructor, or in the
8959 # set_type accessor, but mostly it is figured out by the data.  Every property
8960 # starts with unknown type, overridden by a parameter to the constructor, or
8961 # as match tables are added, or ranges added to the map table, the data is
8962 # inspected, and the type changed.  After the table is mostly or entirely
8963 # filled, compute_type() should be called to finalize they analysis.
8964 #
8965 # There are very few operations defined.  One can safely remove a range from
8966 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8967 # table to this one, replacing any in the intersection of the two.
8968
8969 sub standardize { return main::standardize($_[0]); }
8970 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8971
8972 {   # Closure
8973
8974     # This hash will contain as keys, all the aliases of all properties, and
8975     # as values, pointers to their respective property objects.  This allows
8976     # quick look-up of a property from any of its names.
8977     my %alias_to_property_of;
8978
8979     sub dump_alias_to_property_of {
8980         # For debugging
8981
8982         print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8983         return;
8984     }
8985
8986     sub property_ref {
8987         # This is a package subroutine, not called as a method.
8988         # If the single parameter is a literal '*' it returns a list of all
8989         # defined properties.
8990         # Otherwise, the single parameter is a name, and it returns a pointer
8991         # to the corresponding property object, or undef if none.
8992         #
8993         # Properties can have several different names.  The 'standard' form of
8994         # each of them is stored in %alias_to_property_of as they are defined.
8995         # But it's possible that this subroutine will be called with some
8996         # variant, so if the initial lookup fails, it is repeated with the
8997         # standardized form of the input name.  If found, besides returning the
8998         # result, the input name is added to the list so future calls won't
8999         # have to do the conversion again.
9000
9001         my $name = shift;
9002
9003         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9004
9005         if (! defined $name) {
9006             Carp::my_carp_bug("Undefined input property.  No action taken.");
9007             return;
9008         }
9009
9010         return main::uniques(values %alias_to_property_of) if $name eq '*';
9011
9012         # Return cached result if have it.
9013         my $result = $alias_to_property_of{$name};
9014         return $result if defined $result;
9015
9016         # Convert the input to standard form.
9017         my $standard_name = standardize($name);
9018
9019         $result = $alias_to_property_of{$standard_name};
9020         return unless defined $result;        # Don't cache undefs
9021
9022         # Cache the result before returning it.
9023         $alias_to_property_of{$name} = $result;
9024         return $result;
9025     }
9026
9027
9028     main::setup_package();
9029
9030     my %map;
9031     # A pointer to the map table object for this property
9032     main::set_access('map', \%map);
9033
9034     my %full_name;
9035     # The property's full name.  This is a duplicate of the copy kept in the
9036     # map table, but is needed because stringify needs it during
9037     # construction of the map table, and then would have a chicken before egg
9038     # problem.
9039     main::set_access('full_name', \%full_name, 'r');
9040
9041     my %table_ref;
9042     # This hash will contain as keys, all the aliases of any match tables
9043     # attached to this property, and as values, the pointers to their
9044     # respective tables.  This allows quick look-up of a table from any of its
9045     # names.
9046     main::set_access('table_ref', \%table_ref);
9047
9048     my %type;
9049     # The type of the property, $ENUM, $BINARY, etc
9050     main::set_access('type', \%type, 'r');
9051
9052     my %file;
9053     # The filename where the map table will go (if actually written).
9054     # Normally defaulted, but can be overridden.
9055     main::set_access('file', \%file, 'r', 's');
9056
9057     my %directory;
9058     # The directory where the map table will go (if actually written).
9059     # Normally defaulted, but can be overridden.
9060     main::set_access('directory', \%directory, 's');
9061
9062     my %pseudo_map_type;
9063     # This is used to affect the calculation of the map types for all the
9064     # ranges in the table.  It should be set to one of the values that signify
9065     # to alter the calculation.
9066     main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
9067
9068     my %has_only_code_point_maps;
9069     # A boolean used to help in computing the type of data in the map table.
9070     main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
9071
9072     my %unique_maps;
9073     # A list of the first few distinct mappings this property has.  This is
9074     # used to disambiguate between binary and enum property types, so don't
9075     # have to keep more than three.
9076     main::set_access('unique_maps', \%unique_maps);
9077
9078     my %pre_declared_maps;
9079     # A boolean that gives whether the input data should declare all the
9080     # tables used, or not.  If the former, unknown ones raise a warning.
9081     main::set_access('pre_declared_maps',
9082                                     \%pre_declared_maps, 'r', 's');
9083
9084     my %has_dependency;
9085     # A boolean that gives whether some table somewhere is defined as the
9086     # complement of a table in this property.  This is a crude, but currently
9087     # sufficient, mechanism to make this property not get destroyed before
9088     # what is dependent on it is.  Other dependencies could be added, so the
9089     # name was chosen to reflect a more general situation than actually is
9090     # currently the case.
9091     main::set_access('has_dependency', \%has_dependency, 'r', 's');
9092
9093     sub new {
9094         # The only required parameter is the positionally first, name.  All
9095         # other parameters are key => value pairs.  See the documentation just
9096         # above for the meanings of the ones not passed directly on to the map
9097         # table constructor.
9098
9099         my $class = shift;
9100         my $name = shift || "";
9101
9102         my $self = property_ref($name);
9103         if (defined $self) {
9104             my $options_string = join ", ", @_;
9105             $options_string = ".  Ignoring options $options_string" if $options_string;
9106             Carp::my_carp("$self is already in use.  Using existing one$options_string;");
9107             return $self;
9108         }
9109
9110         my %args = @_;
9111
9112         $self = bless \do { my $anonymous_scalar }, $class;
9113         my $addr = do { no overloading; pack 'J', $self; };
9114
9115         $directory{$addr} = delete $args{'Directory'};
9116         $file{$addr} = delete $args{'File'};
9117         $full_name{$addr} = delete $args{'Full_Name'} || $name;
9118         $type{$addr} = delete $args{'Type'} || $UNKNOWN;
9119         $pseudo_map_type{$addr} = delete $args{'Map_Type'};
9120         $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
9121                                     # Starting in this release, property
9122                                     # values should be defined for all
9123                                     # properties, except those overriding this
9124                                     // $v_version ge v5.1.0;
9125
9126         # Rest of parameters passed on.
9127
9128         $has_only_code_point_maps{$addr} = 1;
9129         $table_ref{$addr} = { };
9130         $unique_maps{$addr} = { };
9131         $has_dependency{$addr} = 0;
9132
9133         $map{$addr} = Map_Table->new($name,
9134                                     Full_Name => $full_name{$addr},
9135                                     _Alias_Hash => \%alias_to_property_of,
9136                                     _Property => $self,
9137                                     %args);
9138         return $self;
9139     }
9140
9141     # See this program's beginning comment block about overloading the copy
9142     # constructor.  Few operations are defined on properties, but a couple are
9143     # useful.  It is safe to take the inverse of a property, and to remove a
9144     # single code point from it.
9145     use overload
9146         fallback => 0,
9147         qw("") => "_operator_stringify",
9148         "." => \&main::_operator_dot,
9149         ".=" => \&main::_operator_dot_equal,
9150         '==' => \&main::_operator_equal,
9151         '!=' => \&main::_operator_not_equal,
9152         '=' => sub { return shift },
9153         '-=' => "_minus_and_equal",
9154     ;
9155
9156     sub _operator_stringify {
9157         return "Property '" .  shift->full_name . "'";
9158     }
9159
9160     sub _minus_and_equal {
9161         # Remove a single code point from the map table of a property.
9162
9163         my $self = shift;
9164         my $other = shift;
9165         my $reversed = shift;
9166         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9167
9168         if (ref $other) {
9169             Carp::my_carp_bug("Bad news.  Can't cope with a "
9170                         . ref($other)
9171                         . " argument to '-='.  Subtraction ignored.");
9172             return $self;
9173         }
9174         elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
9175             Carp::my_carp_bug("Bad news.  Can't cope with subtracting a "
9176             . ref $self
9177             . " from a non-object.  undef returned.");
9178             return;
9179         }
9180         else {
9181             no overloading;
9182             $map{pack 'J', $self}->delete_range($other, $other);
9183         }
9184         return $self;
9185     }
9186
9187     sub add_match_table {
9188         # Add a new match table for this property, with name given by the
9189         # parameter.  It returns a pointer to the table.
9190
9191         my $self = shift;
9192         my $name = shift;
9193         my %args = @_;
9194
9195         my $addr = do { no overloading; pack 'J', $self; };
9196
9197         my $table = $table_ref{$addr}{$name};
9198         my $standard_name = main::standardize($name);
9199         if (defined $table
9200             || (defined ($table = $table_ref{$addr}{$standard_name})))
9201         {
9202             Carp::my_carp("Table '$name' in $self is already in use.  Using existing one");
9203             $table_ref{$addr}{$name} = $table;
9204             return $table;
9205         }
9206         else {
9207
9208             # See if this is a perl extension, if not passed in.
9209             my $perl_extension = delete $args{'Perl_Extension'};
9210             $perl_extension
9211                         = $self->perl_extension if ! defined $perl_extension;
9212
9213             my $fate;
9214             my $suppression_reason = "";
9215             if ($self->name =~ /^_/) {
9216                 $fate = $SUPPRESSED;
9217                 $suppression_reason = "Parent property is internal only";
9218             }
9219             elsif ($self->fate >= $SUPPRESSED) {
9220                 $fate = $self->fate;
9221                 $suppression_reason = $why_suppressed{$self->complete_name};
9222
9223             }
9224             elsif ($name =~ /^_/) {
9225                 $fate = $INTERNAL_ONLY;
9226             }
9227             $table = Match_Table->new(
9228                                 Name => $name,
9229                                 Perl_Extension => $perl_extension,
9230                                 _Alias_Hash => $table_ref{$addr},
9231                                 _Property => $self,
9232                                 Fate => $fate,
9233                                 Suppression_Reason => $suppression_reason,
9234                                 Status => $self->status,
9235                                 _Status_Info => $self->status_info,
9236                                 %args);
9237             return unless defined $table;
9238         }
9239
9240         # Save the names for quick look up
9241         $table_ref{$addr}{$standard_name} = $table;
9242         $table_ref{$addr}{$name} = $table;
9243
9244         # Perhaps we can figure out the type of this property based on the
9245         # fact of adding this match table.  First, string properties don't
9246         # have match tables; second, a binary property can't have 3 match
9247         # tables
9248         if ($type{$addr} == $UNKNOWN) {
9249             $type{$addr} = $NON_STRING;
9250         }
9251         elsif ($type{$addr} == $STRING) {
9252             Carp::my_carp("$self Added a match table '$name' to a string property '$self'.  Changed it to a non-string property.  Bad News.");
9253             $type{$addr} = $NON_STRING;
9254         }
9255         elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
9256             if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
9257                 if ($type{$addr} == $BINARY) {
9258                     Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary.  Changing its type to 'enum'.  Bad News.");
9259                 }
9260                 $type{$addr} = $ENUM;
9261             }
9262         }
9263
9264         return $table;
9265     }
9266
9267     sub delete_match_table {
9268         # Delete the table referred to by $2 from the property $1.
9269
9270         my $self = shift;
9271         my $table_to_remove = shift;
9272         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9273
9274         my $addr = do { no overloading; pack 'J', $self; };
9275
9276         # Remove all names that refer to it.
9277         foreach my $key (keys %{$table_ref{$addr}}) {
9278             delete $table_ref{$addr}{$key}
9279                                 if $table_ref{$addr}{$key} == $table_to_remove;
9280         }
9281
9282         $table_to_remove->DESTROY;
9283         return;
9284     }
9285
9286     sub table {
9287         # Return a pointer to the match table (with name given by the
9288         # parameter) associated with this property; undef if none.
9289
9290         my $self = shift;
9291         my $name = shift;
9292         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9293
9294         my $addr = do { no overloading; pack 'J', $self; };
9295
9296         return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9297
9298         # If quick look-up failed, try again using the standard form of the
9299         # input name.  If that succeeds, cache the result before returning so
9300         # won't have to standardize this input name again.
9301         my $standard_name = main::standardize($name);
9302         return unless defined $table_ref{$addr}{$standard_name};
9303
9304         $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9305         return $table_ref{$addr}{$name};
9306     }
9307
9308     sub tables {
9309         # Return a list of pointers to all the match tables attached to this
9310         # property
9311
9312         no overloading;
9313         return main::uniques(values %{$table_ref{pack 'J', shift}});
9314     }
9315
9316     sub directory {
9317         # Returns the directory the map table for this property should be
9318         # output in.  If a specific directory has been specified, that has
9319         # priority;  'undef' is returned if the type isn't defined;
9320         # or $map_directory for everything else.
9321
9322         my $addr = do { no overloading; pack 'J', shift; };
9323
9324         return $directory{$addr} if defined $directory{$addr};
9325         return undef if $type{$addr} == $UNKNOWN;
9326         return $map_directory;
9327     }
9328
9329     sub swash_name {
9330         # Return the name that is used to both:
9331         #   1)  Name the file that the map table is written to.
9332         #   2)  The name of swash related stuff inside that file.
9333         # The reason for this is that the Perl core historically has used
9334         # certain names that aren't the same as the Unicode property names.
9335         # To continue using these, $file is hard-coded in this file for those,
9336         # but otherwise the standard name is used.  This is different from the
9337         # external_name, so that the rest of the files, like in lib can use
9338         # the standard name always, without regard to historical precedent.
9339
9340         my $self = shift;
9341         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9342
9343         my $addr = do { no overloading; pack 'J', $self; };
9344
9345         # Swash names are used only on either
9346         # 1) legacy-only properties, because the formats for these are
9347         #    unchangeable, and they have had these lines in them; or
9348         # 2) regular or internal-only map tables
9349         # 3) otherwise there should be no access to the
9350         #    property map table from other parts of Perl.
9351         return if $map{$addr}->fate != $ORDINARY
9352                   && $map{$addr}->fate != $LEGACY_ONLY
9353                   && ! ($map{$addr}->name =~ /^_/
9354                         && $map{$addr}->fate == $INTERNAL_ONLY);
9355
9356         return $file{$addr} if defined $file{$addr};
9357         return $map{$addr}->external_name;
9358     }
9359
9360     sub to_create_match_tables {
9361         # Returns a boolean as to whether or not match tables should be
9362         # created for this property.
9363
9364         my $self = shift;
9365         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9366
9367         # The whole point of this pseudo property is match tables.
9368         return 1 if $self == $perl;
9369
9370         my $addr = do { no overloading; pack 'J', $self; };
9371
9372         # Don't generate tables of code points that match the property values
9373         # of a string property.  Such a list would most likely have many
9374         # property values, each with just one or very few code points mapping
9375         # to it.
9376         return 0 if $type{$addr} == $STRING;
9377
9378         # Otherwise, do.
9379         return 1;
9380     }
9381
9382     sub property_add_or_replace_non_nulls {
9383         # This adds the mappings in the property $other to $self.  Non-null
9384         # mappings from $other override those in $self.  It essentially merges
9385         # the two properties, with the second having priority except for null
9386         # mappings.
9387
9388         my $self = shift;
9389         my $other = shift;
9390         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9391
9392         if (! $other->isa(__PACKAGE__)) {
9393             Carp::my_carp_bug("$other should be a "
9394                             . __PACKAGE__
9395                             . ".  Not a '"
9396                             . ref($other)
9397                             . "'.  Not added;");
9398             return;
9399         }
9400
9401         no overloading;
9402         return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
9403     }
9404
9405     sub set_proxy_for {
9406         # Certain tables are not generally written out to files, but
9407         # Unicode::UCD has the intelligence to know that the file for $self
9408         # can be used to reconstruct those tables.  This routine just changes
9409         # things so that UCD pod entries for those suppressed tables are
9410         # generated, so the fact that a proxy is used is invisible to the
9411         # user.
9412
9413         my $self = shift;
9414
9415         foreach my $property_name (@_) {
9416             my $ref = property_ref($property_name);
9417             next if $ref->to_output_map;
9418             $ref->set_fate($MAP_PROXIED);
9419         }
9420     }
9421
9422     sub set_type {
9423         # Set the type of the property.  Mostly this is figured out by the
9424         # data in the table.  But this is used to set it explicitly.  The
9425         # reason it is not a standard accessor is that when setting a binary
9426         # property, we need to make sure that all the true/false aliases are
9427         # present, as they were omitted in early Unicode releases.
9428
9429         my $self = shift;
9430         my $type = shift;
9431         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9432
9433         if ($type != $ENUM
9434             && $type != $BINARY
9435             && $type != $FORCED_BINARY
9436             && $type != $STRING)
9437         {
9438             Carp::my_carp("Unrecognized type '$type'.  Type not set");
9439             return;
9440         }
9441
9442         { no overloading; $type{pack 'J', $self} = $type; }
9443         return if $type != $BINARY && $type != $FORCED_BINARY;
9444
9445         my $yes = $self->table('Y');
9446         $yes = $self->table('Yes') if ! defined $yes;
9447         $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9448                                                             if ! defined $yes;
9449
9450         # Add aliases in order wanted, duplicates will be ignored.  We use a
9451         # binary property present in all releases for its ordered lists of
9452         # true/false aliases.  Note, that could run into problems in
9453         # outputting things in that we don't distinguish between the name and
9454         # full name of these.  Hopefully, if the table was already created
9455         # before this code is executed, it was done with these set properly.
9456         my $bm = property_ref("Bidi_Mirrored");
9457         foreach my $alias ($bm->table("Y")->aliases) {
9458             $yes->add_alias($alias->name);
9459         }
9460         my $no = $self->table('N');
9461         $no = $self->table('No') if ! defined $no;
9462         $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9463         foreach my $alias ($bm->table("N")->aliases) {
9464             $no->add_alias($alias->name);
9465         }
9466
9467         return;
9468     }
9469
9470     sub add_map {
9471         # Add a map to the property's map table.  This also keeps
9472         # track of the maps so that the property type can be determined from
9473         # its data.
9474
9475         my $self = shift;
9476         my $start = shift;  # First code point in range
9477         my $end = shift;    # Final code point in range
9478         my $map = shift;    # What the range maps to.
9479         # Rest of parameters passed on.
9480
9481         my $addr = do { no overloading; pack 'J', $self; };
9482
9483         # If haven't the type of the property, gather information to figure it
9484         # out.
9485         if ($type{$addr} == $UNKNOWN) {
9486
9487             # If the map contains an interior blank or dash, or most other
9488             # nonword characters, it will be a string property.  This
9489             # heuristic may actually miss some string properties.  If so, they
9490             # may need to have explicit set_types called for them.  This
9491             # happens in the Unihan properties.
9492             if ($map =~ / (?<= . ) [ -] (?= . ) /x
9493                 || $map =~ / [^\w.\/\ -]  /x)
9494             {
9495                 $self->set_type($STRING);
9496
9497                 # $unique_maps is used for disambiguating between ENUM and
9498                 # BINARY later; since we know the property is not going to be
9499                 # one of those, no point in keeping the data around
9500                 undef $unique_maps{$addr};
9501             }
9502             else {
9503
9504                 # Not necessarily a string.  The final decision has to be
9505                 # deferred until all the data are in.  We keep track of if all
9506                 # the values are code points for that eventual decision.
9507                 $has_only_code_point_maps{$addr} &=
9508                                             $map =~ / ^ $code_point_re $/x;
9509
9510                 # For the purposes of disambiguating between binary and other
9511                 # enumerations at the end, we keep track of the first three
9512                 # distinct property values.  Once we get to three, we know
9513                 # it's not going to be binary, so no need to track more.
9514                 if (scalar keys %{$unique_maps{$addr}} < 3) {
9515                     $unique_maps{$addr}{main::standardize($map)} = 1;
9516                 }
9517             }
9518         }
9519
9520         # Add the mapping by calling our map table's method
9521         return $map{$addr}->add_map($start, $end, $map, @_);
9522     }
9523
9524     sub compute_type {
9525         # Compute the type of the property: $ENUM, $STRING, or $BINARY.  This
9526         # should be called after the property is mostly filled with its maps.
9527         # We have been keeping track of what the property values have been,
9528         # and now have the necessary information to figure out the type.
9529
9530         my $self = shift;
9531         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9532
9533         my $addr = do { no overloading; pack 'J', $self; };
9534
9535         my $type = $type{$addr};
9536
9537         # If already have figured these out, no need to do so again, but we do
9538         # a double check on ENUMS to make sure that a string property hasn't
9539         # improperly been classified as an ENUM, so continue on with those.
9540         return if $type == $STRING
9541                   || $type == $BINARY
9542                   || $type == $FORCED_BINARY;
9543
9544         # If every map is to a code point, is a string property.
9545         if ($type == $UNKNOWN
9546             && ($has_only_code_point_maps{$addr}
9547                 || (defined $map{$addr}->default_map
9548                     && $map{$addr}->default_map eq "")))
9549         {
9550             $self->set_type($STRING);
9551         }
9552         else {
9553
9554             # Otherwise, it is to some sort of enumeration.  (The case where
9555             # it is a Unicode miscellaneous property, and treated like a
9556             # string in this program is handled in add_map()).  Distinguish
9557             # between binary and some other enumeration type.  Of course, if
9558             # there are more than two values, it's not binary.  But more
9559             # subtle is the test that the default mapping is defined means it
9560             # isn't binary.  This in fact may change in the future if Unicode
9561             # changes the way its data is structured.  But so far, no binary
9562             # properties ever have @missing lines for them, so the default map
9563             # isn't defined for them.  The few properties that are two-valued
9564             # and aren't considered binary have the default map defined
9565             # starting in Unicode 5.0, when the @missing lines appeared; and
9566             # this program has special code to put in a default map for them
9567             # for earlier than 5.0 releases.
9568             if ($type == $ENUM
9569                 || scalar keys %{$unique_maps{$addr}} > 2
9570                 || defined $self->default_map)
9571             {
9572                 my $tables = $self->tables;
9573                 my $count = $self->count;
9574                 if ($verbosity && $tables > 500 && $tables/$count > .1) {
9575                     Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9576                 }
9577                 $self->set_type($ENUM);
9578             }
9579             else {
9580                 $self->set_type($BINARY);
9581             }
9582         }
9583         undef $unique_maps{$addr};  # Garbage collect
9584         return;
9585     }
9586
9587     sub set_fate {
9588         my $self = shift;
9589         my $fate = shift;
9590         my $reason = shift;  # Ignored unless suppressing
9591         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9592
9593         my $addr = do { no overloading; pack 'J', $self; };
9594         if ($fate >= $SUPPRESSED) {
9595             $why_suppressed{$self->complete_name} = $reason;
9596         }
9597
9598         # Each table shares the property's fate, except that MAP_PROXIED
9599         # doesn't affect match tables
9600         $map{$addr}->set_fate($fate, $reason);
9601         if ($fate != $MAP_PROXIED) {
9602             foreach my $table ($map{$addr}, $self->tables) {
9603                 $table->set_fate($fate, $reason);
9604             }
9605         }
9606         return;
9607     }
9608
9609
9610     # Most of the accessors for a property actually apply to its map table.
9611     # Setup up accessor functions for those, referring to %map
9612     for my $sub (qw(
9613                     add_alias
9614                     add_anomalous_entry
9615                     add_comment
9616                     add_conflicting
9617                     add_description
9618                     add_duplicate
9619                     add_note
9620                     aliases
9621                     comment
9622                     complete_name
9623                     containing_range
9624                     count
9625                     default_map
9626                     definition
9627                     delete_range
9628                     description
9629                     each_range
9630                     external_name
9631                     fate
9632                     file_path
9633                     format
9634                     initialize
9635                     inverse_list
9636                     is_empty
9637                     replacement_property
9638                     name
9639                     note
9640                     perl_extension
9641                     property
9642                     range_count
9643                     ranges
9644                     range_size_1
9645                     replace_map
9646                     reset_each_range
9647                     set_comment
9648                     set_default_map
9649                     set_file_path
9650                     set_final_comment
9651                     _set_format
9652                     set_range_size_1
9653                     set_status
9654                     set_to_output_map
9655                     short_name
9656                     status
9657                     status_info
9658                     to_output_map
9659                     type_of
9660                     value_of
9661                     write
9662                 ))
9663                     # 'property' above is for symmetry, so that one can take
9664                     # the property of a property and get itself, and so don't
9665                     # have to distinguish between properties and tables in
9666                     # calling code
9667     {
9668         no strict "refs";
9669         *$sub = sub {
9670             use strict "refs";
9671             my $self = shift;
9672             no overloading;
9673             return $map{pack 'J', $self}->$sub(@_);
9674         }
9675     }
9676
9677
9678 } # End closure
9679
9680 package main;
9681
9682 sub display_chr {
9683     # Converts an ordinal printable character value to a displayable string,
9684     # using a dotted circle to hold combining characters.
9685
9686     my $ord = shift;
9687     my $chr = chr $ord;
9688     return $chr if $ccc->table(0)->contains($ord);
9689     return "\x{25CC}$chr";
9690 }
9691
9692 sub join_lines($) {
9693     # Returns lines of the input joined together, so that they can be folded
9694     # properly.
9695     # This causes continuation lines to be joined together into one long line
9696     # for folding.  A continuation line is any line that doesn't begin with a
9697     # space or "\b" (the latter is stripped from the output).  This is so
9698     # lines can be be in a HERE document so as to fit nicely in the terminal
9699     # width, but be joined together in one long line, and then folded with
9700     # indents, '#' prefixes, etc, properly handled.
9701     # A blank separates the joined lines except if there is a break; an extra
9702     # blank is inserted after a period ending a line.
9703
9704     # Initialize the return with the first line.
9705     my ($return, @lines) = split "\n", shift;
9706
9707     # If the first line is null, it was an empty line, add the \n back in
9708     $return = "\n" if $return eq "";
9709
9710     # Now join the remainder of the physical lines.
9711     for my $line (@lines) {
9712
9713         # An empty line means wanted a blank line, so add two \n's to get that
9714         # effect, and go to the next line.
9715         if (length $line == 0) {
9716             $return .= "\n\n";
9717             next;
9718         }
9719
9720         # Look at the last character of what we have so far.
9721         my $previous_char = substr($return, -1, 1);
9722
9723         # And at the next char to be output.
9724         my $next_char = substr($line, 0, 1);
9725
9726         if ($previous_char ne "\n") {
9727
9728             # Here didn't end wth a nl.  If the next char a blank or \b, it
9729             # means that here there is a break anyway.  So add a nl to the
9730             # output.
9731             if ($next_char eq " " || $next_char eq "\b") {
9732                 $previous_char = "\n";
9733                 $return .= $previous_char;
9734             }
9735
9736             # Add an extra space after periods.
9737             $return .= " " if $previous_char eq '.';
9738         }
9739
9740         # Here $previous_char is still the latest character to be output.  If
9741         # it isn't a nl, it means that the next line is to be a continuation
9742         # line, with a blank inserted between them.
9743         $return .= " " if $previous_char ne "\n";
9744
9745         # Get rid of any \b
9746         substr($line, 0, 1) = "" if $next_char eq "\b";
9747
9748         # And append this next line.
9749         $return .= $line;
9750     }
9751
9752     return $return;
9753 }
9754
9755 sub simple_fold($;$$$) {
9756     # Returns a string of the input (string or an array of strings) folded
9757     # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9758     # a \n
9759     # This is tailored for the kind of text written by this program,
9760     # especially the pod file, which can have very long names with
9761     # underscores in the middle, or words like AbcDefgHij....  We allow
9762     # breaking in the middle of such constructs if the line won't fit
9763     # otherwise.  The break in such cases will come either just after an
9764     # underscore, or just before one of the Capital letters.
9765
9766     local $to_trace = 0 if main::DEBUG;
9767
9768     my $line = shift;
9769     my $prefix = shift;     # Optional string to prepend to each output
9770                             # line
9771     $prefix = "" unless defined $prefix;
9772
9773     my $hanging_indent = shift; # Optional number of spaces to indent
9774                                 # continuation lines
9775     $hanging_indent = 0 unless $hanging_indent;
9776
9777     my $right_margin = shift;   # Optional number of spaces to narrow the
9778                                 # total width by.
9779     $right_margin = 0 unless defined $right_margin;
9780
9781     # Call carp with the 'nofold' option to avoid it from trying to call us
9782     # recursively
9783     Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
9784
9785     # The space available doesn't include what's automatically prepended
9786     # to each line, or what's reserved on the right.
9787     my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9788     # XXX Instead of using the 'nofold' perhaps better to look up the stack
9789
9790     if (DEBUG && $hanging_indent >= $max) {
9791         Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max.  Using 0", 'nofold');
9792         $hanging_indent = 0;
9793     }
9794
9795     # First, split into the current physical lines.
9796     my @line;
9797     if (ref $line) {        # Better be an array, because not bothering to
9798                             # test
9799         foreach my $line (@{$line}) {
9800             push @line, split /\n/, $line;
9801         }
9802     }
9803     else {
9804         @line = split /\n/, $line;
9805     }
9806
9807     #local $to_trace = 1 if main::DEBUG;
9808     trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9809
9810     # Look at each current physical line.
9811     for (my $i = 0; $i < @line; $i++) {
9812         Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9813         #local $to_trace = 1 if main::DEBUG;
9814         trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9815
9816         # Remove prefix, because will be added back anyway, don't want
9817         # doubled prefix
9818         $line[$i] =~ s/^$prefix//;
9819
9820         # Remove trailing space
9821         $line[$i] =~ s/\s+\Z//;
9822
9823         # If the line is too long, fold it.
9824         if (length $line[$i] > $max) {
9825             my $remainder;
9826
9827             # Here needs to fold.  Save the leading space in the line for
9828             # later.
9829             $line[$i] =~ /^ ( \s* )/x;
9830             my $leading_space = $1;
9831             trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9832
9833             # If character at final permissible position is white space,
9834             # fold there, which will delete that white space
9835             if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9836                 $remainder = substr($line[$i], $max);
9837                 $line[$i] = substr($line[$i], 0, $max - 1);
9838             }
9839             else {
9840
9841                 # Otherwise fold at an acceptable break char closest to
9842                 # the max length.  Look at just the maximal initial
9843                 # segment of the line
9844                 my $segment = substr($line[$i], 0, $max - 1);
9845                 if ($segment =~
9846                     /^ ( .{$hanging_indent}   # Don't look before the
9847                                               #  indent.
9848                         \ *                   # Don't look in leading
9849                                               #  blanks past the indent
9850                             [^ ] .*           # Find the right-most
9851                         (?:                   #  acceptable break:
9852                             [ \s = ]          # space or equal
9853                             | - (?! [.0-9] )  # or non-unary minus.
9854                         )                     # $1 includes the character
9855                     )/x)
9856                 {
9857                     # Split into the initial part that fits, and remaining
9858                     # part of the input
9859                     $remainder = substr($line[$i], length $1);
9860                     $line[$i] = $1;
9861                     trace $line[$i] if DEBUG && $to_trace;
9862                     trace $remainder if DEBUG && $to_trace;
9863                 }
9864
9865                 # If didn't find a good breaking spot, see if there is a
9866                 # not-so-good breaking spot.  These are just after
9867                 # underscores or where the case changes from lower to
9868                 # upper.  Use \a as a soft hyphen, but give up
9869                 # and don't break the line if there is actually a \a
9870                 # already in the input.  We use an ascii character for the
9871                 # soft-hyphen to avoid any attempt by miniperl to try to
9872                 # access the files that this program is creating.
9873                 elsif ($segment !~ /\a/
9874                        && ($segment =~ s/_/_\a/g
9875                        || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9876                 {
9877                     # Here were able to find at least one place to insert
9878                     # our substitute soft hyphen.  Find the right-most one
9879                     # and replace it by a real hyphen.
9880                     trace $segment if DEBUG && $to_trace;
9881                     substr($segment,
9882                             rindex($segment, "\a"),
9883                             1) = '-';
9884
9885                     # Then remove the soft hyphen substitutes.
9886                     $segment =~ s/\a//g;
9887                     trace $segment if DEBUG && $to_trace;
9888
9889                     # And split into the initial part that fits, and
9890                     # remainder of the line
9891                     my $pos = rindex($segment, '-');
9892                     $remainder = substr($line[$i], $pos);
9893                     trace $remainder if DEBUG && $to_trace;
9894                     $line[$i] = substr($segment, 0, $pos + 1);
9895                 }
9896             }
9897
9898             # Here we know if we can fold or not.  If we can, $remainder
9899             # is what remains to be processed in the next iteration.
9900             if (defined $remainder) {
9901                 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9902
9903                 # Insert the folded remainder of the line as a new element
9904                 # of the array.  (It may still be too long, but we will
9905                 # deal with that next time through the loop.)  Omit any
9906                 # leading space in the remainder.
9907                 $remainder =~ s/^\s+//;
9908                 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9909
9910                 # But then indent by whichever is larger of:
9911                 # 1) the leading space on the input line;
9912                 # 2) the hanging indent.
9913                 # This preserves indentation in the original line.
9914                 my $lead = ($leading_space)
9915                             ? length $leading_space
9916                             : $hanging_indent;
9917                 $lead = max($lead, $hanging_indent);
9918                 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9919             }
9920         }
9921
9922         # Ready to output the line. Get rid of any trailing space
9923         # And prefix by the required $prefix passed in.
9924         $line[$i] =~ s/\s+$//;
9925         $line[$i] = "$prefix$line[$i]\n";
9926     } # End of looping through all the lines.
9927
9928     return join "", @line;
9929 }
9930
9931 sub property_ref {  # Returns a reference to a property object.
9932     return Property::property_ref(@_);
9933 }
9934
9935 sub force_unlink ($) {
9936     my $filename = shift;
9937     return unless file_exists($filename);
9938     return if CORE::unlink($filename);
9939
9940     # We might need write permission
9941     chmod 0777, $filename;
9942     CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename.  Proceeding anyway: $!");
9943     return;
9944 }
9945
9946 sub write ($$@) {
9947     # Given a filename and references to arrays of lines, write the lines of
9948     # each array to the file
9949     # Filename can be given as an arrayref of directory names
9950
9951     return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9952
9953     my $file  = shift;
9954     my $use_utf8 = shift;
9955
9956     # Get into a single string if an array, and get rid of, in Unix terms, any
9957     # leading '.'
9958     $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9959     $file = File::Spec->canonpath($file);
9960
9961     # If has directories, make sure that they all exist
9962     (undef, my $directories, undef) = File::Spec->splitpath($file);
9963     File::Path::mkpath($directories) if $directories && ! -d $directories;
9964
9965     push @files_actually_output, $file;
9966
9967     force_unlink ($file);
9968
9969     my $OUT;
9970     if (not open $OUT, ">", $file) {
9971         Carp::my_carp("can't open $file for output.  Skipping this file: $!");
9972         return;
9973     }
9974
9975     binmode $OUT, ":utf8" if $use_utf8;
9976
9977     while (defined (my $lines_ref = shift)) {
9978         unless (@$lines_ref) {
9979             Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9980         }
9981
9982         print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9983     }
9984     close $OUT or die Carp::my_carp("close '$file' failed: $!");
9985
9986     print "$file written.\n" if $verbosity >= $VERBOSE;
9987
9988     return;
9989 }
9990
9991
9992 sub Standardize($) {
9993     # This converts the input name string into a standardized equivalent to
9994     # use internally.
9995
9996     my $name = shift;
9997     unless (defined $name) {
9998       Carp::my_carp_bug("Standardize() called with undef.  Returning undef.");
9999       return;
10000     }
10001
10002     # Remove any leading or trailing white space
10003     $name =~ s/^\s+//g;
10004     $name =~ s/\s+$//g;
10005
10006     # Convert interior white space and hyphens into underscores.
10007     $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
10008
10009     # Capitalize the letter following an underscore, and convert a sequence of
10010     # multiple underscores to a single one
10011     $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
10012
10013     # And capitalize the first letter, but not for the special cjk ones.
10014     $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
10015     return $name;
10016 }
10017
10018 sub standardize ($) {
10019     # Returns a lower-cased standardized name, without underscores.  This form
10020     # is chosen so that it can distinguish between any real versus superficial
10021     # Unicode name differences.  It relies on the fact that Unicode doesn't
10022     # have interior underscores, white space, nor dashes in any
10023     # stricter-matched name.  It should not be used on Unicode code point
10024     # names (the Name property), as they mostly, but not always follow these
10025     # rules.
10026
10027     my $name = Standardize(shift);
10028     return if !defined $name;
10029
10030     $name =~ s/ (?<= .) _ (?= . ) //xg;
10031     return lc $name;
10032 }
10033
10034 sub utf8_heavy_name ($$) {
10035     # Returns the name that utf8_heavy.pl will use to find a table.  XXX
10036     # perhaps this function should be placed somewhere, like Heavy.pl so that
10037     # utf8_heavy can use it directly without duplicating code that can get
10038     # out-of sync.
10039
10040     my $table = shift;
10041     my $alias = shift;
10042     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10043
10044     my $property = $table->property;
10045     $property = ($property == $perl)
10046                 ? ""                # 'perl' is never explicitly stated
10047                 : standardize($property->name) . '=';
10048     if ($alias->loose_match) {
10049         return $property . standardize($alias->name);
10050     }
10051     else {
10052         return lc ($property . $alias->name);
10053     }
10054
10055     return;
10056 }
10057
10058 {   # Closure
10059
10060     my $indent_increment = " " x (($debugging_build) ? 2 : 0);
10061     %main::already_output = ();
10062
10063     $main::simple_dumper_nesting = 0;
10064
10065     sub simple_dumper {
10066         # Like Simple Data::Dumper. Good enough for our needs. We can't use
10067         # the real thing as we have to run under miniperl.
10068
10069         # It is designed so that on input it is at the beginning of a line,
10070         # and the final thing output in any call is a trailing ",\n".
10071
10072         my $item = shift;
10073         my $indent = shift;
10074         $indent = "" if ! $debugging_build || ! defined $indent;
10075
10076         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10077
10078         # nesting level is localized, so that as the call stack pops, it goes
10079         # back to the prior value.
10080         local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
10081         local %main::already_output = %main::already_output;
10082         $main::simple_dumper_nesting++;
10083         #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
10084
10085         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10086
10087         # Determine the indent for recursive calls.
10088         my $next_indent = $indent . $indent_increment;
10089
10090         my $output;
10091         if (! ref $item) {
10092
10093             # Dump of scalar: just output it in quotes if not a number.  To do
10094             # so we must escape certain characters, and therefore need to
10095             # operate on a copy to avoid changing the original
10096             my $copy = $item;
10097             $copy = $UNDEF unless defined $copy;
10098
10099             # Quote non-integers (integers also have optional leading '-')
10100             if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
10101
10102                 # Escape apostrophe and backslash
10103                 $copy =~ s/ ( ['\\] ) /\\$1/xg;
10104                 $copy = "'$copy'";
10105             }
10106             $output = "$indent$copy,\n";
10107         }
10108         else {
10109
10110             # Keep track of cycles in the input, and refuse to infinitely loop
10111             my $addr = do { no overloading; pack 'J', $item; };
10112             if (defined $main::already_output{$addr}) {
10113                 return "${indent}ALREADY OUTPUT: $item\n";
10114             }
10115             $main::already_output{$addr} = $item;
10116
10117             if (ref $item eq 'ARRAY') {
10118                 my $using_brackets;
10119                 $output = $indent;
10120                 if ($main::simple_dumper_nesting > 1) {
10121                     $output .= '[';
10122                     $using_brackets = 1;
10123                 }
10124                 else {
10125                     $using_brackets = 0;
10126                 }
10127
10128                 # If the array is empty, put the closing bracket on the same
10129                 # line.  Otherwise, recursively add each array element
10130                 if (@$item == 0) {
10131                     $output .= " ";
10132                 }
10133                 else {
10134                     $output .= "\n";
10135                     for (my $i = 0; $i < @$item; $i++) {
10136
10137                         # Indent array elements one level
10138                         $output .= &simple_dumper($item->[$i], $next_indent);
10139                         next if ! $debugging_build;
10140                         $output =~ s/\n$//;      # Remove any trailing nl so
10141                         $output .= " # [$i]\n";  # as to add a comment giving
10142                                                  # the array index
10143                     }
10144                     $output .= $indent;     # Indent closing ']' to orig level
10145                 }
10146                 $output .= ']' if $using_brackets;
10147                 $output .= ",\n";
10148             }
10149             elsif (ref $item eq 'HASH') {
10150                 my $is_first_line;
10151                 my $using_braces;
10152                 my $body_indent;
10153
10154                 # No surrounding braces at top level
10155                 $output .= $indent;
10156                 if ($main::simple_dumper_nesting > 1) {
10157                     $output .= "{\n";
10158                     $is_first_line = 0;
10159                     $body_indent = $next_indent;
10160                     $next_indent .= $indent_increment;
10161                     $using_braces = 1;
10162                 }
10163                 else {
10164                     $is_first_line = 1;
10165                     $body_indent = $indent;
10166                     $using_braces = 0;
10167                 }
10168
10169                 # Output hashes sorted alphabetically instead of apparently
10170                 # random.  Use caseless alphabetic sort
10171                 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
10172                 {
10173                     if ($is_first_line) {
10174                         $is_first_line = 0;
10175                     }
10176                     else {
10177                         $output .= "$body_indent";
10178                     }
10179
10180                     # The key must be a scalar, but this recursive call quotes
10181                     # it
10182                     $output .= &simple_dumper($key);
10183
10184                     # And change the trailing comma and nl to the hash fat
10185                     # comma for clarity, and so the value can be on the same
10186                     # line
10187                     $output =~ s/,\n$/ => /;
10188
10189                     # Recursively call to get the value's dump.
10190                     my $next = &simple_dumper($item->{$key}, $next_indent);
10191
10192                     # If the value is all on one line, remove its indent, so
10193                     # will follow the => immediately.  If it takes more than
10194                     # one line, start it on a new line.
10195                     if ($next !~ /\n.*\n/) {
10196                         $next =~ s/^ *//;
10197                     }
10198                     else {
10199                         $output .= "\n";
10200                     }
10201                     $output .= $next;
10202                 }
10203
10204                 $output .= "$indent},\n" if $using_braces;
10205             }
10206             elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
10207                 $output = $indent . ref($item) . "\n";
10208                 # XXX see if blessed
10209             }
10210             elsif ($item->can('dump')) {
10211
10212                 # By convention in this program, objects furnish a 'dump'
10213                 # method.  Since not doing any output at this level, just pass
10214                 # on the input indent
10215                 $output = $item->dump($indent);
10216             }
10217             else {
10218                 Carp::my_carp("Can't cope with dumping a " . ref($item) . ".  Skipping.");
10219             }
10220         }
10221         return $output;
10222     }
10223 }
10224
10225 sub dump_inside_out {
10226     # Dump inside-out hashes in an object's state by converting them to a
10227     # regular hash and then calling simple_dumper on that.
10228
10229     my $object = shift;
10230     my $fields_ref = shift;
10231
10232     my $addr = do { no overloading; pack 'J', $object; };
10233
10234     my %hash;
10235     foreach my $key (keys %$fields_ref) {
10236         $hash{$key} = $fields_ref->{$key}{$addr};
10237     }
10238
10239     return simple_dumper(\%hash, @_);
10240 }
10241
10242 sub _operator_dot {
10243     # Overloaded '.' method that is common to all packages.  It uses the
10244     # package's stringify method.
10245
10246     my $self = shift;
10247     my $other = shift;
10248     my $reversed = shift;
10249     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10250
10251     $other = "" unless defined $other;
10252
10253     foreach my $which (\$self, \$other) {
10254         next unless ref $$which;
10255         if ($$which->can('_operator_stringify')) {
10256             $$which = $$which->_operator_stringify;
10257         }
10258         else {
10259             my $ref = ref $$which;
10260             my $addr = do { no overloading; pack 'J', $$which; };
10261             $$which = "$ref ($addr)";
10262         }
10263     }
10264     return ($reversed)
10265             ? "$other$self"
10266             : "$self$other";
10267 }
10268
10269 sub _operator_dot_equal {
10270     # Overloaded '.=' method that is common to all packages.
10271
10272     my $self = shift;
10273     my $other = shift;
10274     my $reversed = shift;
10275     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10276
10277     $other = "" unless defined $other;
10278
10279     if ($reversed) {
10280         return $other .= "$self";
10281     }
10282     else {
10283         return "$self" . "$other";
10284     }
10285 }
10286
10287 sub _operator_equal {
10288     # Generic overloaded '==' routine.  To be equal, they must be the exact
10289     # same object
10290
10291     my $self = shift;
10292     my $other = shift;
10293
10294     return 0 unless defined $other;
10295     return 0 unless ref $other;
10296     no overloading;
10297     return $self == $other;
10298 }
10299
10300 sub _operator_not_equal {
10301     my $self = shift;
10302     my $other = shift;
10303
10304     return ! _operator_equal($self, $other);
10305 }
10306
10307 sub substitute_PropertyAliases($) {
10308     # Deal with early releases that don't have the crucial PropertyAliases.txt
10309     # file.
10310
10311     my $file_object = shift;
10312     $file_object->insert_lines(get_old_property_aliases());
10313
10314     process_PropertyAliases($file_object);
10315 }
10316
10317
10318 sub process_PropertyAliases($) {
10319     # This reads in the PropertyAliases.txt file, which contains almost all
10320     # the character properties in Unicode and their equivalent aliases:
10321     # scf       ; Simple_Case_Folding         ; sfc
10322     #
10323     # Field 0 is the preferred short name for the property.
10324     # Field 1 is the full name.
10325     # Any succeeding ones are other accepted names.
10326
10327     my $file= shift;
10328     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10329
10330     # Add any cjk properties that may have been defined.
10331     $file->insert_lines(@cjk_properties);
10332
10333     while ($file->next_line) {
10334
10335         my @data = split /\s*;\s*/;
10336
10337         my $full = $data[1];
10338
10339         # This line is defective in early Perls.  The property in Unihan.txt
10340         # is kRSUnicode.
10341         if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
10342             push @data, qw(cjkRSUnicode kRSUnicode);
10343         }
10344
10345         my $this = Property->new($data[0], Full_Name => $full);
10346
10347         $this->set_fate($SUPPRESSED, $why_suppressed{$full})
10348                                                     if $why_suppressed{$full};
10349
10350         # Start looking for more aliases after these two.
10351         for my $i (2 .. @data - 1) {
10352             $this->add_alias($data[$i]);
10353         }
10354
10355     }
10356
10357     my $scf = property_ref("Simple_Case_Folding");
10358     $scf->add_alias("scf");
10359     $scf->add_alias("sfc");
10360
10361     return;
10362 }
10363
10364 sub finish_property_setup {
10365     # Finishes setting up after PropertyAliases.
10366
10367     my $file = shift;
10368     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10369
10370     # This entry was missing from this file in earlier Unicode versions
10371     if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10372         Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10373     }
10374
10375     # These are used so much, that we set globals for them.
10376     $gc = property_ref('General_Category');
10377     $block = property_ref('Block');
10378     $script = property_ref('Script');
10379     $age = property_ref('Age');
10380
10381     # Perl adds this alias.
10382     $gc->add_alias('Category');
10383
10384     # Unicode::Normalize expects this file with this name and directory.
10385     $ccc = property_ref('Canonical_Combining_Class');
10386     if (defined $ccc) {
10387         $ccc->set_file('CombiningClass');
10388         $ccc->set_directory(File::Spec->curdir());
10389     }
10390
10391     # These two properties aren't actually used in the core, but unfortunately
10392     # the names just above that are in the core interfere with these, so
10393     # choose different names.  These aren't a problem unless the map tables
10394     # for these files get written out.
10395     my $lowercase = property_ref('Lowercase');
10396     $lowercase->set_file('IsLower') if defined $lowercase;
10397     my $uppercase = property_ref('Uppercase');
10398     $uppercase->set_file('IsUpper') if defined $uppercase;
10399
10400     # Set up the hard-coded default mappings, but only on properties defined
10401     # for this release
10402     foreach my $property (keys %default_mapping) {
10403         my $property_object = property_ref($property);
10404         next if ! defined $property_object;
10405         my $default_map = $default_mapping{$property};
10406         $property_object->set_default_map($default_map);
10407
10408         # A map of <code point> implies the property is string.
10409         if ($property_object->type == $UNKNOWN
10410             && $default_map eq $CODE_POINT)
10411         {
10412             $property_object->set_type($STRING);
10413         }
10414     }
10415
10416     # The following use the Multi_Default class to create objects for
10417     # defaults.
10418
10419     # Bidi class has a complicated default, but the derived file takes care of
10420     # the complications, leaving just 'L'.
10421     if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10422         property_ref('Bidi_Class')->set_default_map('L');
10423     }
10424     else {
10425         my $default;
10426
10427         # The derived file was introduced in 3.1.1.  The values below are
10428         # taken from table 3-8, TUS 3.0
10429         my $default_R =
10430             'my $default = Range_List->new;
10431              $default->add_range(0x0590, 0x05FF);
10432              $default->add_range(0xFB1D, 0xFB4F);'
10433         ;
10434
10435         # The defaults apply only to unassigned characters
10436         $default_R .= '$gc->table("Unassigned") & $default;';
10437
10438         if ($v_version lt v3.0.0) {
10439             $default = Multi_Default->new(R => $default_R, 'L');
10440         }
10441         else {
10442
10443             # AL apparently not introduced until 3.0:  TUS 2.x references are
10444             # not on-line to check it out
10445             my $default_AL =
10446                 'my $default = Range_List->new;
10447                  $default->add_range(0x0600, 0x07BF);
10448                  $default->add_range(0xFB50, 0xFDFF);
10449                  $default->add_range(0xFE70, 0xFEFF);'
10450             ;
10451
10452             # Non-character code points introduced in this release; aren't AL
10453             if ($v_version ge 3.1.0) {
10454                 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10455             }
10456             $default_AL .= '$gc->table("Unassigned") & $default';
10457             $default = Multi_Default->new(AL => $default_AL,
10458                                           R => $default_R,
10459                                           'L');
10460         }
10461         property_ref('Bidi_Class')->set_default_map($default);
10462     }
10463
10464     # Joining type has a complicated default, but the derived file takes care
10465     # of the complications, leaving just 'U' (or Non_Joining), except the file
10466     # is bad in 3.1.0
10467     if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10468         if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10469             property_ref('Joining_Type')->set_default_map('Non_Joining');
10470         }
10471         else {
10472
10473             # Otherwise, there are not one, but two possibilities for the
10474             # missing defaults: T and U.
10475             # The missing defaults that evaluate to T are given by:
10476             # T = Mn + Cf - ZWNJ - ZWJ
10477             # where Mn and Cf are the general category values. In other words,
10478             # any non-spacing mark or any format control character, except
10479             # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10480             # WIDTH JOINER (joining type C).
10481             my $default = Multi_Default->new(
10482                'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10483                'Non_Joining');
10484             property_ref('Joining_Type')->set_default_map($default);
10485         }
10486     }
10487
10488     # Line break has a complicated default in early releases. It is 'Unknown'
10489     # for non-assigned code points; 'AL' for assigned.
10490     if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10491         my $lb = property_ref('Line_Break');
10492         if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10493             $lb->set_default_map('Unknown');
10494         }
10495         else {
10496             my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10497                                              'Unknown',
10498                                             );
10499             $lb->set_default_map($default);
10500         }
10501     }
10502
10503     # For backwards compatibility with applications that may read the mapping
10504     # file directly (it was documented in 5.12 and 5.14 as being thusly
10505     # usable), keep it from being adjusted.  (range_size_1 is
10506     # used to force the traditional format.)
10507     if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10508         $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10509         $nfkc_cf->set_range_size_1(1);
10510     }
10511     if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10512         $bmg->set_to_output_map($EXTERNAL_MAP);
10513         $bmg->set_range_size_1(1);
10514     }
10515
10516     property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10517
10518     return;
10519 }
10520
10521 sub get_old_property_aliases() {
10522     # Returns what would be in PropertyAliases.txt if it existed in very old
10523     # versions of Unicode.  It was derived from the one in 3.2, and pared
10524     # down based on the data that was actually in the older releases.
10525     # An attempt was made to use the existence of files to mean inclusion or
10526     # not of various aliases, but if this was not sufficient, using version
10527     # numbers was resorted to.
10528
10529     my @return;
10530
10531     # These are to be used in all versions (though some are constructed by
10532     # this program if missing)
10533     push @return, split /\n/, <<'END';
10534 bc        ; Bidi_Class
10535 Bidi_M    ; Bidi_Mirrored
10536 cf        ; Case_Folding
10537 ccc       ; Canonical_Combining_Class
10538 dm        ; Decomposition_Mapping
10539 dt        ; Decomposition_Type
10540 gc        ; General_Category
10541 isc       ; ISO_Comment
10542 lc        ; Lowercase_Mapping
10543 na        ; Name
10544 na1       ; Unicode_1_Name
10545 nt        ; Numeric_Type
10546 nv        ; Numeric_Value
10547 scf       ; Simple_Case_Folding
10548 slc       ; Simple_Lowercase_Mapping
10549 stc       ; Simple_Titlecase_Mapping
10550 suc       ; Simple_Uppercase_Mapping
10551 tc        ; Titlecase_Mapping
10552 uc        ; Uppercase_Mapping
10553 END
10554
10555     if (-e 'Blocks.txt') {
10556         push @return, "blk       ; Block\n";
10557     }
10558     if (-e 'ArabicShaping.txt') {
10559         push @return, split /\n/, <<'END';
10560 jg        ; Joining_Group
10561 jt        ; Joining_Type
10562 END
10563     }
10564     if (-e 'PropList.txt') {
10565
10566         # This first set is in the original old-style proplist.
10567         push @return, split /\n/, <<'END';
10568 Bidi_C    ; Bidi_Control
10569 Dash      ; Dash
10570 Dia       ; Diacritic
10571 Ext       ; Extender
10572 Hex       ; Hex_Digit
10573 Hyphen    ; Hyphen
10574 IDC       ; ID_Continue
10575 Ideo      ; Ideographic
10576 Join_C    ; Join_Control
10577 Math      ; Math
10578 QMark     ; Quotation_Mark
10579 Term      ; Terminal_Punctuation
10580 WSpace    ; White_Space
10581 END
10582         # The next sets were added later
10583         if ($v_version ge v3.0.0) {
10584             push @return, split /\n/, <<'END';
10585 Upper     ; Uppercase
10586 Lower     ; Lowercase
10587 END
10588         }
10589         if ($v_version ge v3.0.1) {
10590             push @return, split /\n/, <<'END';
10591 NChar     ; Noncharacter_Code_Point
10592 END
10593         }
10594         # The next sets were added in the new-style
10595         if ($v_version ge v3.1.0) {
10596             push @return, split /\n/, <<'END';
10597 OAlpha    ; Other_Alphabetic
10598 OLower    ; Other_Lowercase
10599 OMath     ; Other_Math
10600 OUpper    ; Other_Uppercase
10601 END
10602         }
10603         if ($v_version ge v3.1.1) {
10604             push @return, "AHex      ; ASCII_Hex_Digit\n";
10605         }
10606     }
10607     if (-e 'EastAsianWidth.txt') {
10608         push @return, "ea        ; East_Asian_Width\n";
10609     }
10610     if (-e 'CompositionExclusions.txt') {
10611         push @return, "CE        ; Composition_Exclusion\n";
10612     }
10613     if (-e 'LineBreak.txt') {
10614         push @return, "lb        ; Line_Break\n";
10615     }
10616     if (-e 'BidiMirroring.txt') {
10617         push @return, "bmg       ; Bidi_Mirroring_Glyph\n";
10618     }
10619     if (-e 'Scripts.txt') {
10620         push @return, "sc        ; Script\n";
10621     }
10622     if (-e 'DNormalizationProps.txt') {
10623         push @return, split /\n/, <<'END';
10624 Comp_Ex   ; Full_Composition_Exclusion
10625 FC_NFKC   ; FC_NFKC_Closure
10626 NFC_QC    ; NFC_Quick_Check
10627 NFD_QC    ; NFD_Quick_Check
10628 NFKC_QC   ; NFKC_Quick_Check
10629 NFKD_QC   ; NFKD_Quick_Check
10630 XO_NFC    ; Expands_On_NFC
10631 XO_NFD    ; Expands_On_NFD
10632 XO_NFKC   ; Expands_On_NFKC
10633 XO_NFKD   ; Expands_On_NFKD
10634 END
10635     }
10636     if (-e 'DCoreProperties.txt') {
10637         push @return, split /\n/, <<'END';
10638 Alpha     ; Alphabetic
10639 IDS       ; ID_Start
10640 XIDC      ; XID_Continue
10641 XIDS      ; XID_Start
10642 END
10643         # These can also appear in some versions of PropList.txt
10644         push @return, "Lower     ; Lowercase\n"
10645                                     unless grep { $_ =~ /^Lower\b/} @return;
10646         push @return, "Upper     ; Uppercase\n"
10647                                     unless grep { $_ =~ /^Upper\b/} @return;
10648     }
10649
10650     # This flag requires the DAge.txt file to be copied into the directory.
10651     if (DEBUG && $compare_versions) {
10652         push @return, 'age       ; Age';
10653     }
10654
10655     return @return;
10656 }
10657
10658 sub substitute_PropValueAliases($) {
10659     # Deal with early releases that don't have the crucial
10660     # PropValueAliases.txt file.
10661
10662     my $file_object = shift;
10663     $file_object->insert_lines(get_old_property_value_aliases());
10664
10665     process_PropValueAliases($file_object);
10666 }
10667
10668 sub process_PropValueAliases {
10669     # This file contains values that properties look like:
10670     # bc ; AL        ; Arabic_Letter
10671     # blk; n/a       ; Greek_And_Coptic                 ; Greek
10672     #
10673     # Field 0 is the property.
10674     # Field 1 is the short name of a property value or 'n/a' if no
10675     #                short name exists;
10676     # Field 2 is the full property value name;
10677     # Any other fields are more synonyms for the property value.
10678     # Purely numeric property values are omitted from the file; as are some
10679     # others, fewer and fewer in later releases
10680
10681     # Entries for the ccc property have an extra field before the
10682     # abbreviation:
10683     # ccc;   0; NR   ; Not_Reordered
10684     # It is the numeric value that the names are synonyms for.
10685
10686     # There are comment entries for values missing from this file:
10687     # # @missing: 0000..10FFFF; ISO_Comment; <none>
10688     # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10689
10690     my $file= shift;
10691     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10692
10693     if ($v_version lt 4.0.0) {
10694         $file->insert_lines(split /\n/, <<'END'
10695 Hangul_Syllable_Type; L                                ; Leading_Jamo
10696 Hangul_Syllable_Type; LV                               ; LV_Syllable
10697 Hangul_Syllable_Type; LVT                              ; LVT_Syllable
10698 Hangul_Syllable_Type; NA                               ; Not_Applicable
10699 Hangul_Syllable_Type; T                                ; Trailing_Jamo
10700 Hangul_Syllable_Type; V                                ; Vowel_Jamo
10701 END
10702         );
10703     }
10704     if ($v_version lt 4.1.0) {
10705         $file->insert_lines(split /\n/, <<'END'
10706 _Perl_GCB; CN                               ; Control
10707 _Perl_GCB; CR                               ; CR
10708 _Perl_GCB; EX                               ; Extend
10709 _Perl_GCB; L                                ; L
10710 _Perl_GCB; LF                               ; LF
10711 _Perl_GCB; LV                               ; LV
10712 _Perl_GCB; LVT                              ; LVT
10713 _Perl_GCB; T                                ; T
10714 _Perl_GCB; V                                ; V
10715 _Perl_GCB; XX                               ; Other
10716 END
10717         );
10718     }
10719
10720
10721     # Add any explicit cjk values
10722     $file->insert_lines(@cjk_property_values);
10723
10724     # This line is used only for testing the code that checks for name
10725     # conflicts.  There is a script Inherited, and when this line is executed
10726     # it causes there to be a name conflict with the 'Inherited' that this
10727     # program generates for this block property value
10728     #$file->insert_lines('blk; n/a; Herited');
10729
10730     # Process each line of the file ...
10731     while ($file->next_line) {
10732
10733         # Fix typo in input file
10734         s/CCC133/CCC132/g if $v_version eq v6.1.0;
10735
10736         my ($property, @data) = split /\s*;\s*/;
10737
10738         # The ccc property has an extra field at the beginning, which is the
10739         # numeric value.  Move it to be after the other two, mnemonic, fields,
10740         # so that those will be used as the property value's names, and the
10741         # number will be an extra alias.  (Rightmost splice removes field 1-2,
10742         # returning them in a slice; left splice inserts that before anything,
10743         # thus shifting the former field 0 to after them.)
10744         splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10745
10746         if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10747             my $new_style = $data[1] =~ s/-/_/gr;
10748             splice @data, 1, 0, $new_style;
10749         }
10750
10751         # Field 0 is a short name unless "n/a"; field 1 is the full name.  If
10752         # there is no short name, use the full one in element 1
10753         if ($data[0] eq "n/a") {
10754             $data[0] = $data[1];
10755         }
10756         elsif ($data[0] ne $data[1]
10757                && standardize($data[0]) eq standardize($data[1])
10758                && $data[1] !~ /[[:upper:]]/)
10759         {
10760             # Also, there is a bug in the file in which "n/a" is omitted, and
10761             # the two fields are identical except for case, and the full name
10762             # is all lower case.  Copy the "short" name unto the full one to
10763             # give it some upper case.
10764
10765             $data[1] = $data[0];
10766         }
10767
10768         # Earlier releases had the pseudo property 'qc' that should expand to
10769         # the ones that replace it below.
10770         if ($property eq 'qc') {
10771             if (lc $data[0] eq 'y') {
10772                 $file->insert_lines('NFC_QC; Y      ; Yes',
10773                                     'NFD_QC; Y      ; Yes',
10774                                     'NFKC_QC; Y     ; Yes',
10775                                     'NFKD_QC; Y     ; Yes',
10776                                     );
10777             }
10778             elsif (lc $data[0] eq 'n') {
10779                 $file->insert_lines('NFC_QC; N      ; No',
10780                                     'NFD_QC; N      ; No',
10781                                     'NFKC_QC; N     ; No',
10782                                     'NFKD_QC; N     ; No',
10783                                     );
10784             }
10785             elsif (lc $data[0] eq 'm') {
10786                 $file->insert_lines('NFC_QC; M      ; Maybe',
10787                                     'NFKC_QC; M     ; Maybe',
10788                                     );
10789             }
10790             else {
10791                 $file->carp_bad_line("qc followed by unexpected '$data[0]");
10792             }
10793             next;
10794         }
10795
10796         # The first field is the short name, 2nd is the full one.
10797         my $property_object = property_ref($property);
10798         my $table = $property_object->add_match_table($data[0],
10799                                                 Full_Name => $data[1]);
10800
10801         # Start looking for more aliases after these two.
10802         for my $i (2 .. @data - 1) {
10803             $table->add_alias($data[$i]);
10804         }
10805     } # End of looping through the file
10806
10807     # As noted in the comments early in the program, it generates tables for
10808     # the default values for all releases, even those for which the concept
10809     # didn't exist at the time.  Here we add those if missing.
10810     if (defined $age && ! defined $age->table('Unassigned')) {
10811         $age->add_match_table('Unassigned');
10812     }
10813     $block->add_match_table('No_Block') if -e 'Blocks.txt'
10814                                     && ! defined $block->table('No_Block');
10815
10816
10817     # Now set the default mappings of the properties from the file.  This is
10818     # done after the loop because a number of properties have only @missings
10819     # entries in the file, and may not show up until the end.
10820     my @defaults = $file->get_missings;
10821     foreach my $default_ref (@defaults) {
10822         my $default = $default_ref->[0];
10823         my $property = property_ref($default_ref->[1]);
10824         $property->set_default_map($default);
10825     }
10826     return;
10827 }
10828
10829 sub get_old_property_value_aliases () {
10830     # Returns what would be in PropValueAliases.txt if it existed in very old
10831     # versions of Unicode.  It was derived from the one in 3.2, and pared
10832     # down.  An attempt was made to use the existence of files to mean
10833     # inclusion or not of various aliases, but if this was not sufficient,
10834     # using version numbers was resorted to.
10835
10836     my @return = split /\n/, <<'END';
10837 bc ; AN        ; Arabic_Number
10838 bc ; B         ; Paragraph_Separator
10839 bc ; CS        ; Common_Separator
10840 bc ; EN        ; European_Number
10841 bc ; ES        ; European_Separator
10842 bc ; ET        ; European_Terminator
10843 bc ; L         ; Left_To_Right
10844 bc ; ON        ; Other_Neutral
10845 bc ; R         ; Right_To_Left
10846 bc ; WS        ; White_Space
10847
10848 Bidi_M; N; No; F; False
10849 Bidi_M; Y; Yes; T; True
10850
10851 # The standard combining classes are very much different in v1, so only use
10852 # ones that look right (not checked thoroughly)
10853 ccc;   0; NR   ; Not_Reordered
10854 ccc;   1; OV   ; Overlay
10855 ccc;   7; NK   ; Nukta
10856 ccc;   8; KV   ; Kana_Voicing
10857 ccc;   9; VR   ; Virama
10858 ccc; 202; ATBL ; Attached_Below_Left
10859 ccc; 216; ATAR ; Attached_Above_Right
10860 ccc; 218; BL   ; Below_Left
10861 ccc; 220; B    ; Below
10862 ccc; 222; BR   ; Below_Right
10863 ccc; 224; L    ; Left
10864 ccc; 228; AL   ; Above_Left
10865 ccc; 230; A    ; Above
10866 ccc; 232; AR   ; Above_Right
10867 ccc; 234; DA   ; Double_Above
10868
10869 dt ; can       ; canonical
10870 dt ; enc       ; circle
10871 dt ; fin       ; final
10872 dt ; font      ; font
10873 dt ; fra       ; fraction
10874 dt ; init      ; initial
10875 dt ; iso       ; isolated
10876 dt ; med       ; medial
10877 dt ; n/a       ; none
10878 dt ; nb        ; noBreak
10879 dt ; sqr       ; square
10880 dt ; sub       ; sub
10881 dt ; sup       ; super
10882
10883 gc ; C         ; Other                            # Cc | Cf | Cn | Co | Cs
10884 gc ; Cc        ; Control
10885 gc ; Cn        ; Unassigned
10886 gc ; Co        ; Private_Use
10887 gc ; L         ; Letter                           # Ll | Lm | Lo | Lt | Lu
10888 gc ; LC        ; Cased_Letter                     # Ll | Lt | Lu
10889 gc ; Ll        ; Lowercase_Letter
10890 gc ; Lm        ; Modifier_Letter
10891 gc ; Lo        ; Other_Letter
10892 gc ; Lu        ; Uppercase_Letter
10893 gc ; M         ; Mark                             # Mc | Me | Mn
10894 gc ; Mc        ; Spacing_Mark
10895 gc ; Mn        ; Nonspacing_Mark
10896 gc ; N         ; Number                           # Nd | Nl | No
10897 gc ; Nd        ; Decimal_Number
10898 gc ; No        ; Other_Number
10899 gc ; P         ; Punctuation                      # Pc | Pd | Pe | Pf | Pi | Po | Ps
10900 gc ; Pd        ; Dash_Punctuation
10901 gc ; Pe        ; Close_Punctuation
10902 gc ; Po        ; Other_Punctuation
10903 gc ; Ps        ; Open_Punctuation
10904 gc ; S         ; Symbol                           # Sc | Sk | Sm | So
10905 gc ; Sc        ; Currency_Symbol
10906 gc ; Sm        ; Math_Symbol
10907 gc ; So        ; Other_Symbol
10908 gc ; Z         ; Separator                        # Zl | Zp | Zs
10909 gc ; Zl        ; Line_Separator
10910 gc ; Zp        ; Paragraph_Separator
10911 gc ; Zs        ; Space_Separator
10912
10913 nt ; de        ; Decimal
10914 nt ; di        ; Digit
10915 nt ; n/a       ; None
10916 nt ; nu        ; Numeric
10917 END
10918
10919     if (-e 'ArabicShaping.txt') {
10920         push @return, split /\n/, <<'END';
10921 jg ; n/a       ; AIN
10922 jg ; n/a       ; ALEF
10923 jg ; n/a       ; DAL
10924 jg ; n/a       ; GAF
10925 jg ; n/a       ; LAM
10926 jg ; n/a       ; MEEM
10927 jg ; n/a       ; NO_JOINING_GROUP
10928 jg ; n/a       ; NOON
10929 jg ; n/a       ; QAF
10930 jg ; n/a       ; SAD
10931 jg ; n/a       ; SEEN
10932 jg ; n/a       ; TAH
10933 jg ; n/a       ; WAW
10934
10935 jt ; C         ; Join_Causing
10936 jt ; D         ; Dual_Joining
10937 jt ; L         ; Left_Joining
10938 jt ; R         ; Right_Joining
10939 jt ; U         ; Non_Joining
10940 jt ; T         ; Transparent
10941 END
10942         if ($v_version ge v3.0.0) {
10943             push @return, split /\n/, <<'END';
10944 jg ; n/a       ; ALAPH
10945 jg ; n/a       ; BEH
10946 jg ; n/a       ; BETH
10947 jg ; n/a       ; DALATH_RISH
10948 jg ; n/a       ; E
10949 jg ; n/a       ; FEH
10950 jg ; n/a       ; FINAL_SEMKATH
10951 jg ; n/a       ; GAMAL
10952 jg ; n/a       ; HAH
10953 jg ; n/a       ; HAMZA_ON_HEH_GOAL
10954 jg ; n/a       ; HE
10955 jg ; n/a       ; HEH
10956 jg ; n/a       ; HEH_GOAL
10957 jg ; n/a       ; HETH
10958 jg ; n/a       ; KAF
10959 jg ; n/a       ; KAPH
10960 jg ; n/a       ; KNOTTED_HEH
10961 jg ; n/a       ; LAMADH
10962 jg ; n/a       ; MIM
10963 jg ; n/a       ; NUN
10964 jg ; n/a       ; PE
10965 jg ; n/a       ; QAPH
10966 jg ; n/a       ; REH
10967 jg ; n/a       ; REVERSED_PE
10968 jg ; n/a       ; SADHE
10969 jg ; n/a       ; SEMKATH
10970 jg ; n/a       ; SHIN
10971 jg ; n/a       ; SWASH_KAF
10972 jg ; n/a       ; TAW
10973 jg ; n/a       ; TEH_MARBUTA
10974 jg ; n/a       ; TETH
10975 jg ; n/a       ; YEH
10976 jg ; n/a       ; YEH_BARREE
10977 jg ; n/a       ; YEH_WITH_TAIL
10978 jg ; n/a       ; YUDH
10979 jg ; n/a       ; YUDH_HE
10980 jg ; n/a       ; ZAIN
10981 END
10982         }
10983     }
10984
10985
10986     if (-e 'EastAsianWidth.txt') {
10987         push @return, split /\n/, <<'END';
10988 ea ; A         ; Ambiguous
10989 ea ; F         ; Fullwidth
10990 ea ; H         ; Halfwidth
10991 ea ; N         ; Neutral
10992 ea ; Na        ; Narrow
10993 ea ; W         ; Wide
10994 END
10995     }
10996
10997     if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10998         my @lb = split /\n/, <<'END';
10999 lb ; AI        ; Ambiguous
11000 lb ; AL        ; Alphabetic
11001 lb ; B2        ; Break_Both
11002 lb ; BA        ; Break_After
11003 lb ; BB        ; Break_Before
11004 lb ; BK        ; Mandatory_Break
11005 lb ; CB        ; Contingent_Break
11006 lb ; CL        ; Close_Punctuation
11007 lb ; CM        ; Combining_Mark
11008 lb ; CR        ; Carriage_Return
11009 lb ; EX        ; Exclamation
11010 lb ; GL        ; Glue
11011 lb ; HY        ; Hyphen
11012 lb ; ID        ; Ideographic
11013 lb ; IN        ; Inseperable
11014 lb ; IS        ; Infix_Numeric
11015 lb ; LF        ; Line_Feed
11016 lb ; NS        ; Nonstarter
11017 lb ; NU        ; Numeric
11018 lb ; OP        ; Open_Punctuation
11019 lb ; PO        ; Postfix_Numeric
11020 lb ; PR        ; Prefix_Numeric
11021 lb ; QU        ; Quotation
11022 lb ; SA        ; Complex_Context
11023 lb ; SG        ; Surrogate
11024 lb ; SP        ; Space
11025 lb ; SY        ; Break_Symbols
11026 lb ; XX        ; Unknown
11027 lb ; ZW        ; ZWSpace
11028 END
11029         # If this Unicode version predates the lb property, we use our
11030         # substitute one
11031         if (-e 'LBsubst.txt') {
11032             $_ = s/^lb/_Perl_LB/r for @lb;
11033         }
11034         push @return, @lb;
11035     }
11036
11037     if (-e 'DNormalizationProps.txt') {
11038         push @return, split /\n/, <<'END';
11039 qc ; M         ; Maybe
11040 qc ; N         ; No
11041 qc ; Y         ; Yes
11042 END
11043     }
11044
11045     if (-e 'Scripts.txt') {
11046         push @return, split /\n/, <<'END';
11047 sc ; Arab      ; Arabic
11048 sc ; Armn      ; Armenian
11049 sc ; Beng      ; Bengali
11050 sc ; Bopo      ; Bopomofo
11051 sc ; Cans      ; Canadian_Aboriginal
11052 sc ; Cher      ; Cherokee
11053 sc ; Cyrl      ; Cyrillic
11054 sc ; Deva      ; Devanagari
11055 sc ; Dsrt      ; Deseret
11056 sc ; Ethi      ; Ethiopic
11057 sc ; Geor      ; Georgian
11058 sc ; Goth      ; Gothic
11059 sc ; Grek      ; Greek
11060 sc ; Gujr      ; Gujarati
11061 sc ; Guru      ; Gurmukhi
11062 sc ; Hang      ; Hangul
11063 sc ; Hani      ; Han
11064 sc ; Hebr      ; Hebrew
11065 sc ; Hira      ; Hiragana
11066 sc ; Ital      ; Old_Italic
11067 sc ; Kana      ; Katakana
11068 sc ; Khmr      ; Khmer
11069 sc ; Knda      ; Kannada
11070 sc ; Laoo      ; Lao
11071 sc ; Latn      ; Latin
11072 sc ; Mlym      ; Malayalam
11073 sc ; Mong      ; Mongolian
11074 sc ; Mymr      ; Myanmar
11075 sc ; Ogam      ; Ogham
11076 sc ; Orya      ; Oriya
11077 sc ; Qaai      ; Inherited
11078 sc ; Runr      ; Runic
11079 sc ; Sinh      ; Sinhala
11080 sc ; Syrc      ; Syriac
11081 sc ; Taml      ; Tamil
11082 sc ; Telu      ; Telugu
11083 sc ; Thaa      ; Thaana
11084 sc ; Thai      ; Thai
11085 sc ; Tibt      ; Tibetan
11086 sc ; Yiii      ; Yi
11087 sc ; Zyyy      ; Common
11088 END
11089     }
11090
11091     if ($v_version ge v2.0.0) {
11092         push @return, split /\n/, <<'END';
11093 dt ; com       ; compat
11094 dt ; nar       ; narrow
11095 dt ; sml       ; small
11096 dt ; vert      ; vertical
11097 dt ; wide      ; wide
11098
11099 gc ; Cf        ; Format
11100 gc ; Cs        ; Surrogate
11101 gc ; Lt        ; Titlecase_Letter
11102 gc ; Me        ; Enclosing_Mark
11103 gc ; Nl        ; Letter_Number
11104 gc ; Pc        ; Connector_Punctuation
11105 gc ; Sk        ; Modifier_Symbol
11106 END
11107     }
11108     if ($v_version ge v2.1.2) {
11109         push @return, "bc ; S         ; Segment_Separator\n";
11110     }
11111     if ($v_version ge v2.1.5) {
11112         push @return, split /\n/, <<'END';
11113 gc ; Pf        ; Final_Punctuation
11114 gc ; Pi        ; Initial_Punctuation
11115 END
11116     }
11117     if ($v_version ge v2.1.8) {
11118         push @return, "ccc; 240; IS   ; Iota_Subscript\n";
11119     }
11120
11121     if ($v_version ge v3.0.0) {
11122         push @return, split /\n/, <<'END';
11123 bc ; AL        ; Arabic_Letter
11124 bc ; BN        ; Boundary_Neutral
11125 bc ; LRE       ; Left_To_Right_Embedding
11126 bc ; LRO       ; Left_To_Right_Override
11127 bc ; NSM       ; Nonspacing_Mark
11128 bc ; PDF       ; Pop_Directional_Format
11129 bc ; RLE       ; Right_To_Left_Embedding
11130 bc ; RLO       ; Right_To_Left_Override
11131
11132 ccc; 233; DB   ; Double_Below
11133 END
11134     }
11135
11136     if ($v_version ge v3.1.0) {
11137         push @return, "ccc; 226; R    ; Right\n";
11138     }
11139
11140     return @return;
11141 }
11142
11143 sub process_NormalizationsTest {
11144
11145     # Each line looks like:
11146     #      source code point; NFC; NFD; NFKC; NFKD
11147     # e.g.
11148     #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
11149
11150     my $file= shift;
11151     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11152
11153     # Process each line of the file ...
11154     while ($file->next_line) {
11155
11156         next if /^@/;
11157
11158         my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
11159
11160         foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
11161             $$var = pack "U0U*", map { hex } split " ", $$var;
11162             $$var =~ s/(\\)/$1$1/g;
11163         }
11164
11165         push @normalization_tests,
11166                 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
11167     } # End of looping through the file
11168 }
11169
11170 sub output_perl_charnames_line ($$) {
11171
11172     # Output the entries in Perl_charnames specially, using 5 digits instead
11173     # of four.  This makes the entries a constant length, and simplifies
11174     # charnames.pm which this table is for.  Unicode can have 6 digit
11175     # ordinals, but they are all private use or noncharacters which do not
11176     # have names, so won't be in this table.
11177
11178     return sprintf "%05X\t%s\n", $_[0], $_[1];
11179 }
11180
11181 { # Closure
11182
11183     # These are constants to the $property_info hash in this subroutine, to
11184     # avoid using a quoted-string which might have a typo.
11185     my $TYPE  = 'type';
11186     my $DEFAULT_MAP = 'default_map';
11187     my $DEFAULT_TABLE = 'default_table';
11188     my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
11189     my $MISSINGS = 'missings';
11190
11191     sub process_generic_property_file {
11192         # This processes a file containing property mappings and puts them
11193         # into internal map tables.  It should be used to handle any property
11194         # files that have mappings from a code point or range thereof to
11195         # something else.  This means almost all the UCD .txt files.
11196         # each_line_handlers() should be set to adjust the lines of these
11197         # files, if necessary, to what this routine understands:
11198         #
11199         # 0374          ; NFD_QC; N
11200         # 003C..003E    ; Math
11201         #
11202         # the fields are: "codepoint-range ; property; map"
11203         #
11204         # meaning the codepoints in the range all have the value 'map' under
11205         # 'property'.
11206         # Beginning and trailing white space in each field are not significant.
11207         # Note there is not a trailing semi-colon in the above.  A trailing
11208         # semi-colon means the map is a null-string.  An omitted map, as
11209         # opposed to a null-string, is assumed to be 'Y', based on Unicode
11210         # table syntax.  (This could have been hidden from this routine by
11211         # doing it in the $file object, but that would require parsing of the
11212         # line there, so would have to parse it twice, or change the interface
11213         # to pass this an array.  So not done.)
11214         #
11215         # The map field may begin with a sequence of commands that apply to
11216         # this range.  Each such command begins and ends with $CMD_DELIM.
11217         # These are used to indicate, for example, that the mapping for a
11218         # range has a non-default type.
11219         #
11220         # This loops through the file, calling its next_line() method, and
11221         # then taking the map and adding it to the property's table.
11222         # Complications arise because any number of properties can be in the
11223         # file, in any order, interspersed in any way.  The first time a
11224         # property is seen, it gets information about that property and
11225         # caches it for quick retrieval later.  It also normalizes the maps
11226         # so that only one of many synonyms is stored.  The Unicode input
11227         # files do use some multiple synonyms.
11228
11229         my $file = shift;
11230         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11231
11232         my %property_info;               # To keep track of what properties
11233                                          # have already had entries in the
11234                                          # current file, and info about each,
11235                                          # so don't have to recompute.
11236         my $property_name;               # property currently being worked on
11237         my $property_type;               # and its type
11238         my $previous_property_name = ""; # name from last time through loop
11239         my $property_object;             # pointer to the current property's
11240                                          # object
11241         my $property_addr;               # the address of that object
11242         my $default_map;                 # the string that code points missing
11243                                          # from the file map to
11244         my $default_table;               # For non-string properties, a
11245                                          # reference to the match table that
11246                                          # will contain the list of code
11247                                          # points that map to $default_map.
11248
11249         # Get the next real non-comment line
11250         LINE:
11251         while ($file->next_line) {
11252
11253             # Default replacement type; means that if parts of the range have
11254             # already been stored in our tables, the new map overrides them if
11255             # they differ more than cosmetically
11256             my $replace = $IF_NOT_EQUIVALENT;
11257             my $map_type;            # Default type for the map of this range
11258
11259             #local $to_trace = 1 if main::DEBUG;
11260             trace $_ if main::DEBUG && $to_trace;
11261
11262             # Split the line into components
11263             my ($range, $property_name, $map, @remainder)
11264                 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11265
11266             # If more or less on the line than we are expecting, warn and skip
11267             # the line
11268             if (@remainder) {
11269                 $file->carp_bad_line('Extra fields');
11270                 next LINE;
11271             }
11272             elsif ( ! defined $property_name) {
11273                 $file->carp_bad_line('Missing property');
11274                 next LINE;
11275             }
11276
11277             # Examine the range.
11278             if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
11279             {
11280                 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
11281                 next LINE;
11282             }
11283             my $low = hex $1;
11284             my $high = (defined $2) ? hex $2 : $low;
11285
11286             # If changing to a new property, get the things constant per
11287             # property
11288             if ($previous_property_name ne $property_name) {
11289
11290                 $property_object = property_ref($property_name);
11291                 if (! defined $property_object) {
11292                     $file->carp_bad_line("Unexpected property '$property_name'.  Skipped");
11293                     next LINE;
11294                 }
11295                 { no overloading; $property_addr = pack 'J', $property_object; }
11296
11297                 # Defer changing names until have a line that is acceptable
11298                 # (the 'next' statement above means is unacceptable)
11299                 $previous_property_name = $property_name;
11300
11301                 # If not the first time for this property, retrieve info about
11302                 # it from the cache
11303                 if (defined ($property_info{$property_addr}{$TYPE})) {
11304                     $property_type = $property_info{$property_addr}{$TYPE};
11305                     $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
11306                     $map_type
11307                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
11308                     $default_table
11309                             = $property_info{$property_addr}{$DEFAULT_TABLE};
11310                 }
11311                 else {
11312
11313                     # Here, is the first time for this property.  Set up the
11314                     # cache.
11315                     $property_type = $property_info{$property_addr}{$TYPE}
11316                                    = $property_object->type;
11317                     $map_type
11318                         = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
11319                         = $property_object->pseudo_map_type;
11320
11321                     # The Unicode files are set up so that if the map is not
11322                     # defined, it is a binary property
11323                     if (! defined $map && $property_type != $BINARY) {
11324                         if ($property_type != $UNKNOWN
11325                             && $property_type != $NON_STRING)
11326                         {
11327                             $file->carp_bad_line("No mapping defined on a non-binary property.  Using 'Y' for the map");
11328                         }
11329                         else {
11330                             $property_object->set_type($BINARY);
11331                             $property_type
11332                                 = $property_info{$property_addr}{$TYPE}
11333                                 = $BINARY;
11334                         }
11335                     }
11336
11337                     # Get any @missings default for this property.  This
11338                     # should precede the first entry for the property in the
11339                     # input file, and is located in a comment that has been
11340                     # stored by the Input_file class until we access it here.
11341                     # It's possible that there is more than one such line
11342                     # waiting for us; collect them all, and parse
11343                     my @missings_list = $file->get_missings
11344                                             if $file->has_missings_defaults;
11345                     foreach my $default_ref (@missings_list) {
11346                         my $default = $default_ref->[0];
11347                         my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
11348
11349                         # For string properties, the default is just what the
11350                         # file says, but non-string properties should already
11351                         # have set up a table for the default property value;
11352                         # use the table for these, so can resolve synonyms
11353                         # later to a single standard one.
11354                         if ($property_type == $STRING
11355                             || $property_type == $UNKNOWN)
11356                         {
11357                             $property_info{$addr}{$MISSINGS} = $default;
11358                         }
11359                         else {
11360                             $property_info{$addr}{$MISSINGS}
11361                                         = $property_object->table($default);
11362                         }
11363                     }
11364
11365                     # Finished storing all the @missings defaults in the input
11366                     # file so far.  Get the one for the current property.
11367                     my $missings = $property_info{$property_addr}{$MISSINGS};
11368
11369                     # But we likely have separately stored what the default
11370                     # should be.  (This is to accommodate versions of the
11371                     # standard where the @missings lines are absent or
11372                     # incomplete.)  Hopefully the two will match.  But check
11373                     # it out.
11374                     $default_map = $property_object->default_map;
11375
11376                     # If the map is a ref, it means that the default won't be
11377                     # processed until later, so undef it, so next few lines
11378                     # will redefine it to something that nothing will match
11379                     undef $default_map if ref $default_map;
11380
11381                     # Create a $default_map if don't have one; maybe a dummy
11382                     # that won't match anything.
11383                     if (! defined $default_map) {
11384
11385                         # Use any @missings line in the file.
11386                         if (defined $missings) {
11387                             if (ref $missings) {
11388                                 $default_map = $missings->full_name;
11389                                 $default_table = $missings;
11390                             }
11391                             else {
11392                                 $default_map = $missings;
11393                             }
11394
11395                             # And store it with the property for outside use.
11396                             $property_object->set_default_map($default_map);
11397                         }
11398                         else {
11399
11400                             # Neither an @missings nor a default map.  Create
11401                             # a dummy one, so won't have to test definedness
11402                             # in the main loop.
11403                             $default_map = '_Perl This will never be in a file
11404                                             from Unicode';
11405                         }
11406                     }
11407
11408                     # Here, we have $default_map defined, possibly in terms of
11409                     # $missings, but maybe not, and possibly is a dummy one.
11410                     if (defined $missings) {
11411
11412                         # Make sure there is no conflict between the two.
11413                         # $missings has priority.
11414                         if (ref $missings) {
11415                             $default_table
11416                                         = $property_object->table($default_map);
11417                             if (! defined $default_table
11418                                 || $default_table != $missings)
11419                             {
11420                                 if (! defined $default_table) {
11421                                     $default_table = $UNDEF;
11422                                 }
11423                                 $file->carp_bad_line(<<END
11424 The \@missings line for $property_name in $file says that missings default to
11425 $missings, but we expect it to be $default_table.  $missings used.
11426 END
11427                                 );
11428                                 $default_table = $missings;
11429                                 $default_map = $missings->full_name;
11430                             }
11431                             $property_info{$property_addr}{$DEFAULT_TABLE}
11432                                                         = $default_table;
11433                         }
11434                         elsif ($default_map ne $missings) {
11435                             $file->carp_bad_line(<<END
11436 The \@missings line for $property_name in $file says that missings default to
11437 $missings, but we expect it to be $default_map.  $missings used.
11438 END
11439                             );
11440                             $default_map = $missings;
11441                         }
11442                     }
11443
11444                     $property_info{$property_addr}{$DEFAULT_MAP}
11445                                                     = $default_map;
11446
11447                     # If haven't done so already, find the table corresponding
11448                     # to this map for non-string properties.
11449                     if (! defined $default_table
11450                         && $property_type != $STRING
11451                         && $property_type != $UNKNOWN)
11452                     {
11453                         $default_table = $property_info{$property_addr}
11454                                                         {$DEFAULT_TABLE}
11455                                     = $property_object->table($default_map);
11456                     }
11457                 } # End of is first time for this property
11458             } # End of switching properties.
11459
11460             # Ready to process the line.
11461             # The Unicode files are set up so that if the map is not defined,
11462             # it is a binary property with value 'Y'
11463             if (! defined $map) {
11464                 $map = 'Y';
11465             }
11466             else {
11467
11468                 # If the map begins with a special command to us (enclosed in
11469                 # delimiters), extract the command(s).
11470                 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11471                     my $command = $1;
11472                     if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
11473                         $replace = $1;
11474                     }
11475                     elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
11476                         $map_type = $1;
11477                     }
11478                     else {
11479                         $file->carp_bad_line("Unknown command line: '$1'");
11480                         next LINE;
11481                     }
11482                 }
11483             }
11484
11485             if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11486             {
11487
11488                 # Here, we have a map to a particular code point, and the
11489                 # default map is to a code point itself.  If the range
11490                 # includes the particular code point, change that portion of
11491                 # the range to the default.  This makes sure that in the final
11492                 # table only the non-defaults are listed.
11493                 my $decimal_map = hex $map;
11494                 if ($low <= $decimal_map && $decimal_map <= $high) {
11495
11496                     # If the range includes stuff before or after the map
11497                     # we're changing, split it and process the split-off parts
11498                     # later.
11499                     if ($low < $decimal_map) {
11500                         $file->insert_adjusted_lines(
11501                                             sprintf("%04X..%04X; %s; %s",
11502                                                     $low,
11503                                                     $decimal_map - 1,
11504                                                     $property_name,
11505                                                     $map));
11506                     }
11507                     if ($high > $decimal_map) {
11508                         $file->insert_adjusted_lines(
11509                                             sprintf("%04X..%04X; %s; %s",
11510                                                     $decimal_map + 1,
11511                                                     $high,
11512                                                     $property_name,
11513                                                     $map));
11514                     }
11515                     $low = $high = $decimal_map;
11516                     $map = $CODE_POINT;
11517                 }
11518             }
11519
11520             # If we can tell that this is a synonym for the default map, use
11521             # the default one instead.
11522             if ($property_type != $STRING
11523                 && $property_type != $UNKNOWN)
11524             {
11525                 my $table = $property_object->table($map);
11526                 if (defined $table && $table == $default_table) {
11527                     $map = $default_map;
11528                 }
11529             }
11530
11531             # And figure out the map type if not known.
11532             if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11533                 if ($map eq "") {   # Nulls are always $NULL map type
11534                     $map_type = $NULL;
11535                 } # Otherwise, non-strings, and those that don't allow
11536                   # $MULTI_CP, and those that aren't multiple code points are
11537                   # 0
11538                 elsif
11539                    (($property_type != $STRING && $property_type != $UNKNOWN)
11540                    || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11541                    || $map !~ /^ $code_point_re ( \  $code_point_re )+ $ /x)
11542                 {
11543                     $map_type = 0;
11544                 }
11545                 else {
11546                     $map_type = $MULTI_CP;
11547                 }
11548             }
11549
11550             $property_object->add_map($low, $high,
11551                                         $map,
11552                                         Type => $map_type,
11553                                         Replace => $replace);
11554         } # End of loop through file's lines
11555
11556         return;
11557     }
11558 }
11559
11560 { # Closure for UnicodeData.txt handling
11561
11562     # This file was the first one in the UCD; its design leads to some
11563     # awkwardness in processing.  Here is a sample line:
11564     # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11565     # The fields in order are:
11566     my $i = 0;            # The code point is in field 0, and is shifted off.
11567     my $CHARNAME = $i++;  # character name (e.g. "LATIN CAPITAL LETTER A")
11568     my $CATEGORY = $i++;  # category (e.g. "Lu")
11569     my $CCC = $i++;       # Canonical combining class (e.g. "230")
11570     my $BIDI = $i++;      # directional class (e.g. "L")
11571     my $PERL_DECOMPOSITION = $i++;  # decomposition mapping
11572     my $PERL_DECIMAL_DIGIT = $i++;   # decimal digit value
11573     my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11574                                          # Dual-use in this program; see below
11575     my $NUMERIC = $i++;   # numeric value
11576     my $MIRRORED = $i++;  # ? mirrored
11577     my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11578     my $COMMENT = $i++;   # iso comment
11579     my $UPPER = $i++;     # simple uppercase mapping
11580     my $LOWER = $i++;     # simple lowercase mapping
11581     my $TITLE = $i++;     # simple titlecase mapping
11582     my $input_field_count = $i;
11583
11584     # This routine in addition outputs these extra fields:
11585
11586     my $DECOMP_TYPE = $i++; # Decomposition type
11587
11588     # These fields are modifications of ones above, and are usually
11589     # suppressed; they must come last, as for speed, the loop upper bound is
11590     # normally set to ignore them
11591     my $NAME = $i++;        # This is the strict name field, not the one that
11592                             # charnames uses.
11593     my $DECOMP_MAP = $i++;  # Strict decomposition mapping; not the one used
11594                             # by Unicode::Normalize
11595     my $last_field = $i - 1;
11596
11597     # All these are read into an array for each line, with the indices defined
11598     # above.  The empty fields in the example line above indicate that the
11599     # value is defaulted.  The handler called for each line of the input
11600     # changes these to their defaults.
11601
11602     # Here are the official names of the properties, in a parallel array:
11603     my @field_names;
11604     $field_names[$BIDI] = 'Bidi_Class';
11605     $field_names[$CATEGORY] = 'General_Category';
11606     $field_names[$CCC] = 'Canonical_Combining_Class';
11607     $field_names[$CHARNAME] = 'Perl_Charnames';
11608     $field_names[$COMMENT] = 'ISO_Comment';
11609     $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11610     $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11611     $field_names[$LOWER] = 'Lowercase_Mapping';
11612     $field_names[$MIRRORED] = 'Bidi_Mirrored';
11613     $field_names[$NAME] = 'Name';
11614     $field_names[$NUMERIC] = 'Numeric_Value';
11615     $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11616     $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11617     $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11618     $field_names[$TITLE] = 'Titlecase_Mapping';
11619     $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11620     $field_names[$UPPER] = 'Uppercase_Mapping';
11621
11622     # Some of these need a little more explanation:
11623     # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11624     #   property, but is used in calculating the Numeric_Type.  Perl however,
11625     #   creates a file from this field, so a Perl property is created from it.
11626     # Similarly, the Other_Digit field is used only for calculating the
11627     #   Numeric_Type, and so it can be safely re-used as the place to store
11628     #   the value for Numeric_Type; hence it is referred to as
11629     #   $NUMERIC_TYPE_OTHER_DIGIT.
11630     # The input field named $PERL_DECOMPOSITION is a combination of both the
11631     #   decomposition mapping and its type.  Perl creates a file containing
11632     #   exactly this field, so it is used for that.  The two properties are
11633     #   separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11634     #   $DECOMP_MAP is usually suppressed (unless the lists are changed to
11635     #   output it), as Perl doesn't use it directly.
11636     # The input field named here $CHARNAME is used to construct the
11637     #   Perl_Charnames property, which is a combination of the Name property
11638     #   (which the input field contains), and the Unicode_1_Name property, and
11639     #   others from other files.  Since, the strict Name property is not used
11640     #   by Perl, this field is used for the table that Perl does use.  The
11641     #   strict Name property table is usually suppressed (unless the lists are
11642     #   changed to output it), so it is accumulated in a separate field,
11643     #   $NAME, which to save time is discarded unless the table is actually to
11644     #   be output
11645
11646     # This file is processed like most in this program.  Control is passed to
11647     # process_generic_property_file() which calls filter_UnicodeData_line()
11648     # for each input line.  This filter converts the input into line(s) that
11649     # process_generic_property_file() understands.  There is also a setup
11650     # routine called before any of the file is processed, and a handler for
11651     # EOF processing, all in this closure.
11652
11653     # A huge speed-up occurred at the cost of some added complexity when these
11654     # routines were altered to buffer the outputs into ranges.  Almost all the
11655     # lines of the input file apply to just one code point, and for most
11656     # properties, the map for the next code point up is the same as the
11657     # current one.  So instead of creating a line for each property for each
11658     # input line, filter_UnicodeData_line() remembers what the previous map
11659     # of a property was, and doesn't generate a line to pass on until it has
11660     # to, as when the map changes; and that passed-on line encompasses the
11661     # whole contiguous range of code points that have the same map for that
11662     # property.  This means a slight amount of extra setup, and having to
11663     # flush these buffers on EOF, testing if the maps have changed, plus
11664     # remembering state information in the closure.  But it means a lot less
11665     # real time in not having to change the data base for each property on
11666     # each line.
11667
11668     # Another complication is that there are already a few ranges designated
11669     # in the input.  There are two lines for each, with the same maps except
11670     # the code point and name on each line.  This was actually the hardest
11671     # thing to design around.  The code points in those ranges may actually
11672     # have real maps not given by these two lines.  These maps will either
11673     # be algorithmically determinable, or be in the extracted files furnished
11674     # with the UCD.  In the event of conflicts between these extracted files,
11675     # and this one, Unicode says that this one prevails.  But it shouldn't
11676     # prevail for conflicts that occur in these ranges.  The data from the
11677     # extracted files prevails in those cases.  So, this program is structured
11678     # so that those files are processed first, storing maps.  Then the other
11679     # files are processed, generally overwriting what the extracted files
11680     # stored.  But just the range lines in this input file are processed
11681     # without overwriting.  This is accomplished by adding a special string to
11682     # the lines output to tell process_generic_property_file() to turn off the
11683     # overwriting for just this one line.
11684     # A similar mechanism is used to tell it that the map is of a non-default
11685     # type.
11686
11687     sub setup_UnicodeData { # Called before any lines of the input are read
11688         my $file = shift;
11689         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11690
11691         # Create a new property specially located that is a combination of
11692         # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11693         # _Perl_Name_Alias properties.  (The final one duplicates elements of the
11694         # first, and starting in v6.1, is the same as the 'Name_Alias
11695         # property.)  A comment for the new property will later be constructed
11696         # based on the actual properties present and used
11697         $perl_charname = Property->new('Perl_Charnames',
11698                        Default_Map => "",
11699                        Directory => File::Spec->curdir(),
11700                        File => 'Name',
11701                        Fate => $INTERNAL_ONLY,
11702                        Perl_Extension => 1,
11703                        Range_Size_1 => \&output_perl_charnames_line,
11704                        Type => $STRING,
11705                        );
11706         $perl_charname->set_proxy_for('Name');
11707
11708         my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11709                                         Directory => File::Spec->curdir(),
11710                                         File => 'Decomposition',
11711                                         Format => $DECOMP_STRING_FORMAT,
11712                                         Fate => $INTERNAL_ONLY,
11713                                         Perl_Extension => 1,
11714                                         Default_Map => $CODE_POINT,
11715
11716                                         # normalize.pm can't cope with these
11717                                         Output_Range_Counts => 0,
11718
11719                                         # This is a specially formatted table
11720                                         # explicitly for normalize.pm, which
11721                                         # is expecting a particular format,
11722                                         # which means that mappings containing
11723                                         # multiple code points are in the main
11724                                         # body of the table
11725                                         Map_Type => $COMPUTE_NO_MULTI_CP,
11726                                         Type => $STRING,
11727                                         To_Output_Map => $INTERNAL_MAP,
11728                                         );
11729         $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11730         $Perl_decomp->add_comment(join_lines(<<END
11731 This mapping is a combination of the Unicode 'Decomposition_Type' and
11732 'Decomposition_Mapping' properties, formatted for use by normalize.pm.  It is
11733 identical to the official Unicode 'Decomposition_Mapping' property except for
11734 two things:
11735  1) It omits the algorithmically determinable Hangul syllable decompositions,
11736 which normalize.pm handles algorithmically.
11737  2) It contains the decomposition type as well.  Non-canonical decompositions
11738 begin with a word in angle brackets, like <super>, which denotes the
11739 compatible decomposition type.  If the map does not begin with the <angle
11740 brackets>, the decomposition is canonical.
11741 END
11742         ));
11743
11744         my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11745                                         Default_Map => "",
11746                                         Perl_Extension => 1,
11747                                         Directory => $map_directory,
11748                                         Type => $STRING,
11749                                         To_Output_Map => $OUTPUT_ADJUSTED,
11750                                         );
11751         $Decimal_Digit->add_comment(join_lines(<<END
11752 This file gives the mapping of all code points which represent a single
11753 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11754 points, and the mapping of each non-initial element of each range is actually
11755 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11756 These code points are those that have Numeric_Type=Decimal; not special
11757 things, like subscripts nor Roman numerals.
11758 END
11759         ));
11760
11761         # These properties are not used for generating anything else, and are
11762         # usually not output.  By making them last in the list, we can just
11763         # change the high end of the loop downwards to avoid the work of
11764         # generating a table(s) that is/are just going to get thrown away.
11765         if (! property_ref('Decomposition_Mapping')->to_output_map
11766             && ! property_ref('Name')->to_output_map)
11767         {
11768             $last_field = min($NAME, $DECOMP_MAP) - 1;
11769         } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11770             $last_field = $DECOMP_MAP;
11771         } elsif (property_ref('Name')->to_output_map) {
11772             $last_field = $NAME;
11773         }
11774         return;
11775     }
11776
11777     my $first_time = 1;                 # ? Is this the first line of the file
11778     my $in_range = 0;                   # ? Are we in one of the file's ranges
11779     my $previous_cp;                    # hex code point of previous line
11780     my $decimal_previous_cp = -1;       # And its decimal equivalent
11781     my @start;                          # For each field, the current starting
11782                                         # code point in hex for the range
11783                                         # being accumulated.
11784     my @fields;                         # The input fields;
11785     my @previous_fields;                # And those from the previous call
11786
11787     sub filter_UnicodeData_line {
11788         # Handle a single input line from UnicodeData.txt; see comments above
11789         # Conceptually this takes a single line from the file containing N
11790         # properties, and converts it into N lines with one property per line,
11791         # which is what the final handler expects.  But there are
11792         # complications due to the quirkiness of the input file, and to save
11793         # time, it accumulates ranges where the property values don't change
11794         # and only emits lines when necessary.  This is about an order of
11795         # magnitude fewer lines emitted.
11796
11797         my $file = shift;
11798         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11799
11800         # $_ contains the input line.
11801         # -1 in split means retain trailing null fields
11802         (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11803
11804         #local $to_trace = 1 if main::DEBUG;
11805         trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11806         if (@fields > $input_field_count) {
11807             $file->carp_bad_line('Extra fields');
11808             $_ = "";
11809             return;
11810         }
11811
11812         my $decimal_cp = hex $cp;
11813
11814         # We have to output all the buffered ranges when the next code point
11815         # is not exactly one after the previous one, which means there is a
11816         # gap in the ranges.
11817         my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11818
11819         # The decomposition mapping field requires special handling.  It looks
11820         # like either:
11821         #
11822         # <compat> 0032 0020
11823         # 0041 0300
11824         #
11825         # The decomposition type is enclosed in <brackets>; if missing, it
11826         # means the type is canonical.  There are two decomposition mapping
11827         # tables: the one for use by Perl's normalize.pm has a special format
11828         # which is this field intact; the other, for general use is of
11829         # standard format.  In either case we have to find the decomposition
11830         # type.  Empty fields have None as their type, and map to the code
11831         # point itself
11832         if ($fields[$PERL_DECOMPOSITION] eq "") {
11833             $fields[$DECOMP_TYPE] = 'None';
11834             $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11835         }
11836         else {
11837             ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11838                                             =~ / < ( .+? ) > \s* ( .+ ) /x;
11839             if (! defined $fields[$DECOMP_TYPE]) {
11840                 $fields[$DECOMP_TYPE] = 'Canonical';
11841                 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11842             }
11843             else {
11844                 $fields[$DECOMP_MAP] = $map;
11845             }
11846         }
11847
11848         # The 3 numeric fields also require special handling.  The 2 digit
11849         # fields must be either empty or match the number field.  This means
11850         # that if it is empty, they must be as well, and the numeric type is
11851         # None, and the numeric value is 'Nan'.
11852         # The decimal digit field must be empty or match the other digit
11853         # field.  If the decimal digit field is non-empty, the code point is
11854         # a decimal digit, and the other two fields will have the same value.
11855         # If it is empty, but the other digit field is non-empty, the code
11856         # point is an 'other digit', and the number field will have the same
11857         # value as the other digit field.  If the other digit field is empty,
11858         # but the number field is non-empty, the code point is a generic
11859         # numeric type.
11860         if ($fields[$NUMERIC] eq "") {
11861             if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11862                 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11863             ) {
11864                 $file->carp_bad_line("Numeric values inconsistent.  Trying to process anyway");
11865             }
11866             $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11867             $fields[$NUMERIC] = 'NaN';
11868         }
11869         else {
11870             $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11871             if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11872                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11873                 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11874                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11875             }
11876             elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11877                 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11878                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11879             }
11880             else {
11881                 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11882
11883                 # Rationals require extra effort.
11884                 if ($fields[$NUMERIC] =~ qr{/}) {
11885                     reduce_fraction(\$fields[$NUMERIC]);
11886                     register_fraction($fields[$NUMERIC])
11887                 }
11888             }
11889         }
11890
11891         # For the properties that have empty fields in the file, and which
11892         # mean something different from empty, change them to that default.
11893         # Certain fields just haven't been empty so far in any Unicode
11894         # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11895         # $CATEGORY.  This leaves just the two fields, and so we hard-code in
11896         # the defaults; which are very unlikely to ever change.
11897         $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11898         $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11899
11900         # UAX44 says that if title is empty, it is the same as whatever upper
11901         # is,
11902         $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11903
11904         # There are a few pairs of lines like:
11905         #   AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11906         #   D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11907         # that define ranges.  These should be processed after the fields are
11908         # adjusted above, as they may override some of them; but mostly what
11909         # is left is to possibly adjust the $CHARNAME field.  The names of all the
11910         # paired lines start with a '<', but this is also true of '<control>,
11911         # which isn't one of these special ones.
11912         if ($fields[$CHARNAME] eq '<control>') {
11913
11914             # Some code points in this file have the pseudo-name
11915             # '<control>', but the official name for such ones is the null
11916             # string.
11917             $fields[$NAME] = $fields[$CHARNAME] = "";
11918
11919             # We had better not be in between range lines.
11920             if ($in_range) {
11921                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11922                 $in_range = 0;
11923             }
11924         }
11925         elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11926
11927             # Here is a non-range line.  We had better not be in between range
11928             # lines.
11929             if ($in_range) {
11930                 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'.  Trying anyway");
11931                 $in_range = 0;
11932             }
11933             if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11934
11935                 # These are code points whose names end in their code points,
11936                 # which means the names are algorithmically derivable from the
11937                 # code points.  To shorten the output Name file, the algorithm
11938                 # for deriving these is placed in the file instead of each
11939                 # code point, so they have map type $CP_IN_NAME
11940                 $fields[$CHARNAME] = $CMD_DELIM
11941                                  . $MAP_TYPE_CMD
11942                                  . '='
11943                                  . $CP_IN_NAME
11944                                  . $CMD_DELIM
11945                                  . $fields[$CHARNAME];
11946             }
11947             $fields[$NAME] = $fields[$CHARNAME];
11948         }
11949         elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11950             $fields[$CHARNAME] = $fields[$NAME] = $1;
11951
11952             # Here we are at the beginning of a range pair.
11953             if ($in_range) {
11954                 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'.  Trying anyway");
11955             }
11956             $in_range = 1;
11957
11958             # Because the properties in the range do not overwrite any already
11959             # in the db, we must flush the buffers of what's already there, so
11960             # they get handled in the normal scheme.
11961             $force_output = 1;
11962
11963         }
11964         elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11965             $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME].  Ignoring this line.");
11966             $_ = "";
11967             return;
11968         }
11969         else { # Here, we are at the last line of a range pair.
11970
11971             if (! $in_range) {
11972                 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one.  Ignoring this line.");
11973                 $_ = "";
11974                 return;
11975             }
11976             $in_range = 0;
11977
11978             $fields[$NAME] = $fields[$CHARNAME];
11979
11980             # Check that the input is valid: that the closing of the range is
11981             # the same as the beginning.
11982             foreach my $i (0 .. $last_field) {
11983                 next if $fields[$i] eq $previous_fields[$i];
11984                 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'.  Bad News.  Trying anyway");
11985             }
11986
11987             # The processing differs depending on the type of range,
11988             # determined by its $CHARNAME
11989             if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11990
11991                 # Check that the data looks right.
11992                 if ($decimal_previous_cp != $SBase) {
11993                     $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp.  Bad News.  Results will be wrong");
11994                 }
11995                 if ($decimal_cp != $SBase + $SCount - 1) {
11996                     $file->carp_bad_line("Unexpected Hangul syllable end = $cp.  Bad News.  Results will be wrong");
11997                 }
11998
11999                 # The Hangul syllable range has a somewhat complicated name
12000                 # generation algorithm.  Each code point in it has a canonical
12001                 # decomposition also computable by an algorithm.  The
12002                 # perl decomposition map table built from these is used only
12003                 # by normalize.pm, which has the algorithm built in it, so the
12004                 # decomposition maps are not needed, and are large, so are
12005                 # omitted from it.  If the full decomposition map table is to
12006                 # be output, the decompositions are generated for it, in the
12007                 # EOF handling code for this input file.
12008
12009                 $previous_fields[$DECOMP_TYPE] = 'Canonical';
12010
12011                 # This range is stored in our internal structure with its
12012                 # own map type, different from all others.
12013                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12014                                         = $CMD_DELIM
12015                                           . $MAP_TYPE_CMD
12016                                           . '='
12017                                           . $HANGUL_SYLLABLE
12018                                           . $CMD_DELIM
12019                                           . $fields[$CHARNAME];
12020             }
12021             elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
12022
12023                 # All the CJK ranges like this have the name given as a
12024                 # special case in the next code line.  And for the others, we
12025                 # hope that Unicode continues to use the correct name in
12026                 # future releases, so we don't have to make further special
12027                 # cases.
12028                 my $name = ($fields[$CHARNAME] =~ /^CJK/)
12029                            ? 'CJK UNIFIED IDEOGRAPH'
12030                            : uc $fields[$CHARNAME];
12031
12032                 # The name for these contains the code point itself, and all
12033                 # are defined to have the same base name, regardless of what
12034                 # is in the file.  They are stored in our internal structure
12035                 # with a map type of $CP_IN_NAME
12036                 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12037                                         = $CMD_DELIM
12038                                            . $MAP_TYPE_CMD
12039                                            . '='
12040                                            . $CP_IN_NAME
12041                                            . $CMD_DELIM
12042                                            . $name;
12043
12044             }
12045             elsif ($fields[$CATEGORY] eq 'Co'
12046                      || $fields[$CATEGORY] eq 'Cs')
12047             {
12048                 # The names of all the code points in these ranges are set to
12049                 # null, as there are no names for the private use and
12050                 # surrogate code points.
12051
12052                 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
12053             }
12054             else {
12055                 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY].  Attempting to process it.");
12056             }
12057
12058             # The first line of the range caused everything else to be output,
12059             # and then its values were stored as the beginning values for the
12060             # next set of ranges, which this one ends.  Now, for each value,
12061             # add a command to tell the handler that these values should not
12062             # replace any existing ones in our database.
12063             foreach my $i (0 .. $last_field) {
12064                 $previous_fields[$i] = $CMD_DELIM
12065                                         . $REPLACE_CMD
12066                                         . '='
12067                                         . $NO
12068                                         . $CMD_DELIM
12069                                         . $previous_fields[$i];
12070             }
12071
12072             # And change things so it looks like the entire range has been
12073             # gone through with this being the final part of it.  Adding the
12074             # command above to each field will cause this range to be flushed
12075             # during the next iteration, as it guaranteed that the stored
12076             # field won't match whatever value the next one has.
12077             $previous_cp = $cp;
12078             $decimal_previous_cp = $decimal_cp;
12079
12080             # We are now set up for the next iteration; so skip the remaining
12081             # code in this subroutine that does the same thing, but doesn't
12082             # know about these ranges.
12083             $_ = "";
12084
12085             return;
12086         }
12087
12088         # On the very first line, we fake it so the code below thinks there is
12089         # nothing to output, and initialize so that when it does get output it
12090         # uses the first line's values for the lowest part of the range.
12091         # (One could avoid this by using peek(), but then one would need to
12092         # know the adjustments done above and do the same ones in the setup
12093         # routine; not worth it)
12094         if ($first_time) {
12095             $first_time = 0;
12096             @previous_fields = @fields;
12097             @start = ($cp) x scalar @fields;
12098             $decimal_previous_cp = $decimal_cp - 1;
12099         }
12100
12101         # For each field, output the stored up ranges that this code point
12102         # doesn't fit in.  Earlier we figured out if all ranges should be
12103         # terminated because of changing the replace or map type styles, or if
12104         # there is a gap between this new code point and the previous one, and
12105         # that is stored in $force_output.  But even if those aren't true, we
12106         # need to output the range if this new code point's value for the
12107         # given property doesn't match the stored range's.
12108         #local $to_trace = 1 if main::DEBUG;
12109         foreach my $i (0 .. $last_field) {
12110             my $field = $fields[$i];
12111             if ($force_output || $field ne $previous_fields[$i]) {
12112
12113                 # Flush the buffer of stored values.
12114                 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12115
12116                 # Start a new range with this code point and its value
12117                 $start[$i] = $cp;
12118                 $previous_fields[$i] = $field;
12119             }
12120         }
12121
12122         # Set the values for the next time.
12123         $previous_cp = $cp;
12124         $decimal_previous_cp = $decimal_cp;
12125
12126         # The input line has generated whatever adjusted lines are needed, and
12127         # should not be looked at further.
12128         $_ = "";
12129         return;
12130     }
12131
12132     sub EOF_UnicodeData {
12133         # Called upon EOF to flush the buffers, and create the Hangul
12134         # decomposition mappings if needed.
12135
12136         my $file = shift;
12137         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12138
12139         # Flush the buffers.
12140         foreach my $i (0 .. $last_field) {
12141             $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12142         }
12143
12144         if (-e 'Jamo.txt') {
12145
12146             # The algorithm is published by Unicode, based on values in
12147             # Jamo.txt, (which should have been processed before this
12148             # subroutine), and the results left in %Jamo
12149             unless (%Jamo) {
12150                 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt.  Hangul syllables not generated.");
12151                 return;
12152             }
12153
12154             # If the full decomposition map table is being output, insert
12155             # into it the Hangul syllable mappings.  This is to avoid having
12156             # to publish a subroutine in it to compute them.  (which would
12157             # essentially be this code.)  This uses the algorithm published by
12158             # Unicode.  (No hangul syllables in version 1)
12159             if ($v_version ge v2.0.0
12160                 && property_ref('Decomposition_Mapping')->to_output_map) {
12161                 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
12162                     use integer;
12163                     my $SIndex = $S - $SBase;
12164                     my $L = $LBase + $SIndex / $NCount;
12165                     my $V = $VBase + ($SIndex % $NCount) / $TCount;
12166                     my $T = $TBase + $SIndex % $TCount;
12167
12168                     trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
12169                     my $decomposition = sprintf("%04X %04X", $L, $V);
12170                     $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
12171                     $file->insert_adjusted_lines(
12172                                 sprintf("%04X; Decomposition_Mapping; %s",
12173                                         $S,
12174                                         $decomposition));
12175                 }
12176             }
12177         }
12178
12179         return;
12180     }
12181
12182     sub filter_v1_ucd {
12183         # Fix UCD lines in version 1.  This is probably overkill, but this
12184         # fixes some glaring errors in Version 1 UnicodeData.txt.  That file:
12185         # 1)    had many Hangul (U+3400 - U+4DFF) code points that were later
12186         #       removed.  This program retains them
12187         # 2)    didn't include ranges, which it should have, and which are now
12188         #       added in @corrected_lines below.  It was hand populated by
12189         #       taking the data from Version 2, verified by analyzing
12190         #       DAge.txt.
12191         # 3)    There is a syntax error in the entry for U+09F8 which could
12192         #       cause problems for utf8_heavy, and so is changed.  It's
12193         #       numeric value was simply a minus sign, without any number.
12194         #       (Eventually Unicode changed the code point to non-numeric.)
12195         # 4)    The decomposition types often don't match later versions
12196         #       exactly, and the whole syntax of that field is different; so
12197         #       the syntax is changed as well as the types to their later
12198         #       terminology.  Otherwise normalize.pm would be very unhappy
12199         # 5)    Many ccc classes are different.  These are left intact.
12200         # 6)    U+FF10..U+FF19 are missing their numeric values in all three
12201         #       fields.  These are unchanged because it doesn't really cause
12202         #       problems for Perl.
12203         # 7)    A number of code points, such as controls, don't have their
12204         #       Unicode Version 1 Names in this file.  These are added.
12205         # 8)    A number of Symbols were marked as Lm.  This changes those in
12206         #       the Latin1 range, so that regexes work.
12207         # 9)    The odd characters U+03DB .. U+03E1 weren't encoded but are
12208         #       referred to by their lc equivalents.  Not fixed.
12209
12210         my @corrected_lines = split /\n/, <<'END';
12211 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
12212 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12213 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
12214 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
12215 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
12216 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12217 END
12218
12219         my $file = shift;
12220         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12221
12222         #local $to_trace = 1 if main::DEBUG;
12223         trace $_ if main::DEBUG && $to_trace;
12224
12225         # -1 => retain trailing null fields
12226         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12227
12228         # At the first place that is wrong in the input, insert all the
12229         # corrections, replacing the wrong line.
12230         if ($code_point eq '4E00') {
12231             my @copy = @corrected_lines;
12232             $_ = shift @copy;
12233             ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12234
12235             $file->insert_lines(@copy);
12236         }
12237         elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
12238
12239             # There are no Lm characters in Latin1; these should be 'Sk', but
12240             # there isn't that in V1.
12241             $fields[$CATEGORY] = 'So';
12242         }
12243
12244         if ($fields[$NUMERIC] eq '-') {
12245             $fields[$NUMERIC] = '-1';  # This is what 2.0 made it.
12246         }
12247
12248         if  ($fields[$PERL_DECOMPOSITION] ne "") {
12249
12250             # Several entries have this change to superscript 2 or 3 in the
12251             # middle.  Convert these to the modern version, which is to use
12252             # the actual U+00B2 and U+00B3 (the superscript forms) instead.
12253             # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
12254             # 'HHHH HHHH 00B3 HHHH'.
12255             # It turns out that all of these that don't have another
12256             # decomposition defined at the beginning of the line have the
12257             # <square> decomposition in later releases.
12258             if ($code_point ne '00B2' && $code_point ne '00B3') {
12259                 if  ($fields[$PERL_DECOMPOSITION]
12260                                     =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
12261                 {
12262                     if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
12263                         $fields[$PERL_DECOMPOSITION] = '<square> '
12264                         . $fields[$PERL_DECOMPOSITION];
12265                     }
12266                 }
12267             }
12268
12269             # If is like '<+circled> 0052 <-circled>', convert to
12270             # '<circled> 0052'
12271             $fields[$PERL_DECOMPOSITION] =~
12272                             s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
12273
12274             # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
12275             $fields[$PERL_DECOMPOSITION] =~
12276                             s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
12277             or $fields[$PERL_DECOMPOSITION] =~
12278                             s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
12279             or $fields[$PERL_DECOMPOSITION] =~
12280                             s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
12281             or $fields[$PERL_DECOMPOSITION] =~
12282                         s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
12283
12284             # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
12285             $fields[$PERL_DECOMPOSITION] =~
12286                     s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12287
12288             # Change names to modern form.
12289             $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12290             $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12291             $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12292             $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12293
12294             # One entry has weird braces
12295             $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12296
12297             # One entry at U+2116 has an extra <sup>
12298             $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12299         }
12300
12301         $_ = join ';', $code_point, @fields;
12302         trace $_ if main::DEBUG && $to_trace;
12303         return;
12304     }
12305
12306     sub filter_bad_Nd_ucd {
12307         # Early versions specified a value in the decimal digit field even
12308         # though the code point wasn't a decimal digit.  Clear the field in
12309         # that situation, so that the main code doesn't think it is a decimal
12310         # digit.
12311
12312         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12313         if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12314             $fields[$PERL_DECIMAL_DIGIT] = "";
12315             $_ = join ';', $code_point, @fields;
12316         }
12317         return;
12318     }
12319
12320     my @U1_control_names = split /\n/, <<'END';
12321 NULL
12322 START OF HEADING
12323 START OF TEXT
12324 END OF TEXT
12325 END OF TRANSMISSION
12326 ENQUIRY
12327 ACKNOWLEDGE
12328 BELL
12329 BACKSPACE
12330 HORIZONTAL TABULATION
12331 LINE FEED
12332 VERTICAL TABULATION
12333 FORM FEED
12334 CARRIAGE RETURN
12335 SHIFT OUT
12336 SHIFT IN
12337 DATA LINK ESCAPE
12338 DEVICE CONTROL ONE
12339 DEVICE CONTROL TWO
12340 DEVICE CONTROL THREE
12341 DEVICE CONTROL FOUR
12342 NEGATIVE ACKNOWLEDGE
12343 SYNCHRONOUS IDLE
12344 END OF TRANSMISSION BLOCK
12345 CANCEL
12346 END OF MEDIUM
12347 SUBSTITUTE
12348 ESCAPE
12349 FILE SEPARATOR
12350 GROUP SEPARATOR
12351 RECORD SEPARATOR
12352 UNIT SEPARATOR
12353 DELETE
12354 BREAK PERMITTED HERE
12355 NO BREAK HERE
12356 INDEX
12357 NEXT LINE
12358 START OF SELECTED AREA
12359 END OF SELECTED AREA
12360 CHARACTER TABULATION SET
12361 CHARACTER TABULATION WITH JUSTIFICATION
12362 LINE TABULATION SET
12363 PARTIAL LINE DOWN
12364 PARTIAL LINE UP
12365 REVERSE LINE FEED
12366 SINGLE SHIFT TWO
12367 SINGLE SHIFT THREE
12368 DEVICE CONTROL STRING
12369 PRIVATE USE ONE
12370 PRIVATE USE TWO
12371 SET TRANSMIT STATE
12372 CANCEL CHARACTER
12373 MESSAGE WAITING
12374 START OF GUARDED AREA
12375 END OF GUARDED AREA
12376 START OF STRING
12377 SINGLE CHARACTER INTRODUCER
12378 CONTROL SEQUENCE INTRODUCER
12379 STRING TERMINATOR
12380 OPERATING SYSTEM COMMAND
12381 PRIVACY MESSAGE
12382 APPLICATION PROGRAM COMMAND
12383 END
12384
12385     sub filter_early_U1_names {
12386         # Very early versions did not have the Unicode_1_name field specified.
12387         # They differed in which ones were present; make sure a U1 name
12388         # exists, so that Unicode::UCD::charinfo will work
12389
12390         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12391
12392
12393         # @U1_control names above are entirely positional, so we pull them out
12394         # in the exact order required, with gaps for the ones that don't have
12395         # names.
12396         if ($code_point =~ /^00[01]/
12397             || $code_point eq '007F'
12398             || $code_point =~ /^008[2-9A-F]/
12399             || $code_point =~ /^009[0-8A-F]/)
12400         {
12401             my $u1_name = shift @U1_control_names;
12402             $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12403             $_ = join ';', $code_point, @fields;
12404         }
12405         return;
12406     }
12407
12408     sub filter_v2_1_5_ucd {
12409         # A dozen entries in this 2.1.5 file had the mirrored and numeric
12410         # columns swapped;  These all had mirrored be 'N'.  So if the numeric
12411         # column appears to be N, swap it back.
12412
12413         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12414         if ($fields[$NUMERIC] eq 'N') {
12415             $fields[$NUMERIC] = $fields[$MIRRORED];
12416             $fields[$MIRRORED] = 'N';
12417             $_ = join ';', $code_point, @fields;
12418         }
12419         return;
12420     }
12421
12422     sub filter_v6_ucd {
12423
12424         # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12425         # it wasn't accepted, to allow for some deprecation cycles.  This
12426         # function is not called after 5.16
12427
12428         return if $_ !~ /^(?:0007|1F514|070F);/;
12429
12430         my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12431         if ($code_point eq '0007') {
12432             $fields[$CHARNAME] = "";
12433         }
12434         elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12435                             # http://www.unicode.org/versions/corrigendum8.html
12436             $fields[$BIDI] = "AL";
12437         }
12438         elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12439             $fields[$CHARNAME] = "";
12440         }
12441
12442         $_ = join ';', $code_point, @fields;
12443
12444         return;
12445     }
12446 } # End closure for UnicodeData
12447
12448 sub process_GCB_test {
12449
12450     my $file = shift;
12451     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12452
12453     while ($file->next_line) {
12454         push @backslash_X_tests, $_;
12455     }
12456
12457     return;
12458 }
12459
12460 sub process_LB_test {
12461
12462     my $file = shift;
12463     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12464
12465     while ($file->next_line) {
12466         push @LB_tests, $_;
12467     }
12468
12469     return;
12470 }
12471
12472 sub process_SB_test {
12473
12474     my $file = shift;
12475     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12476
12477     while ($file->next_line) {
12478         push @SB_tests, $_;
12479     }
12480
12481     return;
12482 }
12483
12484 sub process_WB_test {
12485
12486     my $file = shift;
12487     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12488
12489     while ($file->next_line) {
12490         push @WB_tests, $_;
12491     }
12492
12493     return;
12494 }
12495
12496 sub process_NamedSequences {
12497     # NamedSequences.txt entries are just added to an array.  Because these
12498     # don't look like the other tables, they have their own handler.
12499     # An example:
12500     # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12501     #
12502     # This just adds the sequence to an array for later handling
12503
12504     my $file = shift;
12505     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12506
12507     while ($file->next_line) {
12508         my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12509         if (@remainder) {
12510             $file->carp_bad_line(
12511                 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12512             next;
12513         }
12514
12515         # Note single \t in keeping with special output format of
12516         # Perl_charnames.  But it turns out that the code points don't have to
12517         # be 5 digits long, like the rest, based on the internal workings of
12518         # charnames.pm.  This could be easily changed for consistency.
12519         push @named_sequences, "$sequence\t$name";
12520     }
12521     return;
12522 }
12523
12524 { # Closure
12525
12526     my $first_range;
12527
12528     sub  filter_early_ea_lb {
12529         # Fixes early EastAsianWidth.txt and LineBreak.txt files.  These had a
12530         # third field be the name of the code point, which can be ignored in
12531         # most cases.  But it can be meaningful if it marks a range:
12532         # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12533         # 3400;W;<CJK Ideograph Extension A, First>
12534         #
12535         # We need to see the First in the example above to know it's a range.
12536         # They did not use the later range syntaxes.  This routine changes it
12537         # to use the modern syntax.
12538         # $1 is the Input_file object.
12539
12540         my @fields = split /\s*;\s*/;
12541         if ($fields[2] =~ /^<.*, First>/) {
12542             $first_range = $fields[0];
12543             $_ = "";
12544         }
12545         elsif ($fields[2] =~ /^<.*, Last>/) {
12546             $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12547         }
12548         else {
12549             undef $first_range;
12550             $_ = "$fields[0]; $fields[1]";
12551         }
12552
12553         return;
12554     }
12555 }
12556
12557 sub filter_substitute_lb {
12558     # Used on Unicodes that predate the LB property, where there is a
12559     # substitute file.  This just does the regular ea_lb handling for such
12560     # files, and then substitutes the long property value name for the short
12561     # one that comes with the file.  (The other break files have the long
12562     # names in them, so this is the odd one out.)  The reason for doing this
12563     # kludge is that regen/mk_invlists.pl is expecting the long name.  This
12564     # also fixes the typo 'Inseperable' that leads to problems.
12565
12566     filter_early_ea_lb;
12567     return unless $_;
12568
12569     my @fields = split /\s*;\s*/;
12570     $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12571     $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12572     $_ = join '; ', @fields;
12573 }
12574
12575 sub filter_old_style_arabic_shaping {
12576     # Early versions used a different term for the later one.
12577
12578     my @fields = split /\s*;\s*/;
12579     $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12580     $fields[3] =~ s/\s+/_/g;                # Change spaces to underscores
12581     $_ = join ';', @fields;
12582     return;
12583 }
12584
12585 { # Closure
12586     my $lc; # Table for lowercase mapping
12587     my $tc;
12588     my $uc;
12589     my %special_casing_code_points;
12590
12591     sub setup_special_casing {
12592         # SpecialCasing.txt contains the non-simple case change mappings.  The
12593         # simple ones are in UnicodeData.txt, which should already have been
12594         # read in to the full property data structures, so as to initialize
12595         # these with the simple ones.  Then the SpecialCasing.txt entries
12596         # add or overwrite the ones which have different full mappings.
12597
12598         # This routine sees if the simple mappings are to be output, and if
12599         # so, copies what has already been put into the full mapping tables,
12600         # while they still contain only the simple mappings.
12601
12602         # The reason it is done this way is that the simple mappings are
12603         # probably not going to be output, so it saves work to initialize the
12604         # full tables with the simple mappings, and then overwrite those
12605         # relatively few entries in them that have different full mappings,
12606         # and thus skip the simple mapping tables altogether.
12607
12608         my $file= shift;
12609         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12610
12611         $lc = property_ref('lc');
12612         $tc = property_ref('tc');
12613         $uc = property_ref('uc');
12614
12615         # For each of the case change mappings...
12616         foreach my $full_casing_table ($lc, $tc, $uc) {
12617             my $full_casing_name = $full_casing_table->name;
12618             my $full_casing_full_name = $full_casing_table->full_name;
12619             unless (defined $full_casing_table
12620                     && ! $full_casing_table->is_empty)
12621             {
12622                 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing.  Only special casing will be generated.");
12623             }
12624
12625             # Create a table in the old-style format and with the original
12626             # file name for backwards compatibility with applications that
12627             # read it directly.  The new tables contain both the simple and
12628             # full maps, and the old are missing simple maps when there is a
12629             # conflicting full one.  Probably it would have been ok to add
12630             # those to the legacy version, as was already done in 5.14 to the
12631             # case folding one, but this was not done, out of an abundance of
12632             # caution.  The tables are set up here before we deal with the
12633             # full maps so that as we handle those, we can override the simple
12634             # maps for them in the legacy table, and merely add them in the
12635             # new-style one.
12636             my $legacy = Property->new("Legacy_" . $full_casing_full_name,
12637                                 File => $full_casing_full_name
12638                                                           =~ s/case_Mapping//r,
12639                                 Format => $HEX_FORMAT,
12640                                 Default_Map => $CODE_POINT,
12641                                 Initialize => $full_casing_table,
12642                                 Replacement_Property => $full_casing_full_name,
12643             );
12644
12645             $full_casing_table->add_comment(join_lines( <<END
12646 This file includes both the simple and full case changing maps.  The simple
12647 ones are in the main body of the table below, and the full ones adding to or
12648 overriding them are in the hash.
12649 END
12650             ));
12651
12652             # The simple version's name in each mapping merely has an 's' in
12653             # front of the full one's
12654             my $simple_name = 's' . $full_casing_name;
12655             my $simple = property_ref($simple_name);
12656             $simple->initialize($full_casing_table) if $simple->to_output_map();
12657         }
12658
12659         return;
12660     }
12661
12662     sub filter_2_1_8_special_casing_line {
12663
12664         # This version had duplicate entries in this file.  Delete all but the
12665         # first one
12666         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12667                                               # fields
12668         if (exists $special_casing_code_points{$fields[0]}) {
12669             $_ = "";
12670             return;
12671         }
12672
12673         $special_casing_code_points{$fields[0]} = 1;
12674         filter_special_casing_line(@_);
12675     }
12676
12677     sub filter_special_casing_line {
12678         # Change the format of $_ from SpecialCasing.txt into something that
12679         # the generic handler understands.  Each input line contains three
12680         # case mappings.  This will generate three lines to pass to the
12681         # generic handler for each of those.
12682
12683         # The input syntax (after stripping comments and trailing white space
12684         # is like one of the following (with the final two being entries that
12685         # we ignore):
12686         # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12687         # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12688         # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12689         # Note the trailing semi-colon, unlike many of the input files.  That
12690         # means that there will be an extra null field generated by the split
12691
12692         my $file = shift;
12693         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12694
12695         my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12696                                               # fields
12697
12698         # field #4 is when this mapping is conditional.  If any of these get
12699         # implemented, it would be by hard-coding in the casing functions in
12700         # the Perl core, not through tables.  But if there is a new condition
12701         # we don't know about, output a warning.  We know about all the
12702         # conditions through 6.0
12703         if ($fields[4] ne "") {
12704             my @conditions = split ' ', $fields[4];
12705             if ($conditions[0] ne 'tr'  # We know that these languages have
12706                                         # conditions, and some are multiple
12707                 && $conditions[0] ne 'az'
12708                 && $conditions[0] ne 'lt'
12709
12710                 # And, we know about a single condition Final_Sigma, but
12711                 # nothing else.
12712                 && ($v_version gt v5.2.0
12713                     && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12714             {
12715                 $file->carp_bad_line("Unknown condition '$fields[4]'.  You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12716             }
12717             elsif ($conditions[0] ne 'Final_Sigma') {
12718
12719                     # Don't print out a message for Final_Sigma, because we
12720                     # have hard-coded handling for it.  (But the standard
12721                     # could change what the rule should be, but it wouldn't
12722                     # show up here anyway.
12723
12724                     print "# SKIPPING Special Casing: $_\n"
12725                                                     if $verbosity >= $VERBOSE;
12726             }
12727             $_ = "";
12728             return;
12729         }
12730         elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12731             $file->carp_bad_line('Extra fields');
12732             $_ = "";
12733             return;
12734         }
12735
12736         my $decimal_code_point = hex $fields[0];
12737
12738         # Loop to handle each of the three mappings in the input line, in
12739         # order, with $i indicating the current field number.
12740         my $i = 0;
12741         for my $object ($lc, $tc, $uc) {
12742             $i++;   # First time through, $i = 0 ... 3rd time = 3
12743
12744             my $value = $object->value_of($decimal_code_point);
12745             $value = ($value eq $CODE_POINT)
12746                       ? $decimal_code_point
12747                       : hex $value;
12748
12749             # If this isn't a multi-character mapping, it should already have
12750             # been read in.
12751             if ($fields[$i] !~ / /) {
12752                 if ($value != hex $fields[$i]) {
12753                     Carp::my_carp("Bad news. UnicodeData.txt thinks "
12754                                   . $object->name
12755                                   . "(0x$fields[0]) is $value"
12756                                   . " and SpecialCasing.txt thinks it is "
12757                                   . hex($fields[$i])
12758                                   . ".  Good luck.  Retaining UnicodeData value, and proceeding anyway.");
12759                 }
12760             }
12761             else {
12762
12763                 # The mapping goes into both the legacy table, in which it
12764                 # replaces the simple one...
12765                 $file->insert_adjusted_lines("$fields[0]; Legacy_"
12766                                              . $object->full_name
12767                                              . "; $fields[$i]");
12768
12769                 # ... and the regular table, in which it is additional,
12770                 # beyond the simple mapping.
12771                 $file->insert_adjusted_lines("$fields[0]; "
12772                                              . $object->name
12773                                             . "; "
12774                                             . $CMD_DELIM
12775                                             . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12776                                             . $CMD_DELIM
12777                                             . $fields[$i]);
12778             }
12779         }
12780
12781         # Everything has been handled by the insert_adjusted_lines()
12782         $_ = "";
12783
12784         return;
12785     }
12786 }
12787
12788 sub filter_old_style_case_folding {
12789     # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12790     # and later style.  Different letters were used in the earlier.
12791
12792     my $file = shift;
12793     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12794
12795     my @fields = split /\s*;\s*/;
12796
12797     if ($fields[1] eq 'L') {
12798         $fields[1] = 'C';             # L => C always
12799     }
12800     elsif ($fields[1] eq 'E') {
12801         if ($fields[2] =~ / /) {      # E => C if one code point; F otherwise
12802             $fields[1] = 'F'
12803         }
12804         else {
12805             $fields[1] = 'C'
12806         }
12807     }
12808     else {
12809         $file->carp_bad_line("Expecting L or E in second field");
12810         $_ = "";
12811         return;
12812     }
12813     $_ = join("; ", @fields) . ';';
12814     return;
12815 }
12816
12817 { # Closure for case folding
12818
12819     # Create the map for simple only if are going to output it, for otherwise
12820     # it takes no part in anything we do.
12821     my $to_output_simple;
12822
12823     sub setup_case_folding($) {
12824         # Read in the case foldings in CaseFolding.txt.  This handles both
12825         # simple and full case folding.
12826
12827         $to_output_simple
12828                         = property_ref('Simple_Case_Folding')->to_output_map;
12829
12830         if (! $to_output_simple) {
12831             property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12832         }
12833
12834         # If we ever wanted to show that these tables were combined, a new
12835         # property method could be created, like set_combined_props()
12836         property_ref('Case_Folding')->add_comment(join_lines( <<END
12837 This file includes both the simple and full case folding maps.  The simple
12838 ones are in the main body of the table below, and the full ones adding to or
12839 overriding them are in the hash.
12840 END
12841         ));
12842         return;
12843     }
12844
12845     sub filter_case_folding_line {
12846         # Called for each line in CaseFolding.txt
12847         # Input lines look like:
12848         # 0041; C; 0061; # LATIN CAPITAL LETTER A
12849         # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12850         # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12851         #
12852         # 'C' means that folding is the same for both simple and full
12853         # 'F' that it is only for full folding
12854         # 'S' that it is only for simple folding
12855         # 'T' is locale-dependent, and ignored
12856         # 'I' is a type of 'F' used in some early releases.
12857         # Note the trailing semi-colon, unlike many of the input files.  That
12858         # means that there will be an extra null field generated by the split
12859         # below, which we ignore and hence is not an error.
12860
12861         my $file = shift;
12862         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12863
12864         my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12865         if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12866             $file->carp_bad_line('Extra fields');
12867             $_ = "";
12868             return;
12869         }
12870
12871         if ($type =~ / ^ [IT] $/x) {   # Skip Turkic case folding, is locale dependent
12872             $_ = "";
12873             return;
12874         }
12875
12876         # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12877         # I are all full foldings; S is single-char.  For S, there is always
12878         # an F entry, so we must allow multiple values for the same code
12879         # point.  Fortunately this table doesn't need further manipulation
12880         # which would preclude using multiple-values.  The S is now included
12881         # so that _swash_inversion_hash() is able to construct closures
12882         # without having to worry about F mappings.
12883         if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12884             $_ = "$range; Case_Folding; "
12885                  . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12886         }
12887         else {
12888             $_ = "";
12889             $file->carp_bad_line('Expecting C F I S or T in second field');
12890         }
12891
12892         # C and S are simple foldings, but simple case folding is not needed
12893         # unless we explicitly want its map table output.
12894         if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12895             $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12896         }
12897
12898         return;
12899     }
12900
12901 } # End case fold closure
12902
12903 sub filter_jamo_line {
12904     # Filter Jamo.txt lines.  This routine mainly is used to populate hashes
12905     # from this file that is used in generating the Name property for Jamo
12906     # code points.  But, it also is used to convert early versions' syntax
12907     # into the modern form.  Here are two examples:
12908     # 1100; G   # HANGUL CHOSEONG KIYEOK            # Modern syntax
12909     # U+1100; G; HANGUL CHOSEONG KIYEOK             # 2.0 syntax
12910     #
12911     # The input is $_, the output is $_ filtered.
12912
12913     my @fields = split /\s*;\s*/, $_, -1;  # -1 => retain trailing null fields
12914
12915     # Let the caller handle unexpected input.  In earlier versions, there was
12916     # a third field which is supposed to be a comment, but did not have a '#'
12917     # before it.
12918     return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12919
12920     $fields[0] =~ s/^U\+//;     # Also, early versions had this extraneous
12921                                 # beginning.
12922
12923     # Some 2.1 versions had this wrong.  Causes havoc with the algorithm.
12924     $fields[1] = 'R' if $fields[0] eq '1105';
12925
12926     # Add to structure so can generate Names from it.
12927     my $cp = hex $fields[0];
12928     my $short_name = $fields[1];
12929     $Jamo{$cp} = $short_name;
12930     if ($cp <= $LBase + $LCount) {
12931         $Jamo_L{$short_name} = $cp - $LBase;
12932     }
12933     elsif ($cp <= $VBase + $VCount) {
12934         $Jamo_V{$short_name} = $cp - $VBase;
12935     }
12936     elsif ($cp <= $TBase + $TCount) {
12937         $Jamo_T{$short_name} = $cp - $TBase;
12938     }
12939     else {
12940         Carp::my_carp_bug("Unexpected Jamo code point in $_");
12941     }
12942
12943
12944     # Reassemble using just the first two fields to look like a typical
12945     # property file line
12946     $_ = "$fields[0]; $fields[1]";
12947
12948     return;
12949 }
12950
12951 sub register_fraction($) {
12952     # This registers the input rational number so that it can be passed on to
12953     # utf8_heavy.pl, both in rational and floating forms.
12954
12955     my $rational = shift;
12956
12957     my $float = eval $rational;
12958     $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
12959     if (   defined $nv_floating_to_rational{$float}
12960         && $nv_floating_to_rational{$float} ne $rational)
12961     {
12962         die Carp::my_carp_bug("Both '$rational' and"
12963                             . " '$nv_floating_to_rational{$float}' evaluate to"
12964                             . " the same floating point number."
12965                             . "  \$E_FLOAT_PRECISION must be increased");
12966     }
12967     $nv_floating_to_rational{$float} = $rational;
12968     return;
12969 }
12970
12971 sub gcd($$) {   # Greatest-common-divisor; from
12972                 # http://en.wikipedia.org/wiki/Euclidean_algorithm
12973     my ($a, $b) = @_;
12974
12975     use integer;
12976
12977     while ($b != 0) {
12978        my $temp = $b;
12979        $b = $a % $b;
12980        $a = $temp;
12981     }
12982     return $a;
12983 }
12984
12985 sub reduce_fraction($) {
12986     my $fraction_ref = shift;
12987
12988     # Reduce a fraction to lowest terms.  The Unicode data may be reducible,
12989     # hence this is needed.  The argument is a reference to the
12990     # string denoting the fraction, which must be of the form:
12991     if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12992         Carp::my_carp_bug("Non-fraction input '$$fraction_ref'.  Unchanged");
12993         return;
12994     }
12995
12996     my $sign = $1;
12997     my $numerator = $2;
12998     my $denominator = $3;
12999
13000     use integer;
13001
13002     # Find greatest common divisor
13003     my $gcd = gcd($numerator, $denominator);
13004
13005     # And reduce using the gcd.
13006     if ($gcd != 1) {
13007         $numerator    /= $gcd;
13008         $denominator  /= $gcd;
13009         $$fraction_ref = "$sign$numerator/$denominator";
13010     }
13011
13012     return;
13013 }
13014
13015 sub filter_numeric_value_line {
13016     # DNumValues contains lines of a different syntax than the typical
13017     # property file:
13018     # 0F33          ; -0.5 ; ; -1/2 # No       TIBETAN DIGIT HALF ZERO
13019     #
13020     # This routine transforms $_ containing the anomalous syntax to the
13021     # typical, by filtering out the extra columns, and convert early version
13022     # decimal numbers to strings that look like rational numbers.
13023
13024     my $file = shift;
13025     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13026
13027     # Starting in 5.1, there is a rational field.  Just use that, omitting the
13028     # extra columns.  Otherwise convert the decimal number in the second field
13029     # to a rational, and omit extraneous columns.
13030     my @fields = split /\s*;\s*/, $_, -1;
13031     my $rational;
13032
13033     if ($v_version ge v5.1.0) {
13034         if (@fields != 4) {
13035             $file->carp_bad_line('Not 4 semi-colon separated fields');
13036             $_ = "";
13037             return;
13038         }
13039         reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
13040         $rational = $fields[3];
13041
13042         $_ = join '; ', @fields[ 0, 3 ];
13043     }
13044     else {
13045
13046         # Here, is an older Unicode file, which has decimal numbers instead of
13047         # rationals in it.  Use the fraction to calculate the denominator and
13048         # convert to rational.
13049
13050         if (@fields != 2 && @fields != 3) {
13051             $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
13052             $_ = "";
13053             return;
13054         }
13055
13056         my $codepoints = $fields[0];
13057         my $decimal = $fields[1];
13058         if ($decimal =~ s/\.0+$//) {
13059
13060             # Anything ending with a decimal followed by nothing but 0's is an
13061             # integer
13062             $_ = "$codepoints; $decimal";
13063             $rational = $decimal;
13064         }
13065         else {
13066
13067             my $denominator;
13068             if ($decimal =~ /\.50*$/) {
13069                 $denominator = 2;
13070             }
13071
13072             # Here have the hardcoded repeating decimals in the fraction, and
13073             # the denominator they imply.  There were only a few denominators
13074             # in the older Unicode versions of this file which this code
13075             # handles, so it is easy to convert them.
13076
13077             # The 4 is because of a round-off error in the Unicode 3.2 files
13078             elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
13079                 $denominator = 3;
13080             }
13081             elsif ($decimal =~ /\.[27]50*$/) {
13082                 $denominator = 4;
13083             }
13084             elsif ($decimal =~ /\.[2468]0*$/) {
13085                 $denominator = 5;
13086             }
13087             elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
13088                 $denominator = 6;
13089             }
13090             elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
13091                 $denominator = 8;
13092             }
13093             if ($denominator) {
13094                 my $sign = ($decimal < 0) ? "-" : "";
13095                 my $numerator = int((abs($decimal) * $denominator) + .5);
13096                 $rational = "$sign$numerator/$denominator";
13097                 $_ = "$codepoints; $rational";
13098             }
13099             else {
13100                 $file->carp_bad_line("Can't cope with number '$decimal'.");
13101                 $_ = "";
13102                 return;
13103             }
13104         }
13105     }
13106
13107     register_fraction($rational) if $rational =~ qr{/};
13108     return;
13109 }
13110
13111 { # Closure
13112     my %unihan_properties;
13113
13114     sub construct_unihan {
13115
13116         my $file_object = shift;
13117
13118         return unless file_exists($file_object->file);
13119
13120         if ($v_version lt v4.0.0) {
13121             push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
13122             push @cjk_property_values, split "\n", <<'END';
13123 # @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
13124 END
13125         }
13126
13127         if ($v_version ge v3.0.0) {
13128             push @cjk_properties, split "\n", <<'END';
13129 cjkIRG_GSource; kIRG_GSource
13130 cjkIRG_JSource; kIRG_JSource
13131 cjkIRG_KSource; kIRG_KSource
13132 cjkIRG_TSource; kIRG_TSource
13133 cjkIRG_VSource; kIRG_VSource
13134 END
13135         push @cjk_property_values, split "\n", <<'END';
13136 # @missing: 0000..10FFFF; cjkIRG_GSource; <none>
13137 # @missing: 0000..10FFFF; cjkIRG_JSource; <none>
13138 # @missing: 0000..10FFFF; cjkIRG_KSource; <none>
13139 # @missing: 0000..10FFFF; cjkIRG_TSource; <none>
13140 # @missing: 0000..10FFFF; cjkIRG_VSource; <none>
13141 END
13142         }
13143         if ($v_version ge v3.1.0) {
13144             push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
13145             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
13146         }
13147         if ($v_version ge v3.1.1) {
13148             push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
13149             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
13150         }
13151         if ($v_version ge v3.2.0) {
13152             push @cjk_properties, split "\n", <<'END';
13153 cjkAccountingNumeric; kAccountingNumeric
13154 cjkCompatibilityVariant; kCompatibilityVariant
13155 cjkOtherNumeric; kOtherNumeric
13156 cjkPrimaryNumeric; kPrimaryNumeric
13157 END
13158             push @cjk_property_values, split "\n", <<'END';
13159 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
13160 # @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
13161 # @missing: 0000..10FFFF; cjkOtherNumeric; NaN
13162 # @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
13163 END
13164         }
13165         if ($v_version gt v4.0.0) {
13166             push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
13167             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
13168         }
13169
13170         if ($v_version ge v4.1.0) {
13171             push @cjk_properties, 'cjkIICore ; kIICore';
13172             push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
13173         }
13174     }
13175
13176     sub setup_unihan {
13177         # Do any special setup for Unihan properties.
13178
13179         # This property gives the wrong computed type, so override.
13180         my $usource = property_ref('kIRG_USource');
13181         $usource->set_type($STRING) if defined $usource;
13182
13183         # This property is to be considered binary (it says so in
13184         # http://www.unicode.org/reports/tr38/)
13185         my $iicore = property_ref('kIICore');
13186         if (defined $iicore) {
13187             $iicore->set_type($FORCED_BINARY);
13188             $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
13189
13190             # Unicode doesn't include the maps for this property, so don't
13191             # warn that they are missing.
13192             $iicore->set_pre_declared_maps(0);
13193             $iicore->add_comment(join_lines( <<END
13194 This property contains string values, but any non-empty ones are considered to
13195 be 'core', so Perl creates tables for both: 1) its string values, plus 2)
13196 tables so that \\p{kIICore} matches any code point which has a non-empty
13197 value for this property.
13198 END
13199             ));
13200         }
13201
13202         return;
13203     }
13204
13205     sub filter_unihan_line {
13206         # Change unihan db lines to look like the others in the db.  Here is
13207         # an input sample:
13208         #   U+341C        kCangjie        IEKN
13209
13210         # Tabs are used instead of semi-colons to separate fields; therefore
13211         # they may have semi-colons embedded in them.  Change these to periods
13212         # so won't screw up the rest of the code.
13213         s/;/./g;
13214
13215         # Remove lines that don't look like ones we accept.
13216         if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
13217             $_ = "";
13218             return;
13219         }
13220
13221         # Extract the property, and save a reference to its object.
13222         my $property = $1;
13223         if (! exists $unihan_properties{$property}) {
13224             $unihan_properties{$property} = property_ref($property);
13225         }
13226
13227         # Don't do anything unless the property is one we're handling, which
13228         # we determine by seeing if there is an object defined for it or not
13229         if (! defined $unihan_properties{$property}) {
13230             $_ = "";
13231             return;
13232         }
13233
13234         # Convert the tab separators to our standard semi-colons, and convert
13235         # the U+HHHH notation to the rest of the standard's HHHH
13236         s/\t/;/g;
13237         s/\b U \+ (?= $code_point_re )//xg;
13238
13239         #local $to_trace = 1 if main::DEBUG;
13240         trace $_ if main::DEBUG && $to_trace;
13241
13242         return;
13243     }
13244 }
13245
13246 sub filter_blocks_lines {
13247     # In the Blocks.txt file, the names of the blocks don't quite match the
13248     # names given in PropertyValueAliases.txt, so this changes them so they
13249     # do match:  Blanks and hyphens are changed into underscores.  Also makes
13250     # early release versions look like later ones
13251     #
13252     # $_ is transformed to the correct value.
13253
13254     my $file = shift;
13255         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13256
13257     if ($v_version lt v3.2.0) {
13258         if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
13259             $_ = "";
13260             return;
13261         }
13262
13263         # Old versions used a different syntax to mark the range.
13264         $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
13265     }
13266
13267     my @fields = split /\s*;\s*/, $_, -1;
13268     if (@fields != 2) {
13269         $file->carp_bad_line("Expecting exactly two fields");
13270         $_ = "";
13271         return;
13272     }
13273
13274     # Change hyphens and blanks in the block name field only
13275     $fields[1] =~ s/[ -]/_/g;
13276     $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg;   # Capitalize first letter of word
13277
13278     $_ = join("; ", @fields);
13279     return;
13280 }
13281
13282 { # Closure
13283     my $current_property;
13284
13285     sub filter_old_style_proplist {
13286         # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
13287         # was in a completely different syntax.  Ken Whistler of Unicode says
13288         # that it was something he used as an aid for his own purposes, but
13289         # was never an official part of the standard.  Many of the properties
13290         # in it were incorporated into the later PropList.txt, but some were
13291         # not.  This program uses this early file to generate property tables
13292         # that are otherwise not accessible in the early UCD's.  It does this
13293         # for the ones that eventually became official, and don't appear to be
13294         # too different in their contents from the later official version, and
13295         # throws away the rest.  It could be argued that the ones it generates
13296         # were probably not really official at that time, so should be
13297         # ignored.  You can easily modify things to skip all of them by
13298         # changing this function to just set $_ to "", and return; and to skip
13299         # certain of them by by simply removing their declarations from
13300         # get_old_property_aliases().
13301         #
13302         # Here is a list of all the ones that are thrown away:
13303         #   Alphabetic                   The definitions for this are very
13304         #                                defective, so better to not mislead
13305         #                                people into thinking it works.
13306         #                                Instead the Perl extension of the
13307         #                                same name is constructed from first
13308         #                                principles.
13309         #   Bidi=*                       duplicates UnicodeData.txt
13310         #   Combining                    never made into official property;
13311         #                                is \P{ccc=0}
13312         #   Composite                    never made into official property.
13313         #   Currency Symbol              duplicates UnicodeData.txt: gc=sc
13314         #   Decimal Digit                duplicates UnicodeData.txt: gc=nd
13315         #   Delimiter                    never made into official property;
13316         #                                removed in 3.0.1
13317         #   Format Control               never made into official property;
13318         #                                similar to gc=cf
13319         #   High Surrogate               duplicates Blocks.txt
13320         #   Ignorable Control            never made into official property;
13321         #                                similar to di=y
13322         #   ISO Control                  duplicates UnicodeData.txt: gc=cc
13323         #   Left of Pair                 never made into official property;
13324         #   Line Separator               duplicates UnicodeData.txt: gc=zl
13325         #   Low Surrogate                duplicates Blocks.txt
13326         #   Non-break                    was actually listed as a property
13327         #                                in 3.2, but without any code
13328         #                                points.  Unicode denies that this
13329         #                                was ever an official property
13330         #   Non-spacing                  duplicate UnicodeData.txt: gc=mn
13331         #   Numeric                      duplicates UnicodeData.txt: gc=cc
13332         #   Paired Punctuation           never made into official property;
13333         #                                appears to be gc=ps + gc=pe
13334         #   Paragraph Separator          duplicates UnicodeData.txt: gc=cc
13335         #   Private Use                  duplicates UnicodeData.txt: gc=co
13336         #   Private Use High Surrogate   duplicates Blocks.txt
13337         #   Punctuation                  duplicates UnicodeData.txt: gc=p
13338         #   Space                        different definition than eventual
13339         #                                one.
13340         #   Titlecase                    duplicates UnicodeData.txt: gc=lt
13341         #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cn
13342         #   Zero-width                   never made into official property;
13343         #                                subset of gc=cf
13344         # Most of the properties have the same names in this file as in later
13345         # versions, but a couple do not.
13346         #
13347         # This subroutine filters $_, converting it from the old style into
13348         # the new style.  Here's a sample of the old-style
13349         #
13350         #   *******************************************
13351         #
13352         #   Property dump for: 0x100000A0 (Join Control)
13353         #
13354         #   200C..200D  (2 chars)
13355         #
13356         # In the example, the property is "Join Control".  It is kept in this
13357         # closure between calls to the subroutine.  The numbers beginning with
13358         # 0x were internal to Ken's program that generated this file.
13359
13360         # If this line contains the property name, extract it.
13361         if (/^Property dump for: [^(]*\((.*)\)/) {
13362             $_ = $1;
13363
13364             # Convert white space to underscores.
13365             s/ /_/g;
13366
13367             # Convert the few properties that don't have the same name as
13368             # their modern counterparts
13369             s/Identifier_Part/ID_Continue/
13370             or s/Not_a_Character/NChar/;
13371
13372             # If the name matches an existing property, use it.
13373             if (defined property_ref($_)) {
13374                 trace "new property=", $_ if main::DEBUG && $to_trace;
13375                 $current_property = $_;
13376             }
13377             else {        # Otherwise discard it
13378                 trace "rejected property=", $_ if main::DEBUG && $to_trace;
13379                 undef $current_property;
13380             }
13381             $_ = "";    # The property is saved for the next lines of the
13382                         # file, but this defining line is of no further use,
13383                         # so clear it so that the caller won't process it
13384                         # further.
13385         }
13386         elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13387
13388             # Here, the input line isn't a header defining a property for the
13389             # following section, and either we aren't in such a section, or
13390             # the line doesn't look like one that defines the code points in
13391             # such a section.  Ignore this line.
13392             $_ = "";
13393         }
13394         else {
13395
13396             # Here, we have a line defining the code points for the current
13397             # stashed property.  Anything starting with the first blank is
13398             # extraneous.  Otherwise, it should look like a normal range to
13399             # the caller.  Append the property name so that it looks just like
13400             # a modern PropList entry.
13401
13402             $_ =~ s/\s.*//;
13403             $_ .= "; $current_property";
13404         }
13405         trace $_ if main::DEBUG && $to_trace;
13406         return;
13407     }
13408 } # End closure for old style proplist
13409
13410 sub filter_old_style_normalization_lines {
13411     # For early releases of Unicode, the lines were like:
13412     #        74..2A76    ; NFKD_NO
13413     # For later releases this became:
13414     #        74..2A76    ; NFKD_QC; N
13415     # Filter $_ to look like those in later releases.
13416     # Similarly for MAYBEs
13417
13418     s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13419
13420     # Also, the property FC_NFKC was abbreviated to FNC
13421     s/FNC/FC_NFKC/;
13422     return;
13423 }
13424
13425 sub setup_script_extensions {
13426     # The Script_Extensions property starts out with a clone of the Script
13427     # property.
13428
13429     $scx = property_ref("Script_Extensions");
13430     return unless defined $scx;
13431
13432     $scx->_set_format($STRING_WHITE_SPACE_LIST);
13433     $scx->initialize($script);
13434     $scx->set_default_map($script->default_map);
13435     $scx->set_pre_declared_maps(0);     # PropValueAliases doesn't list these
13436     $scx->add_comment(join_lines( <<END
13437 The values for code points that appear in one script are just the same as for
13438 the 'Script' property.  Likewise the values for those that appear in many
13439 scripts are either 'Common' or 'Inherited', same as with 'Script'.  But the
13440 values of code points that appear in a few scripts are a space separated list
13441 of those scripts.
13442 END
13443     ));
13444
13445     # Initialize scx's tables and the aliases for them to be the same as sc's
13446     foreach my $table ($script->tables) {
13447         my $scx_table = $scx->add_match_table($table->name,
13448                                 Full_Name => $table->full_name);
13449         foreach my $alias ($table->aliases) {
13450             $scx_table->add_alias($alias->name);
13451         }
13452     }
13453 }
13454
13455 sub  filter_script_extensions_line {
13456     # The Scripts file comes with the full name for the scripts; the
13457     # ScriptExtensions, with the short name.  The final mapping file is a
13458     # combination of these, and without adjustment, would have inconsistent
13459     # entries.  This filters the latter file to convert to full names.
13460     # Entries look like this:
13461     # 064B..0655    ; Arab Syrc # Mn  [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13462
13463     my @fields = split /\s*;\s*/;
13464
13465     # This script was erroneously omitted in this Unicode version.
13466     $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13467
13468     my @full_names;
13469     foreach my $short_name (split " ", $fields[1]) {
13470         push @full_names, $script->table($short_name)->full_name;
13471     }
13472     $fields[1] = join " ", @full_names;
13473     $_ = join "; ", @fields;
13474
13475     return;
13476 }
13477
13478 sub generate_hst {
13479
13480     # Populates the Hangul Syllable Type property from first principles
13481
13482     my $file= shift;
13483     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13484
13485     # These few ranges are hard-coded in.
13486     $file->insert_lines(split /\n/, <<'END'
13487 1100..1159    ; L
13488 115F          ; L
13489 1160..11A2    ; V
13490 11A8..11F9    ; T
13491 END
13492 );
13493
13494     # The Hangul syllables in version 1 are at different code points than
13495     # those that came along starting in version 2, and have different names;
13496     # they comprise about 60% of the code points of the later version.
13497     # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13498     # initial set is a subset of the later version, with different English
13499     # transliterations.  I did not see an easy mapping between them.  The
13500     # later set includes essentially all possibilities, even ones that aren't
13501     # in modern use (if they ever were), and over 96% of the new ones are type
13502     # LVT.  Mathematically, the early set must also contain a preponderance of
13503     # LVT values.  In lieu of doing nothing, we just set them all to LVT, and
13504     # expect that this will be right most of the time, which is better than
13505     # not being right at all.
13506     if ($v_version lt v2.0.0) {
13507         my $property = property_ref($file->property);
13508         $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13509                                     $FIRST_REMOVED_HANGUL_SYLLABLE,
13510                                     $FINAL_REMOVED_HANGUL_SYLLABLE));
13511         push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13512         return;
13513     }
13514
13515     # The algorithmically derived syllables are almost all LVT ones, so
13516     # initialize the whole range with that.
13517     $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13518                         $SBase, $SBase + $SCount -1);
13519
13520     # Those ones that aren't LVT are LV, and they occur at intervals of
13521     # $TCount code points, starting with the first code point, at $SBase.
13522     for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13523         $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13524     }
13525
13526     return;
13527 }
13528
13529 sub generate_GCB {
13530
13531     # Populates the Grapheme Cluster Break property from first principles
13532
13533     my $file= shift;
13534     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13535
13536     # All these definitions are from
13537     # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13538     # from http://www.unicode.org/reports/tr29/tr29-4.html
13539
13540     foreach my $range ($gc->ranges) {
13541
13542         # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13543         # and gc=Cf
13544         if ($range->value =~ / ^ M [en] $ /x) {
13545             $file->insert_lines(sprintf "%04X..%04X; Extend",
13546                                 $range->start,  $range->end);
13547         }
13548         elsif ($range->value =~ / ^ C [cf] $ /x) {
13549             $file->insert_lines(sprintf "%04X..%04X; Control",
13550                                 $range->start,  $range->end);
13551         }
13552     }
13553     $file->insert_lines("2028; Control"); # Line Separator
13554     $file->insert_lines("2029; Control"); # Paragraph Separator
13555
13556     $file->insert_lines("000D; CR");
13557     $file->insert_lines("000A; LF");
13558
13559     # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13560     foreach my $code_point ( qw{
13561                                 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13562                                 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13563                                 }
13564     ) {
13565         my $category = $gc->value_of(hex $code_point);
13566         next if ! defined $category || $category eq 'Cn'; # But not if
13567                                                           # unassigned in this
13568                                                           # release
13569         $file->insert_lines("$code_point; Extend");
13570     }
13571
13572     my $hst = property_ref('Hangul_Syllable_Type');
13573     if ($hst->count > 0) {
13574         foreach my $range ($hst->ranges) {
13575             $file->insert_lines(sprintf "%04X..%04X; %s",
13576                                     $range->start, $range->end, $range->value);
13577         }
13578     }
13579     else {
13580         generate_hst($file);
13581     }
13582
13583     main::process_generic_property_file($file);
13584 }
13585
13586
13587 sub fixup_early_perl_name_alias {
13588
13589     # Different versions of Unicode have varying support for the name synonyms
13590     # below.  Just include everything.  As of 6.1, all these are correct in
13591     # the Unicode-supplied file.
13592
13593     my $file= shift;
13594     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13595
13596
13597     # ALERT did not come along until 6.0, at which point it became preferred
13598     # over BELL.  By inserting it last in early releases, BELL is preferred
13599     # over it; and vice-vers in 6.0
13600     my $type_for_bell = ($v_version lt v6.0.0)
13601                ? 'correction'
13602                : 'alternate';
13603     $file->insert_lines(split /\n/, <<END
13604 0007;BELL; $type_for_bell
13605 000A;LINE FEED (LF);alternate
13606 000C;FORM FEED (FF);alternate
13607 000D;CARRIAGE RETURN (CR);alternate
13608 0085;NEXT LINE (NEL);alternate
13609 END
13610
13611     );
13612
13613     # One might think that the the 'Unicode_1_Name' field, could work for most
13614     # of the above names, but sadly that field varies depending on the
13615     # release.  Version 1.1.5 had no names for any of the controls; Version
13616     # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13617     # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13618     #   changed to parenthesized versions like "NEXT LINE" to
13619     #       "NEXT LINE (NEL)";
13620     #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13621     #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13622     #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13623     #
13624     # All these are present in the 6.1 NameAliases.txt
13625
13626     return;
13627 }
13628
13629 sub filter_later_version_name_alias_line {
13630
13631     # This file has an extra entry per line for the alias type.  This is
13632     # handled by creating a compound entry: "$alias: $type";  First, split
13633     # the line into components.
13634     my ($range, $alias, $type, @remainder)
13635         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13636
13637     # This file contains multiple entries for some components, so tell the
13638     # downstream code to allow this in our internal tables; the
13639     # $MULTIPLE_AFTER preserves the input ordering.
13640     $_ = join ";", $range, $CMD_DELIM
13641                            . $REPLACE_CMD
13642                            . '='
13643                            . $MULTIPLE_AFTER
13644                            . $CMD_DELIM
13645                            . "$alias: $type",
13646                    @remainder;
13647     return;
13648 }
13649
13650 sub filter_early_version_name_alias_line {
13651
13652     # Early versions did not have the trailing alias type field; implicitly it
13653     # was 'correction'.
13654     $_ .= "; correction";
13655
13656     filter_later_version_name_alias_line;
13657     return;
13658 }
13659
13660 sub filter_all_caps_script_names {
13661
13662     # Some early Unicode releases had the script names in all CAPS.  This
13663     # converts them to just the first letter of each word being capital.
13664
13665     my ($range, $script, @remainder)
13666         = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13667     my @words = split /[_-]/, $script;
13668     for my $word (@words) {
13669         $word =
13670             ucfirst(lc($word)) if $word ne 'CJK';
13671     }
13672     $script = join "_", @words;
13673     $_ = join ";", $range, $script, @remainder;
13674 }
13675
13676 sub finish_Unicode() {
13677     # This routine should be called after all the Unicode files have been read
13678     # in.  It:
13679     # 1) Creates properties that are missing from the version of Unicode being
13680     #    compiled, and which, for whatever reason, are needed for the Perl
13681     #    core to function properly.  These are minimally populated as
13682     #    necessary.
13683     # 2) Adds the mappings for code points missing from the files which have
13684     #    defaults specified for them.
13685     # 3) At this this point all mappings are known, so it computes the type of
13686     #    each property whose type hasn't been determined yet.
13687     # 4) Calculates all the regular expression match tables based on the
13688     #    mappings.
13689     # 5) Calculates and adds the tables which are defined by Unicode, but
13690     #    which aren't derived by them, and certain derived tables that Perl
13691     #    uses.
13692
13693     # Folding information was introduced later into Unicode data.  To get
13694     # Perl's case ignore (/i) to work at all in releases that don't have
13695     # folding, use the best available alternative, which is lower casing.
13696     my $fold = property_ref('Case_Folding');
13697     if ($fold->is_empty) {
13698         $fold->initialize(property_ref('Lowercase_Mapping'));
13699         $fold->add_note(join_lines(<<END
13700 WARNING: This table uses lower case as a substitute for missing fold
13701 information
13702 END
13703         ));
13704     }
13705
13706     # Multiple-character mapping was introduced later into Unicode data, so it
13707     # is by default the simple version.  If to output the simple versions and
13708     # not present, just use the regular (which in these Unicode versions is
13709     # the simple as well).
13710     foreach my $map (qw {   Uppercase_Mapping
13711                             Lowercase_Mapping
13712                             Titlecase_Mapping
13713                             Case_Folding
13714                         } )
13715     {
13716         my $comment = <<END;
13717
13718 Note that although the Perl core uses this file, it has the standard values
13719 for code points from U+0000 to U+00FF compiled in, so changing this table will
13720 not change the core's behavior with respect to these code points.  Use
13721 Unicode::Casing to override this table.
13722 END
13723         if ($map eq 'Case_Folding') {
13724             $comment .= <<END;
13725 (/i regex matching is not overridable except by using a custom regex engine)
13726 END
13727         }
13728         property_ref($map)->add_comment(join_lines($comment));
13729         my $simple = property_ref("Simple_$map");
13730         next if ! $simple->is_empty;
13731         if ($simple->to_output_map) {
13732             $simple->initialize(property_ref($map));
13733         }
13734         else {
13735             property_ref($map)->set_proxy_for($simple->name);
13736         }
13737     }
13738
13739     # For each property, fill in any missing mappings, and calculate the re
13740     # match tables.  If a property has more than one missing mapping, the
13741     # default is a reference to a data structure, and may require data from
13742     # other properties to resolve.  The sort is used to cause these to be
13743     # processed last, after all the other properties have been calculated.
13744     # (Fortunately, the missing properties so far don't depend on each other.)
13745     foreach my $property
13746         (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13747         property_ref('*'))
13748     {
13749         # $perl has been defined, but isn't one of the Unicode properties that
13750         # need to be finished up.
13751         next if $property == $perl;
13752
13753         # Nor do we need to do anything with properties that aren't going to
13754         # be output.
13755         next if $property->fate == $SUPPRESSED;
13756
13757         # Handle the properties that have more than one possible default
13758         if (ref $property->default_map) {
13759             my $default_map = $property->default_map;
13760
13761             # These properties have stored in the default_map:
13762             # One or more of:
13763             #   1)  A default map which applies to all code points in a
13764             #       certain class
13765             #   2)  an expression which will evaluate to the list of code
13766             #       points in that class
13767             # And
13768             #   3) the default map which applies to every other missing code
13769             #      point.
13770             #
13771             # Go through each list.
13772             while (my ($default, $eval) = $default_map->get_next_defaults) {
13773
13774                 # Get the class list, and intersect it with all the so-far
13775                 # unspecified code points yielding all the code points
13776                 # in the class that haven't been specified.
13777                 my $list = eval $eval;
13778                 if ($@) {
13779                     Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13780                     last;
13781                 }
13782
13783                 # Narrow down the list to just those code points we don't have
13784                 # maps for yet.
13785                 $list = $list & $property->inverse_list;
13786
13787                 # Add mappings to the property for each code point in the list
13788                 foreach my $range ($list->ranges) {
13789                     $property->add_map($range->start, $range->end, $default,
13790                     Replace => $CROAK);
13791                 }
13792             }
13793
13794             # All remaining code points have the other mapping.  Set that up
13795             # so the normal single-default mapping code will work on them
13796             $property->set_default_map($default_map->other_default);
13797
13798             # And fall through to do that
13799         }
13800
13801         # We should have enough data now to compute the type of the property.
13802         my $property_name = $property->name;
13803         $property->compute_type;
13804         my $property_type = $property->type;
13805
13806         next if ! $property->to_create_match_tables;
13807
13808         # Here want to create match tables for this property
13809
13810         # The Unicode db always (so far, and they claim into the future) have
13811         # the default for missing entries in binary properties be 'N' (unless
13812         # there is a '@missing' line that specifies otherwise)
13813         if (! defined $property->default_map) {
13814             if ($property_type == $BINARY) {
13815                 $property->set_default_map('N');
13816             }
13817             elsif ($property_type == $ENUM) {
13818                 Carp::my_carp("Property '$property_name doesn't have a default mapping.  Using a fake one");
13819                 $property->set_default_map('XXX This makes sure there is a default map');
13820             }
13821         }
13822
13823         # Add any remaining code points to the mapping, using the default for
13824         # missing code points.
13825         my $default_table;
13826         my $default_map = $property->default_map;
13827         if ($property_type == $FORCED_BINARY) {
13828
13829             # A forced binary property creates a 'Y' table that matches all
13830             # non-default values.  The actual string values are also written out
13831             # as a map table.  (The default value will almost certainly be the
13832             # empty string, so the pod glosses over the distinction, and just
13833             # talks about empty vs non-empty.)
13834             my $yes = $property->table("Y");
13835             foreach my $range ($property->ranges) {
13836                 next if $range->value eq $default_map;
13837                 $yes->add_range($range->start, $range->end);
13838             }
13839             $property->table("N")->set_complement($yes);
13840         }
13841         else {
13842             if (defined $default_map) {
13843
13844                 # Make sure there is a match table for the default
13845                 if (! defined ($default_table = $property->table($default_map)))
13846                 {
13847                     $default_table = $property->add_match_table($default_map);
13848                 }
13849
13850                 # And, if the property is binary, the default table will just
13851                 # be the complement of the other table.
13852                 if ($property_type == $BINARY) {
13853                     my $non_default_table;
13854
13855                     # Find the non-default table.
13856                     for my $table ($property->tables) {
13857                         if ($table == $default_table) {
13858                             if ($v_version le v5.0.0) {
13859                                 $table->add_alias($_) for qw(N No F False);
13860                             }
13861                             next;
13862                         } elsif ($v_version le v5.0.0) {
13863                             $table->add_alias($_) for qw(Y Yes T True);
13864                         }
13865                         $non_default_table = $table;
13866                     }
13867                     $default_table->set_complement($non_default_table);
13868                 }
13869                 else {
13870
13871                     # This fills in any missing values with the default.  It's
13872                     # not necessary to do this with binary properties, as the
13873                     # default is defined completely in terms of the Y table.
13874                     $property->add_map(0, $MAX_WORKING_CODEPOINT,
13875                                     $default_map, Replace => $NO);
13876                 }
13877             }
13878
13879             # Have all we need to populate the match tables.
13880             my $maps_should_be_defined = $property->pre_declared_maps;
13881             foreach my $range ($property->ranges) {
13882                 my $map = $range->value;
13883                 my $table = $property->table($map);
13884                 if (! defined $table) {
13885
13886                     # Integral and rational property values are not
13887                     # necessarily defined in PropValueAliases, but whether all
13888                     # the other ones should be depends on the property.
13889                     if ($maps_should_be_defined
13890                         && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13891                     {
13892                         Carp::my_carp("Table '$property_name=$map' should "
13893                                     . "have been defined.  Defining it now.")
13894                     }
13895                     $table = $property->add_match_table($map);
13896                 }
13897
13898                 next if $table->complement != 0; # Don't need to populate these
13899                 $table->add_range($range->start, $range->end);
13900             }
13901         }
13902
13903         # For Perl 5.6 compatibility, all properties matchable in regexes can
13904         # have an optional 'Is_' prefix.  This is now done in utf8_heavy.pl.
13905         # But warn if this creates a conflict with a (new) Unicode property
13906         # name, although it appears that Unicode has made a decision never to
13907         # begin a property name with 'Is_', so this shouldn't happen.
13908         foreach my $alias ($property->aliases) {
13909             my $Is_name = 'Is_' . $alias->name;
13910             if (defined (my $pre_existing = property_ref($Is_name))) {
13911                 Carp::my_carp(<<END
13912 There is already an alias named $Is_name (from " . $pre_existing . "), so
13913 creating one for $property won't work.  This is bad news.  If it is not too
13914 late, get Unicode to back off.  Otherwise go back to the old scheme (findable
13915 from the git blame log for this area of the code that suppressed individual
13916 aliases that conflict with the new Unicode names.  Proceeding anyway.
13917 END
13918                 );
13919             }
13920         } # End of loop through aliases for this property
13921     } # End of loop through all Unicode properties.
13922
13923     # Fill in the mappings that Unicode doesn't completely furnish.  First the
13924     # single letter major general categories.  If Unicode were to start
13925     # delivering the values, this would be redundant, but better that than to
13926     # try to figure out if should skip and not get it right.  Ths could happen
13927     # if a new major category were to be introduced, and the hard-coded test
13928     # wouldn't know about it.
13929     # This routine depends on the standard names for the general categories
13930     # being what it thinks they are, like 'Cn'.  The major categories are the
13931     # union of all the general category tables which have the same first
13932     # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13933     foreach my $minor_table ($gc->tables) {
13934         my $minor_name = $minor_table->name;
13935         next if length $minor_name == 1;
13936         if (length $minor_name != 2) {
13937             Carp::my_carp_bug("Unexpected general category '$minor_name'.  Skipped.");
13938             next;
13939         }
13940
13941         my $major_name = uc(substr($minor_name, 0, 1));
13942         my $major_table = $gc->table($major_name);
13943         $major_table += $minor_table;
13944     }
13945
13946     # LC is Ll, Lu, and Lt.  (used to be L& or L_, but PropValueAliases.txt
13947     # defines it as LC)
13948     my $LC = $gc->table('LC');
13949     $LC->add_alias('L_', Status => $DISCOURAGED);   # For backwards...
13950     $LC->add_alias('L&', Status => $DISCOURAGED);   # compatibility.
13951
13952
13953     if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13954                          # deliver the correct values in it
13955         $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13956
13957         # Lt not in release 1.
13958         if (defined $gc->table('Lt')) {
13959             $LC += $gc->table('Lt');
13960             $gc->table('Lt')->set_caseless_equivalent($LC);
13961         }
13962     }
13963     $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13964
13965     $gc->table('Ll')->set_caseless_equivalent($LC);
13966     $gc->table('Lu')->set_caseless_equivalent($LC);
13967
13968     # Create digit and case fold tables with the original file names for
13969     # backwards compatibility with applications that read them directly.
13970     my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13971                               Default_Map => "",
13972                               File => 'Digit',    # Trad. location
13973                               Directory => $map_directory,
13974                               Type => $STRING,
13975                               Replacement_Property => "Perl_Decimal_Digit",
13976                               Initialize => property_ref('Perl_Decimal_Digit'),
13977                             );
13978     $Digit->add_comment(join_lines(<<END
13979 This file gives the mapping of all code points which represent a single
13980 decimal digit [0-9] to their respective digits.  For example, the code point
13981 U+0031 (an ASCII '1') is mapped to a numeric 1.  These code points are those
13982 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13983 numerals.
13984 END
13985     ));
13986
13987     # Make sure this assumption in perl core code is valid in this Unicode
13988     # release, with known exceptions
13989     foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13990         next if $range->end - $range->start == 9;
13991         next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
13992         next if $range->end == 0x19DA && $v_version eq v5.2.0;
13993         next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13994         Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13995                     . " decimal digits.  Code in regcomp.c assumes it does,"
13996                     . " and will have to be fixed.  Proceeding anyway.");
13997     }
13998
13999     Property->new('Legacy_Case_Folding',
14000                     File => "Fold",
14001                     Directory => $map_directory,
14002                     Default_Map => $CODE_POINT,
14003                     Type => $STRING,
14004                     Replacement_Property => "Case_Folding",
14005                     Format => $HEX_FORMAT,
14006                     Initialize => property_ref('cf'),
14007     );
14008
14009     # The Script_Extensions property started out as a clone of the Script
14010     # property.  But processing its data file caused some elements to be
14011     # replaced with different data.  (These elements were for the Common and
14012     # Inherited properties.)  This data is a qw() list of all the scripts that
14013     # the code points in the given range are in.  An example line is:
14014     # 060C          ; Arab Syrc Thaa # Po       ARABIC COMMA
14015     #
14016     # The code above has created a new match table named "Arab Syrc Thaa"
14017     # which contains 060C.  (The cloned table started out with this code point
14018     # mapping to "Common".)  Now we add 060C to each of the Arab, Syrc, and
14019     # Thaa match tables.  Then we delete the now spurious "Arab Syrc Thaa"
14020     # match table.  This is repeated for all these tables and ranges.  The map
14021     # data is retained in the map table for reference, but the spurious match
14022     # tables are deleted.
14023
14024     if (defined $scx) {
14025         foreach my $table ($scx->tables) {
14026             next unless $table->name =~ /\s/;   # All the new and only the new
14027                                                 # tables have a space in their
14028                                                 # names
14029             my @scripts = split /\s+/, $table->name;
14030             foreach my $script (@scripts) {
14031                 my $script_table = $scx->table($script);
14032                 $script_table += $table;
14033             }
14034             $scx->delete_match_table($table);
14035         }
14036
14037         # Mark the scx table as the parent of the corresponding sc table for
14038         # those which are identical.  This causes the pod for the script table
14039         # to refer to the corresponding scx one.
14040         #
14041         # This has to be in a separate loop from above, so as to wait until
14042         # the tables are stabilized before checking for equivalency.
14043         if (defined $pod_directory) {
14044             foreach my $table ($scx->tables) {
14045                 my $plain_sc_equiv = $script->table($table->name);
14046                 if ($table->matches_identically_to($plain_sc_equiv)) {
14047                     $plain_sc_equiv->set_equivalent_to($table, Related => 1);
14048                 }
14049             }
14050         }
14051     }
14052
14053     return;
14054 }
14055
14056 sub pre_3_dot_1_Nl () {
14057
14058     # Return a range list for gc=nl for Unicode versions prior to 3.1, which
14059     # is when Unicode's became fully usable.  These code points were
14060     # determined by inspection and experimentation.  gc=nl is important for
14061     # certain Perl-extension properties that should be available in all
14062     # releases.
14063
14064     my $Nl = Range_List->new();
14065     if (defined (my $official = $gc->table('Nl'))) {
14066         $Nl += $official;
14067     }
14068     else {
14069         $Nl->add_range(0x2160, 0x2182);
14070         $Nl->add_range(0x3007, 0x3007);
14071         $Nl->add_range(0x3021, 0x3029);
14072     }
14073     $Nl->add_range(0xFE20, 0xFE23);
14074     $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
14075                                                             # these were added
14076     return $Nl;
14077 }
14078
14079 sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
14080                             # called before the Cn's are completely filled.
14081                             # Works on Unicodes earlier than ones that
14082                             # explicitly specify Cn.
14083     return if defined $Assigned;
14084
14085     if (! defined $gc || $gc->is_empty()) {
14086         Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
14087     }
14088
14089     $Assigned = $perl->add_match_table('Assigned',
14090                                 Description  => "All assigned code points",
14091                                 );
14092     while (defined (my $range = $gc->each_range())) {
14093         my $standard_value = standardize($range->value);
14094         next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
14095         $Assigned->add_range($range->start, $range->end);
14096     }
14097 }
14098
14099 sub calculate_DI() {    # Set $DI to a Range_List equivalent to the
14100                         # Default_Ignorable_Code_Point property.  Works on
14101                         # Unicodes earlier than ones that explicitly specify
14102                         # DI.
14103     return if defined $DI;
14104
14105     if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
14106         $DI = $di->table('Y');
14107     }
14108     else {
14109         $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
14110                                               0x2060 .. 0x206F,
14111                                               0xFE00 .. 0xFE0F,
14112                                               0xFFF0 .. 0xFFFB,
14113                                             ]);
14114         if ($v_version ge v2.0) {
14115             $DI += $gc->table('Cf')
14116                 +  $gc->table('Cs');
14117
14118             # These are above the Unicode version 1 max
14119             $DI->add_range(0xE0000, 0xE0FFF);
14120         }
14121         $DI += $gc->table('Cc')
14122              - ord("\t")
14123              - utf8::unicode_to_native(0x0A)  # LINE FEED
14124              - utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14125              - ord("\f")
14126              - utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14127              - utf8::unicode_to_native(0x85); # NEL
14128     }
14129 }
14130
14131 sub calculate_NChar() {  # Create a Perl extension match table which is the
14132                          # same as the Noncharacter_Code_Point property, and
14133                          # set $NChar to point to it.  Works on Unicodes
14134                          # earlier than ones that explicitly specify NChar
14135     return if defined $NChar;
14136
14137     $NChar = $perl->add_match_table('_Perl_Nchar',
14138                                     Perl_Extension => 1,
14139                                     Fate => $INTERNAL_ONLY);
14140     if (defined (my $off_nchar = property_ref('NChar'))) {
14141         $NChar->initialize($off_nchar->table('Y'));
14142     }
14143     else {
14144         $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
14145         if ($v_version ge v2.0) {   # First release with these nchars
14146             for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
14147                 $NChar += [ $i .. $i+1 ];
14148             }
14149         }
14150     }
14151 }
14152
14153 sub handle_compare_versions () {
14154     # This fixes things up for the $compare_versions capability, where we
14155     # compare Unicode version X with version Y (with Y > X), and we are
14156     # running it on the Unicode Data for version Y.
14157     #
14158     # It works by calculating the code points whose meaning has been specified
14159     # after release X, by using the Age property.  The complement of this set
14160     # is the set of code points whose meaning is unchanged between the
14161     # releases.  This is the set the program restricts itself to.  It includes
14162     # everything whose meaning has been specified by the time version X came
14163     # along, plus those still unassigned by the time of version Y.  (We will
14164     # continue to use the word 'assigned' to mean 'meaning has been
14165     # specified', as it's shorter and is accurate in all cases except the
14166     # Noncharacter code points.)
14167     #
14168     # This function is run after all the properties specified by Unicode have
14169     # been calculated for release Y.  This makes sure we get all the nuances
14170     # of Y's rules.  (It is done before the Perl extensions are calculated, as
14171     # those are based entirely on the Unicode ones.)  But doing it after the
14172     # Unicode table calculations means we have to fix up the Unicode tables.
14173     # We do this by subtracting the code points that have been assigned since
14174     # X (which is actually done by ANDing each table of assigned code points
14175     # with the set of unchanged code points).  Most Unicode properties are of
14176     # the form such that all unassigned code points have a default, grab-bag,
14177     # property value which is changed when the code point gets assigned.  For
14178     # these, we just remove the changed code points from the table for the
14179     # latter property value, and add them back in to the grab-bag one.  A few
14180     # other properties are not entirely of this form and have values for some
14181     # or all unassigned code points that are not the grab-bag one.  These have
14182     # to be handled specially, and are hard-coded in to this routine based on
14183     # manual inspection of the Unicode character database.  A list of the
14184     # outlier code points is made for each of these properties, and those
14185     # outliers are excluded from adding and removing from tables.
14186     #
14187     # Note that there are glitches when comparing against Unicode 1.1, as some
14188     # Hangul syllables in it were later ripped out and eventually replaced
14189     # with other things.
14190
14191     print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
14192
14193     my $after_first_version = "All matching code points were added after "
14194                             . "Unicode $string_compare_versions";
14195
14196     # Calculate the delta as those code points that have been newly assigned
14197     # since the first compare version.
14198     my $delta = Range_List->new();
14199     foreach my $table ($age->tables) {
14200         use version;
14201         next if $table == $age->table('Unassigned');
14202         next if version->parse($table->name)
14203              le version->parse($string_compare_versions);
14204         $delta += $table;
14205     }
14206     if ($delta->is_empty) {
14207         die ("No changes; perhaps you need a 'DAge.txt' file?");
14208     }
14209
14210     my $unchanged = ~ $delta;
14211
14212     calculate_Assigned() if ! defined $Assigned;
14213     $Assigned &= $unchanged;
14214
14215     # $Assigned now contains the code points that were assigned as of Unicode
14216     # version X.
14217
14218     # A block is all or nothing.  If nothing is assigned in it, it all goes
14219     # back to the No_Block pool; but if even one code point is assigned, the
14220     # block is retained.
14221     my $no_block = $block->table('No_Block');
14222     foreach my $this_block ($block->tables) {
14223         next if     $this_block == $no_block
14224                 ||  ! ($this_block & $Assigned)->is_empty;
14225         $this_block->set_fate($SUPPRESSED, $after_first_version);
14226         foreach my $range ($this_block->ranges) {
14227             $block->replace_map($range->start, $range->end, 'No_Block')
14228         }
14229         $no_block += $this_block;
14230     }
14231
14232     my @special_delta_properties;   # List of properties that have to be
14233                                     # handled specially.
14234     my %restricted_delta;           # Keys are the entries in
14235                                     # @special_delta_properties;  values
14236                                     # are the range list of the code points
14237                                     # that behave normally when they get
14238                                     # assigned.
14239
14240     # In the next three properties, the Default Ignorable code points are
14241     # outliers.
14242     calculate_DI();
14243     $DI &= $unchanged;
14244
14245     push @special_delta_properties, property_ref('_Perl_GCB');
14246     $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14247
14248     if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
14249     {
14250         push @special_delta_properties, $cwnfkcc;
14251         $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14252     }
14253
14254     calculate_NChar();      # Non-character code points
14255     $NChar &= $unchanged;
14256
14257     # This may have to be updated from time-to-time to get the most accurate
14258     # results.
14259     my $default_BC_non_LtoR = Range_List->new(Initialize =>
14260                         # These came from the comments in v8.0 DBidiClass.txt
14261                                                         [ # AL
14262                                                             0x0600 .. 0x07BF,
14263                                                             0x08A0 .. 0x08FF,
14264                                                             0xFB50 .. 0xFDCF,
14265                                                             0xFDF0 .. 0xFDFF,
14266                                                             0xFE70 .. 0xFEFF,
14267                                                             0x1EE00 .. 0x1EEFF,
14268                                                            # R
14269                                                             0x0590 .. 0x05FF,
14270                                                             0x07C0 .. 0x089F,
14271                                                             0xFB1D .. 0xFB4F,
14272                                                             0x10800 .. 0x10FFF,
14273                                                             0x1E800 .. 0x1EDFF,
14274                                                             0x1EF00 .. 0x1EFFF,
14275                                                            # ET
14276                                                             0x20A0 .. 0x20CF,
14277                                                          ]
14278                                           );
14279     $default_BC_non_LtoR += $DI + $NChar;
14280     push @special_delta_properties, property_ref('BidiClass');
14281     $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
14282
14283     if (defined (my $eaw = property_ref('East_Asian_Width'))) {
14284
14285         my $default_EA_width_W = Range_List->new(Initialize =>
14286                                     # From comments in v8.0 EastAsianWidth.txt
14287                                                 [
14288                                                     0x3400 .. 0x4DBF,
14289                                                     0x4E00 .. 0x9FFF,
14290                                                     0xF900 .. 0xFAFF,
14291                                                     0x20000 .. 0x2A6DF,
14292                                                     0x2A700 .. 0x2B73F,
14293                                                     0x2B740 .. 0x2B81F,
14294                                                     0x2B820 .. 0x2CEAF,
14295                                                     0x2F800 .. 0x2FA1F,
14296                                                     0x20000 .. 0x2FFFD,
14297                                                     0x30000 .. 0x3FFFD,
14298                                                 ]
14299                                              );
14300         push @special_delta_properties, $eaw;
14301         $restricted_delta{$special_delta_properties[-1]}
14302                                                        = ~ $default_EA_width_W;
14303
14304         # Line break came along in the same release as East_Asian_Width, and
14305         # the non-grab-bag default set is a superset of the EAW one.
14306         if (defined (my $lb = property_ref('Line_Break'))) {
14307             my $default_LB_non_XX = Range_List->new(Initialize =>
14308                                         # From comments in v8.0 LineBreak.txt
14309                                                         [ 0x20A0 .. 0x20CF ]);
14310             $default_LB_non_XX += $default_EA_width_W;
14311             push @special_delta_properties, $lb;
14312             $restricted_delta{$special_delta_properties[-1]}
14313                                                         = ~ $default_LB_non_XX;
14314         }
14315     }
14316
14317     # Go through every property, skipping those we've already worked on, those
14318     # that are immutable, and the perl ones that will be calculated after this
14319     # routine has done its fixup.
14320     foreach my $property (property_ref('*')) {
14321         next if    $property == $perl     # Done later in the program
14322                 || $property == $block    # Done just above
14323                 || $property == $DI       # Done just above
14324                 || $property == $NChar    # Done just above
14325
14326                    # The next two are invariant across Unicode versions
14327                 || $property == property_ref('Pattern_Syntax')
14328                 || $property == property_ref('Pattern_White_Space');
14329
14330         #  Find the grab-bag value.
14331         my $default_map = $property->default_map;
14332
14333         if (! $property->to_create_match_tables) {
14334
14335             # Here there aren't any match tables.  So far, all such properties
14336             # have a default map, and don't require special handling.  Just
14337             # change each newly assigned code point back to the default map,
14338             # as if they were unassigned.
14339             foreach my $range ($delta->ranges) {
14340                 $property->add_map($range->start,
14341                                 $range->end,
14342                                 $default_map,
14343                                 Replace => $UNCONDITIONALLY);
14344             }
14345         }
14346         else {  # Here there are match tables.  Find the one (if any) for the
14347                 # grab-bag value that unassigned code points go to.
14348             my $default_table;
14349             if (defined $default_map) {
14350                 $default_table = $property->table($default_map);
14351             }
14352
14353             # If some code points don't go back to the the grab-bag when they
14354             # are considered unassigned, exclude them from the list that does
14355             # that.
14356             my $this_delta = $delta;
14357             my $this_unchanged = $unchanged;
14358             if (grep { $_ == $property } @special_delta_properties) {
14359                 $this_delta = $delta & $restricted_delta{$property};
14360                 $this_unchanged = ~ $this_delta;
14361             }
14362
14363             # Fix up each match table for this property.
14364             foreach my $table ($property->tables) {
14365                 if (defined $default_table && $table == $default_table) {
14366
14367                     # The code points assigned after release X (the ones we
14368                     # are excluding in this routine) go back on to the default
14369                     # (grab-bag) table.  However, some of these tables don't
14370                     # actually exist, but are specified solely by the other
14371                     # tables.  (In a binary property, we don't need to
14372                     # actually have an 'N' table, as it's just the complement
14373                     # of the 'Y' table.)  Such tables will be locked, so just
14374                     # skip those.
14375                     $table += $this_delta unless $table->locked;
14376                 }
14377                 else {
14378
14379                     # Here the table is not for the default value.  We need to
14380                     # subtract the code points we are ignoring for this
14381                     # comparison (the deltas) from it.  But if the table
14382                     # started out with nothing, no need to exclude anything,
14383                     # and want to skip it here anyway, so it gets listed
14384                     # properly in the pod.
14385                     next if $table->is_empty;
14386
14387                     # Save the deltas for later, before we do the subtraction
14388                     my $deltas = $table & $this_delta;
14389
14390                     $table &= $this_unchanged;
14391
14392                     # Suppress the table if the subtraction left it with
14393                     # nothing in it
14394                     if ($table->is_empty) {
14395                         if ($property->type == $BINARY) {
14396                             push @tables_that_may_be_empty, $table->complete_name;
14397                         }
14398                         else {
14399                             $table->set_fate($SUPPRESSED, $after_first_version);
14400                         }
14401                     }
14402
14403                     # Now we add the removed code points to the property's
14404                     # map, as they should now map to the grab-bag default
14405                     # property (which they did in the first comparison
14406                     # version).  But we don't have to do this if the map is
14407                     # only for internal use.
14408                     if (defined $default_map && $property->to_output_map) {
14409
14410                         # The gc property has pseudo property values whose names
14411                         # have length 1.  These are the union of all the
14412                         # property values whose name is longer than 1 and
14413                         # whose first letter is all the same.  The replacement
14414                         # is done once for the longer-named tables.
14415                         next if $property == $gc && length $table->name == 1;
14416
14417                         foreach my $range ($deltas->ranges) {
14418                             $property->add_map($range->start,
14419                                             $range->end,
14420                                             $default_map,
14421                                             Replace => $UNCONDITIONALLY);
14422                         }
14423                     }
14424                 }
14425             }
14426         }
14427     }
14428
14429     # The above code doesn't work on 'gc=C', as it is a superset of the default
14430     # ('Cn') table.  It's easiest to just special case it here.
14431     my $C = $gc->table('C');
14432     $C += $gc->table('Cn');
14433
14434     return;
14435 }
14436
14437 sub compile_perl() {
14438     # Create perl-defined tables.  Almost all are part of the pseudo-property
14439     # named 'perl' internally to this program.  Many of these are recommended
14440     # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14441     # on those found there.
14442     # Almost all of these are equivalent to some Unicode property.
14443     # A number of these properties have equivalents restricted to the ASCII
14444     # range, with their names prefaced by 'Posix', to signify that these match
14445     # what the Posix standard says they should match.  A couple are
14446     # effectively this, but the name doesn't have 'Posix' in it because there
14447     # just isn't any Posix equivalent.  'XPosix' are the Posix tables extended
14448     # to the full Unicode range, by our guesses as to what is appropriate.
14449
14450     # 'All' is all code points.  As an error check, instead of just setting it
14451     # to be that, construct it to be the union of all the major categories
14452     $All = $perl->add_match_table('All',
14453       Description
14454         => "All code points, including those above Unicode.  Same as qr/./s",
14455       Matches_All => 1);
14456
14457     foreach my $major_table ($gc->tables) {
14458
14459         # Major categories are the ones with single letter names.
14460         next if length($major_table->name) != 1;
14461
14462         $All += $major_table;
14463     }
14464
14465     if ($All->max != $MAX_WORKING_CODEPOINT) {
14466         Carp::my_carp_bug("Generated highest code point ("
14467            . sprintf("%X", $All->max)
14468            . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14469     }
14470     if ($All->range_count != 1 || $All->min != 0) {
14471      Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14472     }
14473
14474     my $Any = $perl->add_match_table('Any',
14475                                     Description  => "All Unicode code points");
14476     $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14477     $Any->add_alias('Unicode');
14478
14479     calculate_Assigned();
14480
14481     my $ASCII = $perl->add_match_table('ASCII');
14482     if (defined $block) {   # This is equivalent to the block if have it.
14483         my $Unicode_ASCII = $block->table('Basic_Latin');
14484         if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14485             $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14486         }
14487     }
14488
14489     # Very early releases didn't have blocks, so initialize ASCII ourselves if
14490     # necessary
14491     if ($ASCII->is_empty) {
14492         if (! NON_ASCII_PLATFORM) {
14493             $ASCII->add_range(0, 127);
14494         }
14495         else {
14496             for my $i (0 .. 127) {
14497                 $ASCII->add_range(utf8::unicode_to_native($i),
14498                                   utf8::unicode_to_native($i));
14499             }
14500         }
14501     }
14502
14503     # Get the best available case definitions.  Early Unicode versions didn't
14504     # have Uppercase and Lowercase defined, so use the general category
14505     # instead for them, modified by hard-coding in the code points each is
14506     # missing.
14507     my $Lower = $perl->add_match_table('XPosixLower');
14508     my $Unicode_Lower = property_ref('Lowercase');
14509     if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14510         $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14511
14512     }
14513     else {
14514         $Lower += $gc->table('Lowercase_Letter');
14515
14516         # There are quite a few code points in Lower, that aren't in gc=lc,
14517         # and not all are in all releases.
14518         my $temp = Range_List->new(Initialize => [
14519                                                 utf8::unicode_to_native(0xAA),
14520                                                 utf8::unicode_to_native(0xBA),
14521                                                 0x02B0 .. 0x02B8,
14522                                                 0x02C0 .. 0x02C1,
14523                                                 0x02E0 .. 0x02E4,
14524                                                 0x0345,
14525                                                 0x037A,
14526                                                 0x1D2C .. 0x1D6A,
14527                                                 0x1D78,
14528                                                 0x1D9B .. 0x1DBF,
14529                                                 0x2071,
14530                                                 0x207F,
14531                                                 0x2090 .. 0x209C,
14532                                                 0x2170 .. 0x217F,
14533                                                 0x24D0 .. 0x24E9,
14534                                                 0x2C7C .. 0x2C7D,
14535                                                 0xA770,
14536                                                 0xA7F8 .. 0xA7F9,
14537                                 ]);
14538         $Lower += $temp & $Assigned;
14539     }
14540     my $Posix_Lower = $perl->add_match_table("PosixLower",
14541                             Initialize => $Lower & $ASCII,
14542                             );
14543
14544     my $Upper = $perl->add_match_table("XPosixUpper");
14545     my $Unicode_Upper = property_ref('Uppercase');
14546     if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14547         $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14548     }
14549     else {
14550
14551         # Unlike Lower, there are only two ranges in Upper that aren't in
14552         # gc=Lu, and all code points were assigned in all releases.
14553         $Upper += $gc->table('Uppercase_Letter');
14554         $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
14555         $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
14556     }
14557     my $Posix_Upper = $perl->add_match_table("PosixUpper",
14558                             Initialize => $Upper & $ASCII,
14559                             );
14560
14561     # Earliest releases didn't have title case.  Initialize it to empty if not
14562     # otherwise present
14563     my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14564                                        Description => '(= \p{Gc=Lt})');
14565     my $lt = $gc->table('Lt');
14566
14567     # Earlier versions of mktables had this related to $lt since they have
14568     # identical code points, but their caseless equivalents are not the same,
14569     # one being 'Cased' and the other being 'LC', and so now must be kept as
14570     # separate entities.
14571     if (defined $lt) {
14572         $Title += $lt;
14573     }
14574     else {
14575         push @tables_that_may_be_empty, $Title->complete_name;
14576     }
14577
14578     my $Unicode_Cased = property_ref('Cased');
14579     if (defined $Unicode_Cased) {
14580         my $yes = $Unicode_Cased->table('Y');
14581         my $no = $Unicode_Cased->table('N');
14582         $Title->set_caseless_equivalent($yes);
14583         if (defined $Unicode_Upper) {
14584             $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14585             $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14586         }
14587         $Upper->set_caseless_equivalent($yes);
14588         if (defined $Unicode_Lower) {
14589             $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14590             $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14591         }
14592         $Lower->set_caseless_equivalent($yes);
14593     }
14594     else {
14595         # If this Unicode version doesn't have Cased, set up the Perl
14596         # extension from first principles.  From Unicode 5.1: Definition D120:
14597         # A character C is defined to be cased if and only if C has the
14598         # Lowercase or Uppercase property or has a General_Category value of
14599         # Titlecase_Letter.
14600         my $cased = $perl->add_match_table('Cased',
14601                         Initialize => $Lower + $Upper + $Title,
14602                         Description => 'Uppercase or Lowercase or Titlecase',
14603                         );
14604         # $notcased is purely for the caseless equivalents below
14605         my $notcased = $perl->add_match_table('_Not_Cased',
14606                                 Initialize => ~ $cased,
14607                                 Fate => $INTERNAL_ONLY,
14608                                 Description => 'All not-cased code points');
14609         $Title->set_caseless_equivalent($cased);
14610         if (defined $Unicode_Upper) {
14611             $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14612             $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14613         }
14614         $Upper->set_caseless_equivalent($cased);
14615         if (defined $Unicode_Lower) {
14616             $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14617             $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14618         }
14619         $Lower->set_caseless_equivalent($cased);
14620     }
14621
14622     # The remaining perl defined tables are mostly based on Unicode TR 18,
14623     # "Annex C: Compatibility Properties".  All of these have two versions,
14624     # one whose name generally begins with Posix that is posix-compliant, and
14625     # one that matches Unicode characters beyond the Posix, ASCII range
14626
14627     my $Alpha = $perl->add_match_table('XPosixAlpha');
14628
14629     # Alphabetic was not present in early releases
14630     my $Alphabetic = property_ref('Alphabetic');
14631     if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14632         $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14633     }
14634     else {
14635
14636         # The Alphabetic property doesn't exist for early releases, so
14637         # generate it.  The actual definition, in 5.2 terms is:
14638         #
14639         # gc=L + gc=Nl + Other_Alphabetic
14640         #
14641         # Other_Alphabetic is also not defined in these early releases, but it
14642         # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14643         # those last two as well, then subtract the relatively few of them that
14644         # shouldn't have been added.  (The gc=So range is the circled capital
14645         # Latin characters.  Early releases mistakenly didn't also include the
14646         # lower-case versions of these characters, and so we don't either, to
14647         # maintain consistency with those releases that first had this
14648         # property.
14649         $Alpha->initialize($gc->table('Letter')
14650                            + pre_3_dot_1_Nl()
14651                            + $gc->table('Mn')
14652                            + $gc->table('Mc')
14653                         );
14654         $Alpha->add_range(0x24D0, 0x24E9);  # gc=So
14655         foreach my $range (     [ 0x0300, 0x0344 ],
14656                                 [ 0x0346, 0x034E ],
14657                                 [ 0x0360, 0x0362 ],
14658                                 [ 0x0483, 0x0486 ],
14659                                 [ 0x0591, 0x05AF ],
14660                                 [ 0x06DF, 0x06E0 ],
14661                                 [ 0x06EA, 0x06EC ],
14662                                 [ 0x0740, 0x074A ],
14663                                 0x093C,
14664                                 0x094D,
14665                                 [ 0x0951, 0x0954 ],
14666                                 0x09BC,
14667                                 0x09CD,
14668                                 0x0A3C,
14669                                 0x0A4D,
14670                                 0x0ABC,
14671                                 0x0ACD,
14672                                 0x0B3C,
14673                                 0x0B4D,
14674                                 0x0BCD,
14675                                 0x0C4D,
14676                                 0x0CCD,
14677                                 0x0D4D,
14678                                 0x0DCA,
14679                                 [ 0x0E47, 0x0E4C ],
14680                                 0x0E4E,
14681                                 [ 0x0EC8, 0x0ECC ],
14682                                 [ 0x0F18, 0x0F19 ],
14683                                 0x0F35,
14684                                 0x0F37,
14685                                 0x0F39,
14686                                 [ 0x0F3E, 0x0F3F ],
14687                                 [ 0x0F82, 0x0F84 ],
14688                                 [ 0x0F86, 0x0F87 ],
14689                                 0x0FC6,
14690                                 0x1037,
14691                                 0x1039,
14692                                 [ 0x17C9, 0x17D3 ],
14693                                 [ 0x20D0, 0x20DC ],
14694                                 0x20E1,
14695                                 [ 0x302A, 0x302F ],
14696                                 [ 0x3099, 0x309A ],
14697                                 [ 0xFE20, 0xFE23 ],
14698                                 [ 0x1D165, 0x1D169 ],
14699                                 [ 0x1D16D, 0x1D172 ],
14700                                 [ 0x1D17B, 0x1D182 ],
14701                                 [ 0x1D185, 0x1D18B ],
14702                                 [ 0x1D1AA, 0x1D1AD ],
14703         ) {
14704             if (ref $range) {
14705                 $Alpha->delete_range($range->[0], $range->[1]);
14706             }
14707             else {
14708                 $Alpha->delete_range($range, $range);
14709             }
14710         }
14711         $Alpha->add_description('Alphabetic');
14712         $Alpha->add_alias('Alphabetic');
14713     }
14714     my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14715                             Initialize => $Alpha & $ASCII,
14716                             );
14717     $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14718     $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14719
14720     my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14721                         Description => 'Alphabetic and (decimal) Numeric',
14722                         Initialize => $Alpha + $gc->table('Decimal_Number'),
14723                         );
14724     $perl->add_match_table("PosixAlnum",
14725                             Initialize => $Alnum & $ASCII,
14726                             );
14727
14728     my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14729                                 Description => '\w, including beyond ASCII;'
14730                                             . ' = \p{Alnum} + \pM + \p{Pc}'
14731                                             . ' + \p{Join_Control}',
14732                                 Initialize => $Alnum + $gc->table('Mark'),
14733                                 );
14734     my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14735     if (defined $Pc) {
14736         $Word += $Pc;
14737     }
14738     else {
14739         $Word += ord('_');  # Make sure this is a $Word
14740     }
14741     my $JC = property_ref('Join_Control');  # Wasn't in release 1
14742     if (defined $JC) {
14743         $Word += $JC->table('Y');
14744     }
14745     else {
14746         $Word += 0x200C + 0x200D;
14747     }
14748
14749     # This is a Perl extension, so the name doesn't begin with Posix.
14750     my $PerlWord = $perl->add_match_table('PosixWord',
14751                     Description => '\w, restricted to ASCII',
14752                     Initialize => $Word & $ASCII,
14753                     );
14754     $PerlWord->add_alias('PerlWord');
14755
14756     my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14757                                 Description => '\h, Horizontal white space',
14758
14759                                 # 200B is Zero Width Space which is for line
14760                                 # break control, and was listed as
14761                                 # Space_Separator in early releases
14762                                 Initialize => $gc->table('Space_Separator')
14763                                             +   ord("\t")
14764                                             -   0x200B, # ZWSP
14765                                 );
14766     $Blank->add_alias('HorizSpace');        # Another name for it.
14767     $perl->add_match_table("PosixBlank",
14768                             Initialize => $Blank & $ASCII,
14769                             );
14770
14771     my $VertSpace = $perl->add_match_table('VertSpace',
14772                             Description => '\v',
14773                             Initialize =>
14774                                $gc->table('Line_Separator')
14775                              + $gc->table('Paragraph_Separator')
14776                              + utf8::unicode_to_native(0x0A)  # LINE FEED
14777                              + utf8::unicode_to_native(0x0B)  # VERTICAL TAB
14778                              + ord("\f")
14779                              + utf8::unicode_to_native(0x0D)  # CARRIAGE RETURN
14780                              + utf8::unicode_to_native(0x85)  # NEL
14781                     );
14782     # No Posix equivalent for vertical space
14783
14784     my $Space = $perl->add_match_table('XPosixSpace',
14785                 Description => '\s including beyond ASCII and vertical tab',
14786                 Initialize => $Blank + $VertSpace,
14787     );
14788     $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
14789     $Space->add_alias('SpacePerl');
14790     $Space->add_alias('Space') if $v_version lt v4.1.0;
14791
14792     my $Posix_space = $perl->add_match_table("PosixSpace",
14793                             Initialize => $Space & $ASCII,
14794                             );
14795     $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14796
14797     my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14798                                         Description => 'Control characters');
14799     $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14800     $perl->add_match_table("PosixCntrl",
14801                             Description => "ASCII control characters",
14802                             Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14803                                          . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14804                                          . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14805                                          . " HT, LF, NAK, NUL, RS, SI, SO,"
14806                                          . " SOH, STX, SUB, SYN, US, VT",
14807                             Initialize => $Cntrl & $ASCII,
14808                             );
14809
14810     my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14811     my $Cs = $gc->table('Cs');
14812     if (defined $Cs && ! $Cs->is_empty) {
14813         $perl_surrogate += $Cs;
14814     }
14815     else {
14816         push @tables_that_may_be_empty, '_Perl_Surrogate';
14817     }
14818
14819     # $controls is a temporary used to construct Graph.
14820     my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14821                                                 + $gc->table('Control')
14822                                                 + $perl_surrogate);
14823
14824     # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
14825     my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14826                         Description => 'Characters that are graphical',
14827                         Initialize => ~ ($Space + $controls),
14828                         );
14829     $perl->add_match_table("PosixGraph",
14830                             Initialize => $Graph & $ASCII,
14831                             );
14832
14833     $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14834                         Description => 'Characters that are graphical plus space characters (but no controls)',
14835                         Initialize => $Blank + $Graph - $gc->table('Control'),
14836                         );
14837     $perl->add_match_table("PosixPrint",
14838                             Initialize => $print & $ASCII,
14839                             );
14840
14841     my $Punct = $perl->add_match_table('Punct');
14842     $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14843
14844     # \p{punct} doesn't include the symbols, which posix does
14845     my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14846                     Description => '\p{Punct} + ASCII-range \p{Symbol}',
14847                     Initialize => $gc->table('Punctuation')
14848                                 + ($ASCII & $gc->table('Symbol')),
14849                                 Perl_Extension => 1
14850         );
14851     $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14852         Initialize => $ASCII & $XPosixPunct,
14853         );
14854
14855     my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14856                             Description => '[0-9] + all other decimal digits');
14857     $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14858     my $PosixDigit = $perl->add_match_table("PosixDigit",
14859                                             Initialize => $Digit & $ASCII,
14860                                             );
14861
14862     # Hex_Digit was not present in first release
14863     my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14864     my $Hex = property_ref('Hex_Digit');
14865     if (defined $Hex && ! $Hex->is_empty) {
14866         $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14867     }
14868     else {
14869         $Xdigit->initialize([ ord('0') .. ord('9'),
14870                               ord('A') .. ord('F'),
14871                               ord('a') .. ord('f'),
14872                               0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14873     }
14874
14875     # AHex was not present in early releases
14876     my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14877     my $AHex = property_ref('ASCII_Hex_Digit');
14878     if (defined $AHex && ! $AHex->is_empty) {
14879         $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14880     }
14881     else {
14882         $PosixXDigit->initialize($Xdigit & $ASCII);
14883         $PosixXDigit->add_alias('AHex');
14884         $PosixXDigit->add_alias('Ascii_Hex_Digit');
14885     }
14886
14887     my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14888                     Description => "Code points that particpate in some fold",
14889                     );
14890     my $loc_problem_folds = $perl->add_match_table(
14891                "_Perl_Problematic_Locale_Folds",
14892                Description =>
14893                    "Code points that are in some way problematic under locale",
14894     );
14895
14896     # This allows regexec.c to skip some work when appropriate.  Some of the
14897     # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14898     my $loc_problem_folds_start = $perl->add_match_table(
14899                "_Perl_Problematic_Locale_Foldeds_Start",
14900                Description =>
14901                    "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14902     );
14903
14904     my $cf = property_ref('Case_Folding');
14905
14906     # Every character 0-255 is problematic because what each folds to depends
14907     # on the current locale
14908     $loc_problem_folds->add_range(0, 255);
14909     $loc_problem_folds_start += $loc_problem_folds;
14910
14911     # Also problematic are anything these fold to outside the range.  Likely
14912     # forever the only thing folded to by these outside the 0-255 range is the
14913     # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14914     # completely general, which should catch any unexpected changes or errors.
14915     # We look at each code point 0-255, and add its fold (including each part
14916     # of a multi-char fold) to the list.  See commit message
14917     # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14918     # of the MU issue.
14919     foreach my $range ($loc_problem_folds->ranges) {
14920         foreach my $code_point ($range->start .. $range->end) {
14921             my $fold_range = $cf->containing_range($code_point);
14922             next unless defined $fold_range;
14923
14924             # Skip if folds to itself
14925             next if $fold_range->value eq $CODE_POINT;
14926
14927             my @hex_folds = split " ", $fold_range->value;
14928             my $start_cp = $hex_folds[0];
14929             next if $start_cp eq $CODE_POINT;
14930             $start_cp = hex $start_cp;
14931             foreach my $i (0 .. @hex_folds - 1) {
14932                 my $cp = $hex_folds[$i];
14933                 next if $cp eq $CODE_POINT;
14934                 $cp = hex $cp;
14935                 next unless $cp > 255;    # Already have the < 256 ones
14936
14937                 $loc_problem_folds->add_range($cp, $cp);
14938                 $loc_problem_folds_start->add_range($start_cp, $start_cp);
14939             }
14940         }
14941     }
14942
14943     my $folds_to_multi_char = $perl->add_match_table(
14944          "_Perl_Folds_To_Multi_Char",
14945          Description =>
14946               "Code points whose fold is a string of more than one character",
14947     );
14948     if ($v_version lt v3.0.1) {
14949         push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
14950     }
14951
14952     # Look through all the known folds to populate these tables.
14953     foreach my $range ($cf->ranges) {
14954         next if $range->value eq $CODE_POINT;
14955         my $start = $range->start;
14956         my $end = $range->end;
14957         $any_folds->add_range($start, $end);
14958
14959         my @hex_folds = split " ", $range->value;
14960         if (@hex_folds > 1) {   # Is multi-char fold
14961             $folds_to_multi_char->add_range($start, $end);
14962         }
14963
14964         my $found_locale_problematic = 0;
14965
14966         # Look at each of the folded-to characters...
14967         foreach my $i (0 .. @hex_folds - 1) {
14968             my $cp = hex $hex_folds[$i];
14969             $any_folds->add_range($cp, $cp);
14970
14971             # The fold is problematic if any of the folded-to characters is
14972             # already considered problematic.
14973             if ($loc_problem_folds->contains($cp)) {
14974                 $loc_problem_folds->add_range($start, $end);
14975                 $found_locale_problematic = 1;
14976             }
14977         }
14978
14979         # If this is a problematic fold, add to the start chars the
14980         # folding-from characters and first folded-to character.
14981         if ($found_locale_problematic) {
14982             $loc_problem_folds_start->add_range($start, $end);
14983             my $cp = hex $hex_folds[0];
14984             $loc_problem_folds_start->add_range($cp, $cp);
14985         }
14986     }
14987
14988     my $dt = property_ref('Decomposition_Type');
14989     $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14990         Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14991         Perl_Extension => 1,
14992         Note => 'Union of all non-canonical decompositions',
14993         );
14994
14995     # For backward compatibility, Perl has its own definition for IDStart.
14996     # It is regular XID_Start plus the underscore, but all characters must be
14997     # Word characters as well
14998     my $XID_Start = property_ref('XID_Start');
14999     my $perl_xids = $perl->add_match_table('_Perl_IDStart',
15000                                             Perl_Extension => 1,
15001                                             Fate => $INTERNAL_ONLY,
15002                                             Initialize => ord('_')
15003                                             );
15004     if (defined $XID_Start
15005         || defined ($XID_Start = property_ref('ID_Start')))
15006     {
15007         $perl_xids += $XID_Start->table('Y');
15008     }
15009     else {
15010         # For Unicode versions that don't have the property, construct our own
15011         # from first principles.  The actual definition is:
15012         #     Letters
15013         #   + letter numbers (Nl)
15014         #   - Pattern_Syntax
15015         #   - Pattern_White_Space
15016         #   + stability extensions
15017         #   - NKFC modifications
15018         #
15019         # What we do in the code below is to include the identical code points
15020         # that are in the first release that had Unicode's version of this
15021         # property, essentially extrapolating backwards.  There were no
15022         # stability extensions until v4.1, so none are included; likewise in
15023         # no Unicode version so far do subtracting PatSyn and PatWS make any
15024         # difference, so those also are ignored.
15025         $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
15026
15027         # We do subtract the NFKC modifications that are in the first version
15028         # that had this property.  We don't bother to test if they are in the
15029         # version in question, because if they aren't, the operation is a
15030         # no-op.  The NKFC modifications are discussed in
15031         # http://www.unicode.org/reports/tr31/#NFKC_Modifications
15032         foreach my $range ( 0x037A,
15033                             0x0E33,
15034                             0x0EB3,
15035                             [ 0xFC5E, 0xFC63 ],
15036                             [ 0xFDFA, 0xFE70 ],
15037                             [ 0xFE72, 0xFE76 ],
15038                             0xFE78,
15039                             0xFE7A,
15040                             0xFE7C,
15041                             0xFE7E,
15042                             [ 0xFF9E, 0xFF9F ],
15043         ) {
15044             if (ref $range) {
15045                 $perl_xids->delete_range($range->[0], $range->[1]);
15046             }
15047             else {
15048                 $perl_xids->delete_range($range, $range);
15049             }
15050         }
15051     }
15052
15053     $perl_xids &= $Word;
15054
15055     my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
15056                                         Perl_Extension => 1,
15057                                         Fate => $INTERNAL_ONLY);
15058     my $XIDC = property_ref('XID_Continue');
15059     if (defined $XIDC
15060         || defined ($XIDC = property_ref('ID_Continue')))
15061     {
15062         $perl_xidc += $XIDC->table('Y');
15063     }
15064     else {
15065         # Similarly, we construct our own XIDC if necessary for early Unicode
15066         # versions.  The definition is:
15067         #     everything in XIDS
15068         #   + Gc=Mn
15069         #   + Gc=Mc
15070         #   + Gc=Nd
15071         #   + Gc=Pc
15072         #   - Pattern_Syntax
15073         #   - Pattern_White_Space
15074         #   + stability extensions
15075         #   - NFKC modifications
15076         #
15077         # The same thing applies to this as with XIDS for the PatSyn, PatWS,
15078         # and stability extensions.  There is a somewhat different set of NFKC
15079         # mods to remove (and add in this case).  The ones below make this
15080         # have identical code points as in the first release that defined it.
15081         $perl_xidc += $perl_xids
15082                     + $gc->table('L')
15083                     + $gc->table('Mn')
15084                     + $gc->table('Mc')
15085                     + $gc->table('Nd')
15086                     + utf8::unicode_to_native(0xB7)
15087                     ;
15088         if (defined (my $pc = $gc->table('Pc'))) {
15089             $perl_xidc += $pc;
15090         }
15091         else {  # 1.1.5 didn't have Pc, but these should have been in it
15092             $perl_xidc += 0xFF3F;
15093             $perl_xidc->add_range(0x203F, 0x2040);
15094             $perl_xidc->add_range(0xFE33, 0xFE34);
15095             $perl_xidc->add_range(0xFE4D, 0xFE4F);
15096         }
15097
15098         # Subtract the NFKC mods
15099         foreach my $range ( 0x037A,
15100                             [ 0xFC5E, 0xFC63 ],
15101                             [ 0xFDFA, 0xFE1F ],
15102                             0xFE70,
15103                             [ 0xFE72, 0xFE76 ],
15104                             0xFE78,
15105                             0xFE7A,
15106                             0xFE7C,
15107                             0xFE7E,
15108         ) {
15109             if (ref $range) {
15110                 $perl_xidc->delete_range($range->[0], $range->[1]);
15111             }
15112             else {
15113                 $perl_xidc->delete_range($range, $range);
15114             }
15115         }
15116     }
15117
15118     $perl_xidc &= $Word;
15119
15120     my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
15121                     Perl_Extension => 1,
15122                     Fate => $INTERNAL_ONLY,
15123                     Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
15124                     );
15125
15126     my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
15127                         Perl_Extension => 1,
15128                         Fate => $INTERNAL_ONLY,
15129                         Initialize => $perl_xidc
15130                                     + ord(" ")
15131                                     + ord("(")
15132                                     + ord(")")
15133                                     + ord("-")
15134                         );
15135
15136     my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
15137
15138     if (@named_sequences) {
15139         push @composition, 'Named_Sequence';
15140         foreach my $sequence (@named_sequences) {
15141             $perl_charname->add_anomalous_entry($sequence);
15142         }
15143     }
15144
15145     my $alias_sentence = "";
15146     my %abbreviations;
15147     my $alias = property_ref('_Perl_Name_Alias');
15148     $perl_charname->set_proxy_for('_Perl_Name_Alias');
15149
15150     # Add each entry in _Perl_Name_Alias to Perl_Charnames.  Where these go
15151     # with respect to any existing entry depends on the entry type.
15152     # Corrections go before said entry, as they should be returned in
15153     # preference over the existing entry.  (A correction to a correction
15154     # should be later in the _Perl_Name_Alias table, so it will correctly
15155     # precede the erroneous correction in Perl_Charnames.)
15156     #
15157     # Abbreviations go after everything else, so they are saved temporarily in
15158     # a hash for later.
15159     #
15160     # Everything else is added added afterwards, which preserves the input
15161     # ordering
15162
15163     foreach my $range ($alias->ranges) {
15164         next if $range->value eq "";
15165         my $code_point = $range->start;
15166         if ($code_point != $range->end) {
15167             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
15168         }
15169         my ($value, $type) = split ': ', $range->value;
15170         my $replace_type;
15171         if ($type eq 'correction') {
15172             $replace_type = $MULTIPLE_BEFORE;
15173         }
15174         elsif ($type eq 'abbreviation') {
15175
15176             # Save for later
15177             $abbreviations{$value} = $code_point;
15178             next;
15179         }
15180         else {
15181             $replace_type = $MULTIPLE_AFTER;
15182         }
15183
15184         # Actually add; before or after current entry(ies) as determined
15185         # above.
15186
15187         $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
15188     }
15189     $alias_sentence = <<END;
15190 The _Perl_Name_Alias property adds duplicate code point entries that are
15191 alternatives to the original name.  If an addition is a corrected
15192 name, it will be physically first in the table.  The original (less correct,
15193 but still valid) name will be next; then any alternatives, in no particular
15194 order; and finally any abbreviations, again in no particular order.
15195 END
15196
15197     # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
15198     # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
15199     # so should be first in the file; the other names have precedence starting
15200     # in 6.1,
15201     my $before_or_after = ($v_version lt v6.1.0)
15202                           ? $MULTIPLE_BEFORE
15203                           : $MULTIPLE_AFTER;
15204
15205     foreach my $range (property_ref('Unicode_1_Name')->ranges) {
15206         my $code_point = $range->start;
15207         my $unicode_1_value = $range->value;
15208         next if $unicode_1_value eq "";     # Skip if name doesn't exist.
15209
15210         if ($code_point != $range->end) {
15211             Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
15212         }
15213
15214         # To handle EBCDIC, we don't hard code in the code points of the
15215         # controls; instead realizing that all of them are below 256.
15216         last if $code_point > 255;
15217
15218         # We only add in the controls.
15219         next if $gc->value_of($code_point) ne 'Cc';
15220
15221         # We reject this Unicode1 name for later Perls, as it is used for
15222         # another code point
15223         next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
15224
15225         # This won't add an exact duplicate.
15226         $perl_charname->add_duplicate($code_point, $unicode_1_value,
15227                                         Replace => $before_or_after);
15228     }
15229
15230     # Now that have everything added, add in abbreviations after
15231     # everything else.  Sort so results don't change between runs of this
15232     # program
15233     foreach my $value (sort keys %abbreviations) {
15234         $perl_charname->add_duplicate($abbreviations{$value}, $value,
15235                                         Replace => $MULTIPLE_AFTER);
15236     }
15237
15238     my $comment;
15239     if (@composition <= 2) { # Always at least 2
15240         $comment = join " and ", @composition;
15241     }
15242     else {
15243         $comment = join ", ", @composition[0 .. scalar @composition - 2];
15244         $comment .= ", and $composition[-1]";
15245     }
15246
15247     $perl_charname->add_comment(join_lines( <<END
15248 This file is for charnames.pm.  It is the union of the $comment properties.
15249 Unicode_1_Name entries are used only for nameless code points in the Name
15250 property.
15251 $alias_sentence
15252 This file doesn't include the algorithmically determinable names.  For those,
15253 use 'unicore/Name.pm'
15254 END
15255     ));
15256     property_ref('Name')->add_comment(join_lines( <<END
15257 This file doesn't include the algorithmically determinable names.  For those,
15258 use 'unicore/Name.pm'
15259 END
15260     ));
15261
15262     # Construct the Present_In property from the Age property.
15263     if (-e 'DAge.txt' && defined $age) {
15264         my $default_map = $age->default_map;
15265         my $in = Property->new('In',
15266                                 Default_Map => $default_map,
15267                                 Full_Name => "Present_In",
15268                                 Perl_Extension => 1,
15269                                 Type => $ENUM,
15270                                 Initialize => $age,
15271                                 );
15272         $in->add_comment(join_lines(<<END
15273 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE.  The values in this file are the
15274 same as for $age, and not for what $in really means.  This is because anything
15275 defined in a given release should have multiple values: that release and all
15276 higher ones.  But only one value per code point can be represented in a table
15277 like this.
15278 END
15279         ));
15280
15281         # The Age tables are named like 1.5, 2.0, 2.1, ....  Sort so that the
15282         # lowest numbered (earliest) come first, with the non-numeric one
15283         # last.
15284         my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15285                                             ? 1
15286                                             : ($b->name !~ /^[\d.]*$/)
15287                                                 ? -1
15288                                                 : $a->name <=> $b->name
15289                                             } $age->tables;
15290
15291         # The Present_In property is the cumulative age properties.  The first
15292         # one hence is identical to the first age one.
15293         my $previous_in = $in->add_match_table($first_age->name);
15294         $previous_in->set_equivalent_to($first_age, Related => 1);
15295
15296         my $description_start = "Code point's usage introduced in version ";
15297         $first_age->add_description($description_start . $first_age->name);
15298
15299         # To construct the accumulated values, for each of the age tables
15300         # starting with the 2nd earliest, merge the earliest with it, to get
15301         # all those code points existing in the 2nd earliest.  Repeat merging
15302         # the new 2nd earliest with the 3rd earliest to get all those existing
15303         # in the 3rd earliest, and so on.
15304         foreach my $current_age (@rest_ages) {
15305             next if $current_age->name !~ /^[\d.]*$/;   # Skip the non-numeric
15306
15307             my $current_in = $in->add_match_table(
15308                                     $current_age->name,
15309                                     Initialize => $current_age + $previous_in,
15310                                     Description => $description_start
15311                                                     . $current_age->name
15312                                                     . ' or earlier',
15313                                     );
15314             foreach my $alias ($current_age->aliases) {
15315                 $current_in->add_alias($alias->name);
15316             }
15317             $previous_in = $current_in;
15318
15319             # Add clarifying material for the corresponding age file.  This is
15320             # in part because of the confusing and contradictory information
15321             # given in the Standard's documentation itself, as of 5.2.
15322             $current_age->add_description(
15323                             "Code point's usage was introduced in version "
15324                             . $current_age->name);
15325             $current_age->add_note("See also $in");
15326
15327         }
15328
15329         # And finally the code points whose usages have yet to be decided are
15330         # the same in both properties.  Note that permanently unassigned code
15331         # points actually have their usage assigned (as being permanently
15332         # unassigned), so that these tables are not the same as gc=cn.
15333         my $unassigned = $in->add_match_table($default_map);
15334         my $age_default = $age->table($default_map);
15335         $age_default->add_description(<<END
15336 Code point's usage has not been assigned in any Unicode release thus far.
15337 END
15338         );
15339         $unassigned->set_equivalent_to($age_default, Related => 1);
15340     }
15341
15342     my $patws = $perl->add_match_table('_Perl_PatWS',
15343                                        Perl_Extension => 1,
15344                                        Fate => $INTERNAL_ONLY);
15345     if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15346         $patws->initialize($off_patws->table('Y'));
15347     }
15348     else {
15349         $patws->initialize([ ord("\t"),
15350                              ord("\n"),
15351                              utf8::unicode_to_native(0x0B), # VT
15352                              ord("\f"),
15353                              ord("\r"),
15354                              ord(" "),
15355                              utf8::unicode_to_native(0x85), # NEL
15356                              0x200E..0x200F,             # Left, Right marks
15357                              0x2028..0x2029              # Line, Paragraph seps
15358                            ] );
15359     }
15360
15361     # See L<perlfunc/quotemeta>
15362     my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15363                                            Perl_Extension => 1,
15364                                            Fate => $INTERNAL_ONLY,
15365
15366                                            # Initialize to what's common in
15367                                            # all Unicode releases.
15368                                            Initialize =>
15369                                                   $gc->table('Control')
15370                                                 + $Space
15371                                                 + $patws
15372                                                 + ((~ $Word) & $ASCII)
15373                            );
15374
15375     if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15376         $quotemeta += $patsyn->table('Y');
15377     }
15378     else {
15379         $quotemeta += ((~ $Word) & Range->new(0, 255))
15380                     - utf8::unicode_to_native(0xA8)
15381                     - utf8::unicode_to_native(0xAF)
15382                     - utf8::unicode_to_native(0xB2)
15383                     - utf8::unicode_to_native(0xB3)
15384                     - utf8::unicode_to_native(0xB4)
15385                     - utf8::unicode_to_native(0xB7)
15386                     - utf8::unicode_to_native(0xB8)
15387                     - utf8::unicode_to_native(0xB9)
15388                     - utf8::unicode_to_native(0xBC)
15389                     - utf8::unicode_to_native(0xBD)
15390                     - utf8::unicode_to_native(0xBE);
15391         $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15392                         # same in all releases
15393                         0x2010 .. 0x2027,
15394                         0x2030 .. 0x203E,
15395                         0x2041 .. 0x2053,
15396                         0x2055 .. 0x205E,
15397                         0x2190 .. 0x245F,
15398                         0x2500 .. 0x2775,
15399                         0x2794 .. 0x2BFF,
15400                         0x2E00 .. 0x2E7F,
15401                         0x3001 .. 0x3003,
15402                         0x3008 .. 0x3020,
15403                         0x3030 .. 0x3030,
15404                         0xFD3E .. 0xFD3F,
15405                         0xFE45 .. 0xFE46
15406                       ];
15407     }
15408
15409     if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15410         $quotemeta += $di->table('Y')
15411     }
15412     else {
15413         if ($v_version ge v2.0) {
15414             $quotemeta += $gc->table('Cf')
15415                        +  $gc->table('Cs');
15416
15417             # These are above the Unicode version 1 max
15418             $quotemeta->add_range(0xE0000, 0xE0FFF);
15419         }
15420         $quotemeta += $gc->table('Cc')
15421                     - $Space;
15422         my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15423                                                    0x2060 .. 0x206F,
15424                                                    0xFE00 .. 0xFE0F,
15425                                                    0xFFF0 .. 0xFFFB,
15426                                                   ]);
15427         $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15428         $quotemeta += $temp;
15429     }
15430     calculate_DI();
15431     $quotemeta += $DI;
15432
15433     calculate_NChar();
15434
15435     # Finished creating all the perl properties.  All non-internal non-string
15436     # ones have a synonym of 'Is_' prefixed.  (Internal properties begin with
15437     # an underscore.)  These do not get a separate entry in the pod file
15438     foreach my $table ($perl->tables) {
15439         foreach my $alias ($table->aliases) {
15440             next if $alias->name =~ /^_/;
15441             $table->add_alias('Is_' . $alias->name,
15442                                Re_Pod_Entry => 0,
15443                                UCD => 0,
15444                                Status => $alias->status,
15445                                OK_as_Filename => 0);
15446         }
15447     }
15448
15449     # Perl tailors the WordBreak property so that \b{wb} doesn't split
15450     # adjacent spaces into separate words.  First create a copy of the regular
15451     # WB property as '_Perl_WB'.  (On Unicode releases earlier than when WB
15452     # was defined for, this will already have been done by the substitute file
15453     # portion for 'Input_file' code for WB.)
15454     my $perl_wb = property_ref('_Perl_WB');
15455     if (! defined $perl_wb) {
15456         $perl_wb = Property->new('_Perl_WB',
15457                                  Fate => $INTERNAL_ONLY,
15458                                  Perl_Extension => 1,
15459                                  Directory => $map_directory,
15460                                  Type => $STRING);
15461         my $wb = property_ref('Word_Break');
15462         $perl_wb->initialize($wb);
15463         $perl_wb->set_default_map($wb->default_map);
15464     }
15465
15466     # And simply replace the mappings of horizontal space characters that
15467     # otherwise would map to the default to instead map to our tailoring.
15468     my $default = $perl_wb->default_map;
15469     for my $range ($Blank->ranges) {
15470         for my $i ($range->start .. $range->end) {
15471             next unless $perl_wb->value_of($i) eq $default;
15472             $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15473                               Replace => $UNCONDITIONALLY);
15474         }
15475     }
15476
15477     # Create a version of the LineBreak property with the mappings that are
15478     # omitted in the default algorithm remapped to what
15479     # http://www.unicode.org/reports/tr14 says they should be.
15480     #
15481     # Original     Resolved  General_Category
15482     # AI, SG, XX      AL      Any
15483     # SA              CM      Only Mn or Mc
15484     # SA              AL      Any except Mn and Mc
15485     # CJ              NS      Any
15486     #
15487     # All property values are also written out in their long form, as
15488     # regen/mk_invlist.pl expects that.  This also fixes occurrences of the
15489     # typo in early Unicode versions: 'inseperable'.
15490     my $perl_lb = property_ref('_Perl_LB');
15491     if (! defined $perl_lb) {
15492         $perl_lb = Property->new('_Perl_LB',
15493                                  Fate => $INTERNAL_ONLY,
15494                                  Perl_Extension => 1,
15495                                  Directory => $map_directory,
15496                                  Type => $STRING);
15497         my $lb = property_ref('Line_Break');
15498
15499         # Populate from $lb, but use full name and fix typo.
15500         foreach my $range ($lb->ranges) {
15501             my $full_name = $lb->table($range->value)->full_name;
15502             $full_name = 'Inseparable'
15503                                 if standardize($full_name) eq 'inseperable';
15504             $perl_lb->add_map($range->start, $range->end, $full_name);
15505         }
15506     }
15507
15508     $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
15509
15510     for my $range ($perl_lb->ranges) {
15511         my $value = standardize($range->value);
15512         if (   $value eq standardize('Unknown')
15513             || $value eq standardize('Ambiguous')
15514             || $value eq standardize('Surrogate'))
15515         {
15516             $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15517                               Replace => $UNCONDITIONALLY);
15518         }
15519         elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15520             $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15521                               Replace => $UNCONDITIONALLY);
15522         }
15523         elsif ($value eq standardize('Complex_Context')) {
15524             for my $i ($range->start .. $range->end) {
15525                 my $gc_val = $gc->value_of($i);
15526                 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15527                     $perl_lb->add_map($i, $i, 'Combining_Mark',
15528                                       Replace => $UNCONDITIONALLY);
15529                 }
15530                 else {
15531                     $perl_lb->add_map($i, $i, 'Alphabetic',
15532                                       Replace => $UNCONDITIONALLY);
15533                 }
15534             }
15535         }
15536     }
15537
15538     # This property is a modification of the scx property
15539     my $perl_scx = Property->new('_Perl_SCX',
15540                                  Fate => $INTERNAL_ONLY,
15541                                  Perl_Extension => 1,
15542                                  Directory => $map_directory,
15543                                  Type => $ENUM);
15544     my $source;
15545
15546     # Use scx if available; otherwise sc;  if neither is there (a very old
15547     # Unicode version, just say that everything is 'Common'
15548     if (defined $scx) {
15549         $source = $scx;
15550         $perl_scx->set_default_map('Unknown');
15551     }
15552     elsif (defined $script) {
15553         $source = $script;
15554
15555         # Early versions of 'sc', had everything be 'Common'
15556         if (defined $script->table('Unknown')) {
15557             $perl_scx->set_default_map('Unknown');
15558         }
15559         else {
15560             $perl_scx->set_default_map('Common');
15561         }
15562     } else {
15563         $perl_scx->add_match_table('Common');
15564         $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15565
15566         $perl_scx->add_match_table('Unknown');
15567         $perl_scx->set_default_map('Unknown');
15568     }
15569
15570     $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15571     $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15572
15573     if (defined $source) {
15574         $perl_scx->initialize($source);
15575
15576         # UTS 39 says that the scx property should be modified for these
15577         # countries where certain mixed scripts are commonly used.
15578         for my $range ($perl_scx->ranges) {
15579             my $value = $range->value;
15580             my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15581              $changed |=  $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15582              $changed |=  $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15583              $changed |=  $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15584                                      {$1 Katakana Hiragana Jpan}xi;
15585              $changed |=  $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15586              $changed |=  $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15587
15588             if ($changed) {
15589                 $value = join " ", uniques split " ", $value;
15590                 $range->set_value($value)
15591             }
15592         }
15593
15594         foreach my $table ($source->tables) {
15595             my $scx_table = $perl_scx->add_match_table($table->name,
15596                                     Full_Name => $table->full_name);
15597             foreach my $alias ($table->aliases) {
15598                 $scx_table->add_alias($alias->name);
15599             }
15600         }
15601     }
15602
15603     # Here done with all the basic stuff.  Ready to populate the information
15604     # about each character if annotating them.
15605     if ($annotate) {
15606
15607         # See comments at its declaration
15608         $annotate_ranges = Range_Map->new;
15609
15610         # This separates out the non-characters from the other unassigneds, so
15611         # can give different annotations for each.
15612         $unassigned_sans_noncharacters = Range_List->new(
15613                                     Initialize => $gc->table('Unassigned'));
15614         $unassigned_sans_noncharacters &= (~ $NChar);
15615
15616         for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15617             $i = populate_char_info($i);    # Note sets $i so may cause skips
15618
15619         }
15620     }
15621
15622     return;
15623 }
15624
15625 sub add_perl_synonyms() {
15626     # A number of Unicode tables have Perl synonyms that are expressed in
15627     # the single-form, \p{name}.  These are:
15628     #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15629     #       \p{Is_Name} as synonyms
15630     #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15631     #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15632     #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15633     #       conflict, \p{Value} and \p{Is_Value} as well
15634     #
15635     # This routine generates these synonyms, warning of any unexpected
15636     # conflicts.
15637
15638     # Construct the list of tables to get synonyms for.  Start with all the
15639     # binary and the General_Category ones.
15640     my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15641                                                             property_ref('*');
15642     push @tables, $gc->tables;
15643
15644     # If the version of Unicode includes the Script Extensions (preferably),
15645     # or Script property, add its tables
15646     if (defined $scx) {
15647         push @tables, $scx->tables;
15648     }
15649     else {
15650         push @tables, $script->tables if defined $script;
15651     }
15652
15653     # The Block tables are kept separate because they are treated differently.
15654     # And the earliest versions of Unicode didn't include them, so add only if
15655     # there are some.
15656     my @blocks;
15657     push @blocks, $block->tables if defined $block;
15658
15659     # Here, have the lists of tables constructed.  Process blocks last so that
15660     # if there are name collisions with them, blocks have lowest priority.
15661     # Should there ever be other collisions, manual intervention would be
15662     # required.  See the comments at the beginning of the program for a
15663     # possible way to handle those semi-automatically.
15664     foreach my $table (@tables,  @blocks) {
15665
15666         # For non-binary properties, the synonym is just the name of the
15667         # table, like Greek, but for binary properties the synonym is the name
15668         # of the property, and means the code points in its 'Y' table.
15669         my $nominal = $table;
15670         my $nominal_property = $nominal->property;
15671         my $actual;
15672         if (! $nominal->isa('Property')) {
15673             $actual = $table;
15674         }
15675         else {
15676
15677             # Here is a binary property.  Use the 'Y' table.  Verify that is
15678             # there
15679             my $yes = $nominal->table('Y');
15680             unless (defined $yes) {  # Must be defined, but is permissible to
15681                                      # be empty.
15682                 Carp::my_carp_bug("Undefined $nominal, 'Y'.  Skipping.");
15683                 next;
15684             }
15685             $actual = $yes;
15686         }
15687
15688         foreach my $alias ($nominal->aliases) {
15689
15690             # Attempt to create a table in the perl directory for the
15691             # candidate table, using whatever aliases in it that don't
15692             # conflict.  Also add non-conflicting aliases for all these
15693             # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15694             PREFIX:
15695             foreach my $prefix ("", 'Is_', 'In_') {
15696
15697                 # Only Block properties can have added 'In_' aliases.
15698                 next if $prefix eq 'In_' and $nominal_property != $block;
15699
15700                 my $proposed_name = $prefix . $alias->name;
15701
15702                 # No Is_Is, In_In, nor combinations thereof
15703                 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15704                 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15705
15706                 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15707
15708                 # Get a reference to any existing table in the perl
15709                 # directory with the desired name.
15710                 my $pre_existing = $perl->table($proposed_name);
15711
15712                 if (! defined $pre_existing) {
15713
15714                     # No name collision, so OK to add the perl synonym.
15715
15716                     my $make_re_pod_entry;
15717                     my $ok_as_filename;
15718                     my $status = $alias->status;
15719                     if ($nominal_property == $block) {
15720
15721                         # For block properties, only the compound form is
15722                         # preferred for external use; the others are
15723                         # discouraged.  The pod file contains wild cards for
15724                         # the 'In' and 'Is' forms so no entries for those; and
15725                         # we don't want people using the name without any
15726                         # prefix, so discourage that.
15727                         if ($prefix eq "") {
15728                             $make_re_pod_entry = 1;
15729                             $status = $status || $DISCOURAGED;
15730                             $ok_as_filename = 0;
15731                         }
15732                         elsif ($prefix eq 'In_') {
15733                             $make_re_pod_entry = 0;
15734                             $status = $status || $DISCOURAGED;
15735                             $ok_as_filename = 1;
15736                         }
15737                         else {
15738                             $make_re_pod_entry = 0;
15739                             $status = $status || $DISCOURAGED;
15740                             $ok_as_filename = 0;
15741                         }
15742                     }
15743                     elsif ($prefix ne "") {
15744
15745                         # The 'Is' prefix is handled in the pod by a wild
15746                         # card, and we won't use it for an external name
15747                         $make_re_pod_entry = 0;
15748                         $status = $status || $NORMAL;
15749                         $ok_as_filename = 0;
15750                     }
15751                     else {
15752
15753                         # Here, is an empty prefix, non block.  This gets its
15754                         # own pod entry and can be used for an external name.
15755                         $make_re_pod_entry = 1;
15756                         $status = $status || $NORMAL;
15757                         $ok_as_filename = 1;
15758                     }
15759
15760                     # Here, there isn't a perl pre-existing table with the
15761                     # name.  Look through the list of equivalents of this
15762                     # table to see if one is a perl table.
15763                     foreach my $equivalent ($actual->leader->equivalents) {
15764                         next if $equivalent->property != $perl;
15765
15766                         # Here, have found a table for $perl.  Add this alias
15767                         # to it, and are done with this prefix.
15768                         $equivalent->add_alias($proposed_name,
15769                                         Re_Pod_Entry => $make_re_pod_entry,
15770
15771                                         # Currently don't output these in the
15772                                         # ucd pod, as are strongly discouraged
15773                                         # from being used
15774                                         UCD => 0,
15775
15776                                         Status => $status,
15777                                         OK_as_Filename => $ok_as_filename);
15778                         trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15779                         next PREFIX;
15780                     }
15781
15782                     # Here, $perl doesn't already have a table that is a
15783                     # synonym for this property, add one.
15784                     my $added_table = $perl->add_match_table($proposed_name,
15785                                             Re_Pod_Entry => $make_re_pod_entry,
15786
15787                                             # See UCD comment just above
15788                                             UCD => 0,
15789
15790                                             Status => $status,
15791                                             OK_as_Filename => $ok_as_filename);
15792                     # And it will be related to the actual table, since it is
15793                     # based on it.
15794                     $added_table->set_equivalent_to($actual, Related => 1);
15795                     trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15796                     next;
15797                 } # End of no pre-existing.
15798
15799                 # Here, there is a pre-existing table that has the proposed
15800                 # name.  We could be in trouble, but not if this is just a
15801                 # synonym for another table that we have already made a child
15802                 # of the pre-existing one.
15803                 if ($pre_existing->is_set_equivalent_to($actual)) {
15804                     trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15805                     $pre_existing->add_alias($proposed_name);
15806                     next;
15807                 }
15808
15809                 # Here, there is a name collision, but it still could be OK if
15810                 # the tables match the identical set of code points, in which
15811                 # case, we can combine the names.  Compare each table's code
15812                 # point list to see if they are identical.
15813                 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15814                 if ($pre_existing->matches_identically_to($actual)) {
15815
15816                     # Here, they do match identically.  Not a real conflict.
15817                     # Make the perl version a child of the Unicode one, except
15818                     # in the non-obvious case of where the perl name is
15819                     # already a synonym of another Unicode property.  (This is
15820                     # excluded by the test for it being its own parent.)  The
15821                     # reason for this exclusion is that then the two Unicode
15822                     # properties become related; and we don't really know if
15823                     # they are or not.  We generate documentation based on
15824                     # relatedness, and this would be misleading.  Code
15825                     # later executed in the process will cause the tables to
15826                     # be represented by a single file anyway, without making
15827                     # it look in the pod like they are necessarily related.
15828                     if ($pre_existing->parent == $pre_existing
15829                         && ($pre_existing->property == $perl
15830                             || $actual->property == $perl))
15831                     {
15832                         trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15833                         $pre_existing->set_equivalent_to($actual, Related => 1);
15834                     }
15835                     elsif (main::DEBUG && $to_trace) {
15836                         trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15837                         trace $pre_existing->parent;
15838                     }
15839                     next PREFIX;
15840                 }
15841
15842                 # Here they didn't match identically, there is a real conflict
15843                 # between our new name and a pre-existing property.
15844                 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15845                 $pre_existing->add_conflicting($nominal->full_name,
15846                                                'p',
15847                                                $actual);
15848
15849                 # Don't output a warning for aliases for the block
15850                 # properties (unless they start with 'In_') as it is
15851                 # expected that there will be conflicts and the block
15852                 # form loses.
15853                 if ($verbosity >= $NORMAL_VERBOSITY
15854                     && ($actual->property != $block || $prefix eq 'In_'))
15855                 {
15856                     print simple_fold(join_lines(<<END
15857 There is already an alias named $proposed_name (from $pre_existing),
15858 so not creating this alias for $actual
15859 END
15860                     ), "", 4);
15861                 }
15862
15863                 # Keep track for documentation purposes.
15864                 $has_In_conflicts++ if $prefix eq 'In_';
15865                 $has_Is_conflicts++ if $prefix eq 'Is_';
15866             }
15867         }
15868     }
15869
15870     # There are some properties which have No and Yes (and N and Y) as
15871     # property values, but aren't binary, and could possibly be confused with
15872     # binary ones.  So create caveats for them.  There are tables that are
15873     # named 'No', and tables that are named 'N', but confusion is not likely
15874     # unless they are the same table.  For example, N meaning Number or
15875     # Neutral is not likely to cause confusion, so don't add caveats to things
15876     # like them.
15877     foreach my $property (grep { $_->type != $BINARY
15878                                  && $_->type != $FORCED_BINARY }
15879                                                             property_ref('*'))
15880     {
15881         my $yes = $property->table('Yes');
15882         if (defined $yes) {
15883             my $y = $property->table('Y');
15884             if (defined $y && $yes == $y) {
15885                 foreach my $alias ($property->aliases) {
15886                     $yes->add_conflicting($alias->name);
15887                 }
15888             }
15889         }
15890         my $no = $property->table('No');
15891         if (defined $no) {
15892             my $n = $property->table('N');
15893             if (defined $n && $no == $n) {
15894                 foreach my $alias ($property->aliases) {
15895                     $no->add_conflicting($alias->name, 'P');
15896                 }
15897             }
15898         }
15899     }
15900
15901     return;
15902 }
15903
15904 sub register_file_for_name($$$) {
15905     # Given info about a table and a datafile that it should be associated
15906     # with, register that association
15907
15908     my $table = shift;
15909     my $directory_ref = shift;   # Array of the directory path for the file
15910     my $file = shift;            # The file name in the final directory.
15911     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15912
15913     trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15914
15915     if ($table->isa('Property')) {
15916         $table->set_file_path(@$directory_ref, $file);
15917         push @map_properties, $table;
15918
15919         # No swash means don't do the rest of this.
15920         return if $table->fate != $ORDINARY
15921                   && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15922
15923         # Get the path to the file
15924         my @path = $table->file_path;
15925
15926         # Use just the file name if no subdirectory.
15927         shift @path if $path[0] eq File::Spec->curdir();
15928
15929         my $file = join '/', @path;
15930
15931         # Create a hash entry for utf8_heavy to get the file that stores this
15932         # property's map table
15933         foreach my $alias ($table->aliases) {
15934             my $name = $alias->name;
15935             if ($name =~ /^_/) {
15936                 $strict_property_to_file_of{lc $name} = $file;
15937             }
15938             else {
15939                 $loose_property_to_file_of{standardize($name)} = $file;
15940             }
15941         }
15942
15943         # And a way for utf8_heavy to find the proper key in the SwashInfo
15944         # hash for this property.
15945         $file_to_swash_name{$file} = "To" . $table->swash_name;
15946         return;
15947     }
15948
15949     # Do all of the work for all equivalent tables when called with the leader
15950     # table, so skip if isn't the leader.
15951     return if $table->leader != $table;
15952
15953     # If this is a complement of another file, use that other file instead,
15954     # with a ! prepended to it.
15955     my $complement;
15956     if (($complement = $table->complement) != 0) {
15957         my @directories = $complement->file_path;
15958
15959         # This assumes that the 0th element is something like 'lib',
15960         # the 1th element the property name (in its own directory), like
15961         # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15962         # appended to it later.
15963         $directories[1] =~ s/^/!/;
15964         $file = pop @directories;
15965         $directory_ref =\@directories;
15966     }
15967
15968     # Join all the file path components together, using slashes.
15969     my $full_filename = join('/', @$directory_ref, $file);
15970
15971     # All go in the same subdirectory of unicore, or the special
15972     # pseudo-directory '#'
15973     if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15974         Carp::my_carp("Unexpected directory in "
15975                 .  join('/', @{$directory_ref}, $file));
15976     }
15977
15978     # For this table and all its equivalents ...
15979     foreach my $table ($table, $table->equivalents) {
15980
15981         # Associate it with its file internally.  Don't include the
15982         # $matches_directory first component
15983         $table->set_file_path(@$directory_ref, $file);
15984
15985         # No swash means don't do the rest of this.
15986         next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15987
15988         my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15989
15990         my $property = $table->property;
15991         my $property_name = ($property == $perl)
15992                              ? ""  # 'perl' is never explicitly stated
15993                              : standardize($property->name) . '=';
15994
15995         my $is_default = 0; # Is this table the default one for the property?
15996
15997         # To calculate $is_default, we find if this table is the same as the
15998         # default one for the property.  But this is complicated by the
15999         # possibility that there is a master table for this one, and the
16000         # information is stored there instead of here.
16001         my $parent = $table->parent;
16002         my $leader_prop = $parent->property;
16003         my $default_map = $leader_prop->default_map;
16004         if (defined $default_map) {
16005             my $default_table = $leader_prop->table($default_map);
16006             $is_default = 1 if defined $default_table && $parent == $default_table;
16007         }
16008
16009         # Calculate the loose name for this table.  Mostly it's just its name,
16010         # standardized.  But in the case of Perl tables that are single-form
16011         # equivalents to Unicode properties, it is the latter's name.
16012         my $loose_table_name =
16013                         ($property != $perl || $leader_prop == $perl)
16014                         ? standardize($table->name)
16015                         : standardize($parent->name);
16016
16017         my $deprecated = ($table->status eq $DEPRECATED)
16018                          ? $table->status_info
16019                          : "";
16020         my $caseless_equivalent = $table->caseless_equivalent;
16021
16022         # And for each of the table's aliases...  This inner loop eventually
16023         # goes through all aliases in the UCD that we generate regex match
16024         # files for
16025         foreach my $alias ($table->aliases) {
16026             my $standard = utf8_heavy_name($table, $alias);
16027
16028             # Generate an entry in either the loose or strict hashes, which
16029             # will translate the property and alias names combination into the
16030             # file where the table for them is stored.
16031             if ($alias->loose_match) {
16032                 if (exists $loose_to_file_of{$standard}) {
16033                     Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
16034                 }
16035                 else {
16036                     $loose_to_file_of{$standard} = $sub_filename;
16037                 }
16038             }
16039             else {
16040                 if (exists $stricter_to_file_of{$standard}) {
16041                     Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
16042                 }
16043                 else {
16044                     $stricter_to_file_of{$standard} = $sub_filename;
16045
16046                     # Tightly coupled with how utf8_heavy.pl works, for a
16047                     # floating point number that is a whole number, get rid of
16048                     # the trailing decimal point and 0's, so that utf8_heavy
16049                     # will work.  Also note that this assumes that such a
16050                     # number is matched strictly; so if that were to change,
16051                     # this would be wrong.
16052                     if ((my $integer_name = $alias->name)
16053                             =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
16054                     {
16055                         $stricter_to_file_of{$property_name . $integer_name}
16056                                                             = $sub_filename;
16057                     }
16058                 }
16059             }
16060
16061             # For Unicode::UCD, create a mapping of the prop=value to the
16062             # canonical =value for that property.
16063             if ($standard =~ /=/) {
16064
16065                 # This could happen if a strict name mapped into an existing
16066                 # loose name.  In that event, the strict names would have to
16067                 # be moved to a new hash.
16068                 if (exists($loose_to_standard_value{$standard})) {
16069                     Carp::my_carp_bug("'$standard' conflicts with a pre-existing use.  Bad News.  Continuing anyway");
16070                 }
16071                 $loose_to_standard_value{$standard} = $loose_table_name;
16072             }
16073
16074             # Keep a list of the deprecated properties and their filenames
16075             if ($deprecated && $complement == 0) {
16076                 $utf8::why_deprecated{$sub_filename} = $deprecated;
16077             }
16078
16079             # And a substitute table, if any, for case-insensitive matching
16080             if ($caseless_equivalent != 0) {
16081                 $caseless_equivalent_to{$standard} = $caseless_equivalent;
16082             }
16083
16084             # Add to defaults list if the table this alias belongs to is the
16085             # default one
16086             $loose_defaults{$standard} = 1 if $is_default;
16087         }
16088     }
16089
16090     return;
16091 }
16092
16093 {   # Closure
16094     my %base_names;  # Names already used for avoiding DOS 8.3 filesystem
16095                      # conflicts
16096     my %full_dir_name_of;   # Full length names of directories used.
16097
16098     sub construct_filename($$$) {
16099         # Return a file name for a table, based on the table name, but perhaps
16100         # changed to get rid of non-portable characters in it, and to make
16101         # sure that it is unique on a file system that allows the names before
16102         # any period to be at most 8 characters (DOS).  While we're at it
16103         # check and complain if there are any directory conflicts.
16104
16105         my $name = shift;       # The name to start with
16106         my $mutable = shift;    # Boolean: can it be changed?  If no, but
16107                                 # yet it must be to work properly, a warning
16108                                 # is given
16109         my $directories_ref = shift;  # A reference to an array containing the
16110                                 # path to the file, with each element one path
16111                                 # component.  This is used because the same
16112                                 # name can be used in different directories.
16113         Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16114
16115         my $warn = ! defined wantarray;  # If true, then if the name is
16116                                 # changed, a warning is issued as well.
16117
16118         if (! defined $name) {
16119             Carp::my_carp("Undefined name in directory "
16120                           . File::Spec->join(@$directories_ref)
16121                           . ". '_' used");
16122             return '_';
16123         }
16124
16125         # Make sure that no directory names conflict with each other.  Look at
16126         # each directory in the input file's path.  If it is already in use,
16127         # assume it is correct, and is merely being re-used, but if we
16128         # truncate it to 8 characters, and find that there are two directories
16129         # that are the same for the first 8 characters, but differ after that,
16130         # then that is a problem.
16131         foreach my $directory (@$directories_ref) {
16132             my $short_dir = substr($directory, 0, 8);
16133             if (defined $full_dir_name_of{$short_dir}) {
16134                 next if $full_dir_name_of{$short_dir} eq $directory;
16135                 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}.  Bad News.  Continuing anyway");
16136             }
16137             else {
16138                 $full_dir_name_of{$short_dir} = $directory;
16139             }
16140         }
16141
16142         my $path = join '/', @$directories_ref;
16143         $path .= '/' if $path;
16144
16145         # Remove interior underscores.
16146         (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
16147
16148         # Convert the dot in floating point numbers to an underscore
16149         $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
16150
16151         my $suffix = "";
16152
16153         # Extract any suffix, delete any non-word character, and truncate to 3
16154         # after the dot
16155         if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
16156             $filename = $1;
16157             $suffix = $2;
16158             $suffix =~ s/\W+//g;
16159             substr($suffix, 4) = "" if length($suffix) > 4;
16160         }
16161
16162         # Change any non-word character outside the suffix into an underscore,
16163         # and truncate to 8.
16164         $filename =~ s/\W+/_/g;   # eg., "L&" -> "L_"
16165         substr($filename, 8) = "" if length($filename) > 8;
16166
16167         # Make sure the basename doesn't conflict with something we
16168         # might have already written. If we have, say,
16169         #     InGreekExtended1
16170         #     InGreekExtended2
16171         # they become
16172         #     InGreekE
16173         #     InGreek2
16174         my $warned = 0;
16175         while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
16176             $num++; # so basenames with numbers start with '2', which
16177                     # just looks more natural.
16178
16179             # Want to append $num, but if it'll make the basename longer
16180             # than 8 characters, pre-truncate $filename so that the result
16181             # is acceptable.
16182             my $delta = length($filename) + length($num) - 8;
16183             if ($delta > 0) {
16184                 substr($filename, -$delta) = $num;
16185             }
16186             else {
16187                 $filename .= $num;
16188             }
16189             if ($warn && ! $warned) {
16190                 $warned = 1;
16191                 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS).  Proceeding anyway.");
16192             }
16193         }
16194
16195         return $filename if $mutable;
16196
16197         # If not changeable, must return the input name, but warn if needed to
16198         # change it beyond shortening it.
16199         if ($name ne $filename
16200             && substr($name, 0, length($filename)) ne $filename) {
16201             Carp::my_carp("'$path$name' had to be changed into '$filename'.  Bad News.  Proceeding anyway.");
16202         }
16203         return $name;
16204     }
16205 }
16206
16207 # The pod file contains a very large table.  Many of the lines in that table
16208 # would exceed a typical output window's size, and so need to be wrapped with
16209 # a hanging indent to make them look good.  The pod language is really
16210 # insufficient here.  There is no general construct to do that in pod, so it
16211 # is done here by beginning each such line with a space to cause the result to
16212 # be output without formatting, and doing all the formatting here.  This leads
16213 # to the result that if the eventual display window is too narrow it won't
16214 # look good, and if the window is too wide, no advantage is taken of that
16215 # extra width.  A further complication is that the output may be indented by
16216 # the formatter so that there is less space than expected.  What I (khw) have
16217 # done is to assume that that indent is a particular number of spaces based on
16218 # what it is in my Linux system;  people can always resize their windows if
16219 # necessary, but this is obviously less than desirable, but the best that can
16220 # be expected.
16221 my $automatic_pod_indent = 8;
16222
16223 # Try to format so that uses fewest lines, but few long left column entries
16224 # slide into the right column.  An experiment on 5.1 data yielded the
16225 # following percentages that didn't cut into the other side along with the
16226 # associated first-column widths
16227 # 69% = 24
16228 # 80% not too bad except for a few blocks
16229 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16230 # 95% = 37;
16231 my $indent_info_column = 27;    # 75% of lines didn't have overlap
16232
16233 my $FILLER = 3;     # Length of initial boiler-plate columns in a pod line
16234                     # The 3 is because of:
16235                     #   1   for the leading space to tell the pod formatter to
16236                     #       output as-is
16237                     #   1   for the flag
16238                     #   1   for the space between the flag and the main data
16239
16240 sub format_pod_line ($$$;$$) {
16241     # Take a pod line and return it, formatted properly
16242
16243     my $first_column_width = shift;
16244     my $entry = shift;  # Contents of left column
16245     my $info = shift;   # Contents of right column
16246
16247     my $status = shift || "";   # Any flag
16248
16249     my $loose_match = shift;    # Boolean.
16250     $loose_match = 1 unless defined $loose_match;
16251
16252     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16253
16254     my $flags = "";
16255     $flags .= $STRICTER if ! $loose_match;
16256
16257     $flags .= $status if $status;
16258
16259     # There is a blank in the left column to cause the pod formatter to
16260     # output the line as-is.
16261     return sprintf " %-*s%-*s %s\n",
16262                     # The first * in the format is replaced by this, the -1 is
16263                     # to account for the leading blank.  There isn't a
16264                     # hard-coded blank after this to separate the flags from
16265                     # the rest of the line, so that in the unlikely event that
16266                     # multiple flags are shown on the same line, they both
16267                     # will get displayed at the expense of that separation,
16268                     # but since they are left justified, a blank will be
16269                     # inserted in the normal case.
16270                     $FILLER - 1,
16271                     $flags,
16272
16273                     # The other * in the format is replaced by this number to
16274                     # cause the first main column to right fill with blanks.
16275                     # The -1 is for the guaranteed blank following it.
16276                     $first_column_width - $FILLER - 1,
16277                     $entry,
16278                     $info;
16279 }
16280
16281 my @zero_match_tables;  # List of tables that have no matches in this release
16282
16283 sub make_re_pod_entries($) {
16284     # This generates the entries for the pod file for a given table.
16285     # Also done at this time are any children tables.  The output looks like:
16286     # \p{Common}              \p{Script=Common} (Short: \p{Zyyy}) (5178)
16287
16288     my $input_table = shift;        # Table the entry is for
16289     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16290
16291     # Generate parent and all its children at the same time.
16292     return if $input_table->parent != $input_table;
16293
16294     my $property = $input_table->property;
16295     my $type = $property->type;
16296     my $full_name = $property->full_name;
16297
16298     my $count = $input_table->count;
16299     my $unicode_count;
16300     my $non_unicode_string;
16301     if ($count > $MAX_UNICODE_CODEPOINTS) {
16302         $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16303                                     - $MAX_UNICODE_CODEPOINT);
16304         $non_unicode_string = " plus all above-Unicode code points";
16305     }
16306     else {
16307         $unicode_count = $count;
16308         $non_unicode_string = "";
16309     }
16310
16311     my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16312
16313     my $definition = $input_table->calculate_table_definition;
16314     if ($definition) {
16315
16316         # Save the definition for later use.
16317         $input_table->set_definition($definition);
16318
16319         $definition = ": $definition";
16320     }
16321
16322     my $status = $input_table->status;
16323     my $status_info = $input_table->status_info;
16324     my $caseless_equivalent = $input_table->caseless_equivalent;
16325
16326     # Don't mention a placeholder equivalent as it isn't to be listed in the
16327     # pod
16328     $caseless_equivalent = 0 if $caseless_equivalent != 0
16329                                 && $caseless_equivalent->fate > $ORDINARY;
16330
16331     my $entry_for_first_table; # The entry for the first table output.
16332                            # Almost certainly, it is the parent.
16333
16334     # For each related table (including itself), we will generate a pod entry
16335     # for each name each table goes by
16336     foreach my $table ($input_table, $input_table->children) {
16337
16338         # utf8_heavy.pl cannot deal with null string property values, so skip
16339         # any tables that have no non-null names.
16340         next if ! grep { $_->name ne "" } $table->aliases;
16341
16342         # First, gather all the info that applies to this table as a whole.
16343
16344         push @zero_match_tables, $table if $count == 0
16345                                             # Don't mention special tables
16346                                             # as being zero length
16347                                            && $table->fate == $ORDINARY;
16348
16349         my $table_property = $table->property;
16350
16351         # The short name has all the underscores removed, while the full name
16352         # retains them.  Later, we decide whether to output a short synonym
16353         # for the full one, we need to compare apples to apples, so we use the
16354         # short name's length including underscores.
16355         my $table_property_short_name_length;
16356         my $table_property_short_name
16357             = $table_property->short_name(\$table_property_short_name_length);
16358         my $table_property_full_name = $table_property->full_name;
16359
16360         # Get how much savings there is in the short name over the full one
16361         # (delta will always be <= 0)
16362         my $table_property_short_delta = $table_property_short_name_length
16363                                          - length($table_property_full_name);
16364         my @table_description = $table->description;
16365         my @table_note = $table->note;
16366
16367         # Generate an entry for each alias in this table.
16368         my $entry_for_first_alias;  # saves the first one encountered.
16369         foreach my $alias ($table->aliases) {
16370
16371             # Skip if not to go in pod.
16372             next unless $alias->make_re_pod_entry;
16373
16374             # Start gathering all the components for the entry
16375             my $name = $alias->name;
16376
16377             # Skip if name is empty, as can't be accessed by regexes.
16378             next if $name eq "";
16379
16380             my $entry;      # Holds the left column, may include extras
16381             my $entry_ref;  # To refer to the left column's contents from
16382                             # another entry; has no extras
16383
16384             # First the left column of the pod entry.  Tables for the $perl
16385             # property always use the single form.
16386             if ($table_property == $perl) {
16387                 $entry = "\\p{$name}";
16388                 $entry .= " \\p$name" if length $name == 1; # Show non-braced
16389                                                             # form too
16390                 $entry_ref = "\\p{$name}";
16391             }
16392             else {    # Compound form.
16393
16394                 # Only generate one entry for all the aliases that mean true
16395                 # or false in binary properties.  Append a '*' to indicate
16396                 # some are missing.  (The heading comment notes this.)
16397                 my $rhs;
16398                 if ($type == $BINARY) {
16399                     next if $name ne 'N' && $name ne 'Y';
16400                     $rhs = "$name*";
16401                 }
16402                 elsif ($type != $FORCED_BINARY) {
16403                     $rhs = $name;
16404                 }
16405                 else {
16406
16407                     # Forced binary properties require special handling.  It
16408                     # has two sets of tables, one set is true/false; and the
16409                     # other set is everything else.  Entries are generated for
16410                     # each set.  Use the Bidi_Mirrored property (which appears
16411                     # in all Unicode versions) to get a list of the aliases
16412                     # for the true/false tables.  Of these, only output the N
16413                     # and Y ones, the same as, a regular binary property.  And
16414                     # output all the rest, same as a non-binary property.
16415                     my $bm = property_ref("Bidi_Mirrored");
16416                     if ($name eq 'N' || $name eq 'Y') {
16417                         $rhs = "$name*";
16418                     } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16419                                                         $bm->table("N")->aliases)
16420                     {
16421                         next;
16422                     }
16423                     else {
16424                         $rhs = $name;
16425                     }
16426                 }
16427
16428                 # Colon-space is used to give a little more space to be easier
16429                 # to read;
16430                 $entry = "\\p{"
16431                         . $table_property_full_name
16432                         . ": $rhs}";
16433
16434                 # But for the reference to this entry, which will go in the
16435                 # right column, where space is at a premium, use equals
16436                 # without a space
16437                 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16438             }
16439
16440             # Then the right (info) column.  This is stored as components of
16441             # an array for the moment, then joined into a string later.  For
16442             # non-internal only properties, begin the info with the entry for
16443             # the first table we encountered (if any), as things are ordered
16444             # so that that one is the most descriptive.  This leads to the
16445             # info column of an entry being a more descriptive version of the
16446             # name column
16447             my @info;
16448             if ($name =~ /^_/) {
16449                 push @info,
16450                         '(For internal use by Perl, not necessarily stable)';
16451             }
16452             elsif ($entry_for_first_alias) {
16453                 push @info, $entry_for_first_alias;
16454             }
16455
16456             # If this entry is equivalent to another, add that to the info,
16457             # using the first such table we encountered
16458             if ($entry_for_first_table) {
16459                 if (@info) {
16460                     push @info, "(= $entry_for_first_table)";
16461                 }
16462                 else {
16463                     push @info, $entry_for_first_table;
16464                 }
16465             }
16466
16467             # If the name is a large integer, add an equivalent with an
16468             # exponent for better readability
16469             if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16470                 push @info, sprintf "(= %.1e)", $name
16471             }
16472
16473             my $parenthesized = "";
16474             if (! $entry_for_first_alias) {
16475
16476                 # This is the first alias for the current table.  The alias
16477                 # array is ordered so that this is the fullest, most
16478                 # descriptive alias, so it gets the fullest info.  The other
16479                 # aliases are mostly merely pointers to this one, using the
16480                 # information already added above.
16481
16482                 # Display any status message, but only on the parent table
16483                 if ($status && ! $entry_for_first_table) {
16484                     push @info, $status_info;
16485                 }
16486
16487                 # Put out any descriptive info
16488                 if (@table_description || @table_note) {
16489                     push @info, join "; ", @table_description, @table_note;
16490                 }
16491
16492                 # Look to see if there is a shorter name we can point people
16493                 # at
16494                 my $standard_name = standardize($name);
16495                 my $short_name;
16496                 my $proposed_short = $table->short_name;
16497                 if (defined $proposed_short) {
16498                     my $standard_short = standardize($proposed_short);
16499
16500                     # If the short name is shorter than the standard one, or
16501                     # even it it's not, but the combination of it and its
16502                     # short property name (as in \p{prop=short} ($perl doesn't
16503                     # have this form)) saves at least two characters, then,
16504                     # cause it to be listed as a shorter synonym.
16505                     if (length $standard_short < length $standard_name
16506                         || ($table_property != $perl
16507                             && (length($standard_short)
16508                                 - length($standard_name)
16509                                 + $table_property_short_delta)  # (<= 0)
16510                                 < -2))
16511                     {
16512                         $short_name = $proposed_short;
16513                         if ($table_property != $perl) {
16514                             $short_name = $table_property_short_name
16515                                           . "=$short_name";
16516                         }
16517                         $short_name = "\\p{$short_name}";
16518                     }
16519                 }
16520
16521                 # And if this is a compound form name, see if there is a
16522                 # single form equivalent
16523                 my $single_form;
16524                 if ($table_property != $perl && $table_property != $block) {
16525
16526                     # Special case the binary N tables, so that will print
16527                     # \P{single}, but use the Y table values to populate
16528                     # 'single', as we haven't likewise populated the N table.
16529                     # For forced binary tables, we can't just look at the N
16530                     # table, but must see if this table is equivalent to the N
16531                     # one, as there are two equivalent beasts in these
16532                     # properties.
16533                     my $test_table;
16534                     my $p;
16535                     if (   ($type == $BINARY
16536                             && $input_table == $property->table('No'))
16537                         || ($type == $FORCED_BINARY
16538                             && $property->table('No')->
16539                                         is_set_equivalent_to($input_table)))
16540                     {
16541                         $test_table = $property->table('Yes');
16542                         $p = 'P';
16543                     }
16544                     else {
16545                         $test_table = $input_table;
16546                         $p = 'p';
16547                     }
16548
16549                     # Look for a single form amongst all the children.
16550                     foreach my $table ($test_table->children) {
16551                         next if $table->property != $perl;
16552                         my $proposed_name = $table->short_name;
16553                         next if ! defined $proposed_name;
16554
16555                         # Don't mention internal-only properties as a possible
16556                         # single form synonym
16557                         next if substr($proposed_name, 0, 1) eq '_';
16558
16559                         $proposed_name = "\\$p\{$proposed_name}";
16560                         if (! defined $single_form
16561                             || length($proposed_name) < length $single_form)
16562                         {
16563                             $single_form = $proposed_name;
16564
16565                             # The goal here is to find a single form; not the
16566                             # shortest possible one.  We've already found a
16567                             # short name.  So, stop at the first single form
16568                             # found, which is likely to be closer to the
16569                             # original.
16570                             last;
16571                         }
16572                     }
16573                 }
16574
16575                 # Output both short and single in the same parenthesized
16576                 # expression, but with only one of 'Single', 'Short' if there
16577                 # are both items.
16578                 if ($short_name || $single_form || $table->conflicting) {
16579                     $parenthesized .= "Short: $short_name" if $short_name;
16580                     if ($short_name && $single_form) {
16581                         $parenthesized .= ', ';
16582                     }
16583                     elsif ($single_form) {
16584                         $parenthesized .= 'Single: ';
16585                     }
16586                     $parenthesized .= $single_form if $single_form;
16587                 }
16588             }
16589
16590             if ($caseless_equivalent != 0) {
16591                 $parenthesized .=  '; ' if $parenthesized ne "";
16592                 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16593             }
16594
16595
16596             # Warn if this property isn't the same as one that a
16597             # semi-casual user might expect.  The other components of this
16598             # parenthesized structure are calculated only for the first entry
16599             # for this table, but the conflicting is deemed important enough
16600             # to go on every entry.
16601             my $conflicting = join " NOR ", $table->conflicting;
16602             if ($conflicting) {
16603                 $parenthesized .=  '; ' if $parenthesized ne "";
16604                 $parenthesized .= "NOT $conflicting";
16605             }
16606
16607             push @info, "($parenthesized)" if $parenthesized;
16608
16609             if ($name =~ /_$/ && $alias->loose_match) {
16610                 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16611             }
16612
16613             if ($table_property != $perl && $table->perl_extension) {
16614                 push @info, '(Perl extension)';
16615             }
16616             my $definition = $table->definition // "";
16617             $definition = "" if $entry_for_first_alias;
16618             $definition = ": $definition" if $definition;
16619             push @info, "($string_count$definition)";
16620
16621             # Now, we have both the entry and info so add them to the
16622             # list of all the properties.
16623             push @match_properties,
16624                 format_pod_line($indent_info_column,
16625                                 $entry,
16626                                 join( " ", @info),
16627                                 $alias->status,
16628                                 $alias->loose_match);
16629
16630             $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16631         } # End of looping through the aliases for this table.
16632
16633         if (! $entry_for_first_table) {
16634             $entry_for_first_table = $entry_for_first_alias;
16635         }
16636     } # End of looping through all the related tables
16637     return;
16638 }
16639
16640 sub make_ucd_table_pod_entries {
16641     my $table = shift;
16642
16643     # Generate the entries for the UCD section of the pod for $table.  This
16644     # also calculates if names are ambiguous, so has to be called even if the
16645     # pod is not being output
16646
16647     my $short_name = $table->name;
16648     my $standard_short_name = standardize($short_name);
16649     my $full_name = $table->full_name;
16650     my $standard_full_name = standardize($full_name);
16651
16652     my $full_info = "";     # Text of info column for full-name entries
16653     my $other_info = "";    # Text of info column for short-name entries
16654     my $short_info = "";    # Text of info column for other entries
16655     my $meaning = "";       # Synonym of this table
16656
16657     my $property = ($table->isa('Property'))
16658                    ? $table
16659                    : $table->parent->property;
16660
16661     my $perl_extension = $table->perl_extension;
16662     my $is_perl_extension_match_table_but_not_dollar_perl
16663                                                         = $property != $perl
16664                                                        && $perl_extension
16665                                                        && $property != $table;
16666
16667     # Get the more official name for for perl extensions that aren't
16668     # stand-alone properties
16669     if ($is_perl_extension_match_table_but_not_dollar_perl) {
16670         if ($property->type == $BINARY) {
16671             $meaning = $property->full_name;
16672         }
16673         else {
16674             $meaning = $table->parent->complete_name;
16675         }
16676     }
16677
16678     # There are three types of info column.  One for the short name, one for
16679     # the full name, and one for everything else.  They mostly are the same,
16680     # so initialize in the same loop.
16681
16682     foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16683         if ($info_ref != \$full_info) {
16684
16685             # The non-full name columns include the full name
16686             $$info_ref .= $full_name;
16687         }
16688
16689
16690         if ($is_perl_extension_match_table_but_not_dollar_perl) {
16691
16692             # Add the synonymous name for the non-full name entries; and to
16693             # the full-name entry if it adds extra information
16694             if (   standardize($meaning) ne $standard_full_name
16695                 || $info_ref == \$other_info
16696                 || $info_ref == \$short_info)
16697             {
16698                 my $parenthesized =  $info_ref != \$full_info;
16699                 $$info_ref .= " " if $$info_ref && $parenthesized;
16700                 $$info_ref .= "(=" if $parenthesized;
16701                 $$info_ref .= "$meaning";
16702                 $$info_ref .= ")" if $parenthesized;
16703                 $$info_ref .= ".";
16704             }
16705         }
16706
16707         # And the full-name entry includes the short name, if shorter
16708         if ($info_ref == \$full_info
16709             && length $standard_short_name < length $standard_full_name)
16710         {
16711             $full_info =~ s/\.\Z//;
16712             $full_info .= "  " if $full_info;
16713             $full_info .= "(Short: $short_name)";
16714         }
16715
16716         if ($table->perl_extension) {
16717             $$info_ref =~ s/\.\Z//;
16718             $$info_ref .= ".  " if $$info_ref;
16719             $$info_ref .= "(Perl extension)";
16720         }
16721     }
16722
16723     my $definition;
16724     my $definition_table;
16725     my $type = $table->property->type;
16726     if ($type == $BINARY || $type == $FORCED_BINARY) {
16727         $definition_table = $table->property->table('Y');
16728     }
16729     elsif ($table->isa('Match_Table')) {
16730         $definition_table = $table;
16731     }
16732
16733     $definition = $definition_table->calculate_table_definition
16734                                             if defined $definition_table
16735                                                     && $definition_table != 0;
16736
16737     # Add any extra annotations to the full name entry
16738     foreach my $more_info ($table->description,
16739                             $definition,
16740                             $table->note,
16741                             $table->status_info)
16742     {
16743         next unless $more_info;
16744         $full_info =~ s/\.\Z//;
16745         $full_info .= ".  " if $full_info;
16746         $full_info .= $more_info;
16747     }
16748     if ($table->property->type == $FORCED_BINARY) {
16749         if ($full_info) {
16750             $full_info =~ s/\.\Z//;
16751             $full_info .= ".  ";
16752         }
16753         $full_info .= "This is a combination property which has both:"
16754                     . " 1) a map to various string values; and"
16755                     . " 2) a map to boolean Y/N, where 'Y' means the"
16756                     . " string value is non-empty.  Add the prefix 'is'"
16757                     . " to the prop_invmap() call to get the latter";
16758     }
16759
16760     # These keep track if have created full and short name pod entries for the
16761     # property
16762     my $done_full = 0;
16763     my $done_short = 0;
16764
16765     # Every possible name is kept track of, even those that aren't going to be
16766     # output.  This way we can be sure to find the ambiguities.
16767     foreach my $alias ($table->aliases) {
16768         my $name = $alias->name;
16769         my $standard = standardize($name);
16770         my $info;
16771         my $output_this = $alias->ucd;
16772
16773         # If the full and short names are the same, we want to output the full
16774         # one's entry, so it has priority.
16775         if ($standard eq $standard_full_name) {
16776             next if $done_full;
16777             $done_full = 1;
16778             $info = $full_info;
16779         }
16780         elsif ($standard eq $standard_short_name) {
16781             next if $done_short;
16782             $done_short = 1;
16783             next if $standard_short_name eq $standard_full_name;
16784             $info = $short_info;
16785         }
16786         else {
16787             $info = $other_info;
16788         }
16789
16790         $combination_property{$standard} = 1
16791                                   if $table->property->type == $FORCED_BINARY;
16792
16793         # Here, we have set up the two columns for this entry.  But if an
16794         # entry already exists for this name, we have to decide which one
16795         # we're going to later output.
16796         if (exists $ucd_pod{$standard}) {
16797
16798             # If the two entries refer to the same property, it's not going to
16799             # be ambiguous.  (Likely it's because the names when standardized
16800             # are the same.)  But that means if they are different properties,
16801             # there is ambiguity.
16802             if ($ucd_pod{$standard}->{'property'} != $property) {
16803
16804                 # Here, we have an ambiguity.  This code assumes that one is
16805                 # scheduled to be output and one not and that one is a perl
16806                 # extension (which is not to be output) and the other isn't.
16807                 # If those assumptions are wrong, things have to be rethought.
16808                 if ($ucd_pod{$standard}{'output_this'} == $output_this
16809                     || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16810                     || $output_this == $perl_extension)
16811                 {
16812                     Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
16813                 }
16814
16815                 # We modify the info column of the one being output to
16816                 # indicate the ambiguity.  Set $which to point to that one's
16817                 # info.
16818                 my $which;
16819                 if ($ucd_pod{$standard}{'output_this'}) {
16820                     $which = \$ucd_pod{$standard}->{'info'};
16821                 }
16822                 else {
16823                     $which = \$info;
16824                     $meaning = $ucd_pod{$standard}{'meaning'};
16825                 }
16826
16827                 chomp $$which;
16828                 $$which =~ s/\.\Z//;
16829                 $$which .= "; NOT '$standard' meaning '$meaning'";
16830
16831                 $ambiguous_names{$standard} = 1;
16832             }
16833
16834             # Use the non-perl-extension variant
16835             next unless $ucd_pod{$standard}{'perl_extension'};
16836         }
16837
16838         # Store enough information about this entry that we can later look for
16839         # ambiguities, and output it properly.
16840         $ucd_pod{$standard} = { 'name' => $name,
16841                                 'info' => $info,
16842                                 'meaning' => $meaning,
16843                                 'output_this' => $output_this,
16844                                 'perl_extension' => $perl_extension,
16845                                 'property' => $property,
16846                                 'status' => $alias->status,
16847         };
16848     } # End of looping through all this table's aliases
16849
16850     return;
16851 }
16852
16853 sub pod_alphanumeric_sort {
16854     # Sort pod entries alphanumerically.
16855
16856     # The first few character columns are filler, plus the '\p{'; and get rid
16857     # of all the trailing stuff, starting with the trailing '}', so as to sort
16858     # on just 'Name=Value'
16859     (my $a = lc $a) =~ s/^ .*? \{ //x;
16860     $a =~ s/}.*//;
16861     (my $b = lc $b) =~ s/^ .*? \{ //x;
16862     $b =~ s/}.*//;
16863
16864     # Determine if the two operands are both internal only or both not.
16865     # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16866     # should be the underscore that begins internal only
16867     my $a_is_internal = (substr($a, 0, 1) eq '_');
16868     my $b_is_internal = (substr($b, 0, 1) eq '_');
16869
16870     # Sort so the internals come last in the table instead of first (which the
16871     # leading underscore would otherwise indicate).
16872     if ($a_is_internal != $b_is_internal) {
16873         return 1 if $a_is_internal;
16874         return -1
16875     }
16876
16877     # Determine if the two operands are compound or not, and if so if are
16878     # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
16879     # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16880     # all of which this considers numeric, and for sorting, looks just at the
16881     # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
16882     my $split_re = qr/
16883         ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16884                      # property name
16885         [:=] \s*     # The syntax for the compound form
16886         (?:          # followed by ...
16887             (        # $2 gets defined if what follows is a "numeric"
16888                      # expression, which is ...
16889               ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
16890                                         # number, optionally signed
16891                | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
16892                                          # of these go into $3
16893              | ( V \d+ _ \d+ )           # or a Unicode's Age property version
16894                                          # number, into $4
16895             )
16896             | .* $    # If not "numeric", accept anything so that $1 gets
16897                       # defined if it is any compound form
16898         ) /ix;
16899     my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16900     my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16901
16902     # Sort alphabeticlly on the whole property name if either operand isn't
16903     # compound, or they differ.
16904     return $a cmp $b if   ! defined $a_initial
16905                        || ! defined $b_initial
16906                        || $a_initial ne $b_initial;
16907
16908     if (! defined $a_numeric) {
16909
16910         # If neither is numeric, use alpha sort
16911         return $a cmp $b if ! defined $b_numeric;
16912         return 1;  # Sort numeric ahead of alpha
16913     }
16914
16915     # Here $a is numeric
16916     return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
16917
16918     # Here they are both numeric in the same property.
16919     # Convert version numbers into regular numbers
16920     if (defined $a_version) {
16921         ($a_number = $a_version) =~ s/^V//i;
16922         $a_number =~ s/_/./;
16923     }
16924     else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
16925         $a_number =~ s/ ^ [[:alpha:]]+ //x;
16926     }
16927     if (defined $b_version) {
16928         ($b_number = $b_version) =~ s/^V//i;
16929         $b_number =~ s/_/./;
16930     }
16931     else {
16932         $b_number =~ s/ ^ [[:alpha:]]+ //x;
16933     }
16934
16935     # Convert rationals to floating for the comparison.
16936     $a_number = eval $a_number if $a_number =~ qr{/};
16937     $b_number = eval $b_number if $b_number =~ qr{/};
16938
16939     return $a_number <=> $b_number || $a cmp $b;
16940 }
16941
16942 sub make_pod () {
16943     # Create the .pod file.  This generates the various subsections and then
16944     # combines them in one big HERE document.
16945
16946     my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16947
16948     return unless defined $pod_directory;
16949     print "Making pod file\n" if $verbosity >= $PROGRESS;
16950
16951     my $exception_message =
16952     '(Any exceptions are individually noted beginning with the word NOT.)';
16953     my @block_warning;
16954     if (-e 'Blocks.txt') {
16955
16956         # Add the line: '\p{In_*}    \p{Block: *}', with the warning message
16957         # if the global $has_In_conflicts indicates we have them.
16958         push @match_properties, format_pod_line($indent_info_column,
16959                                                 '\p{In_*}',
16960                                                 '\p{Block: *}'
16961                                                     . (($has_In_conflicts)
16962                                                       ? " $exception_message"
16963                                                       : ""),
16964                                                  $DISCOURAGED);
16965         @block_warning = << "END";
16966
16967 In particular, matches in the Block property have single forms
16968 defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16969 all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
16970 C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16971 C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
16972 come along that would force Perl to change the meaning of one or more of
16973 these, and your program would no longer be correct.  Currently there are no
16974 such conflicts with the form that begins C<"In_">, but there are many with the
16975 other two shortcuts, and Unicode continues to define new properties that begin
16976 with C<"In">, so it's quite possible that a conflict will occur in the future.
16977 The compound form is guaranteed to not become obsolete, and its meaning is
16978 clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
16979 END
16980     }
16981     my $text = $Is_flags_text;
16982     $text = "$exception_message $text" if $has_Is_conflicts;
16983
16984     # And the 'Is_ line';
16985     push @match_properties, format_pod_line($indent_info_column,
16986                                             '\p{Is_*}',
16987                                             "\\p{*} $text");
16988
16989     # Sort the properties array for output.  It is sorted alphabetically
16990     # except numerically for numeric properties, and only output unique lines.
16991     @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16992
16993     my $formatted_properties = simple_fold(\@match_properties,
16994                                         "",
16995                                         # indent succeeding lines by two extra
16996                                         # which looks better
16997                                         $indent_info_column + 2,
16998
16999                                         # shorten the line length by how much
17000                                         # the formatter indents, so the folded
17001                                         # line will fit in the space
17002                                         # presumably available
17003                                         $automatic_pod_indent);
17004     # Add column headings, indented to be a little more centered, but not
17005     # exactly
17006     $formatted_properties =  format_pod_line($indent_info_column,
17007                                                     '    NAME',
17008                                                     '           INFO')
17009                                     . "\n"
17010                                     . $formatted_properties;
17011
17012     # Generate pod documentation lines for the tables that match nothing
17013     my $zero_matches = "";
17014     if (@zero_match_tables) {
17015         @zero_match_tables = uniques(@zero_match_tables);
17016         $zero_matches = join "\n\n",
17017                         map { $_ = '=item \p{' . $_->complete_name . "}" }
17018                             sort { $a->complete_name cmp $b->complete_name }
17019                             @zero_match_tables;
17020
17021         $zero_matches = <<END;
17022
17023 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
17024
17025 Unicode has some property-value pairs that currently don't match anything.
17026 This happens generally either because they are obsolete, or they exist for
17027 symmetry with other forms, but no language has yet been encoded that uses
17028 them.  In this version of Unicode, the following match zero code points:
17029
17030 =over 4
17031
17032 $zero_matches
17033
17034 =back
17035
17036 END
17037     }
17038
17039     # Generate list of properties that we don't accept, grouped by the reasons
17040     # why.  This is so only put out the 'why' once, and then list all the
17041     # properties that have that reason under it.
17042
17043     my %why_list;   # The keys are the reasons; the values are lists of
17044                     # properties that have the key as their reason
17045
17046     # For each property, add it to the list that are suppressed for its reason
17047     # The sort will cause the alphabetically first properties to be added to
17048     # each list first, so each list will be sorted.
17049     foreach my $property (sort keys %why_suppressed) {
17050         next unless $why_suppressed{$property};
17051         push @{$why_list{$why_suppressed{$property}}}, $property;
17052     }
17053
17054     # For each reason (sorted by the first property that has that reason)...
17055     my @bad_re_properties;
17056     foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
17057                      keys %why_list)
17058     {
17059         # Add to the output, all the properties that have that reason.
17060         my $has_item = 0;   # Flag if actually output anything.
17061         foreach my $name (@{$why_list{$why}}) {
17062
17063             # Split compound names into $property and $table components
17064             my $property = $name;
17065             my $table;
17066             if ($property =~ / (.*) = (.*) /x) {
17067                 $property = $1;
17068                 $table = $2;
17069             }
17070
17071             # This release of Unicode may not have a property that is
17072             # suppressed, so don't reference a non-existent one.
17073             $property = property_ref($property);
17074             next if ! defined $property;
17075
17076             # And since this list is only for match tables, don't list the
17077             # ones that don't have match tables.
17078             next if ! $property->to_create_match_tables;
17079
17080             # Find any abbreviation, and turn it into a compound name if this
17081             # is a property=value pair.
17082             my $short_name = $property->name;
17083             $short_name .= '=' . $property->table($table)->name if $table;
17084
17085             # Start with an empty line.
17086             push @bad_re_properties, "\n\n" unless $has_item;
17087
17088             # And add the property as an item for the reason.
17089             push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
17090             $has_item = 1;
17091         }
17092
17093         # And add the reason under the list of properties, if such a list
17094         # actually got generated.  Note that the header got added
17095         # unconditionally before.  But pod ignores extra blank lines, so no
17096         # harm.
17097         push @bad_re_properties, "\n$why\n" if $has_item;
17098
17099     } # End of looping through each reason.
17100
17101     if (! @bad_re_properties) {
17102         push @bad_re_properties,
17103                 "*** This installation accepts ALL non-Unihan properties ***";
17104     }
17105     else {
17106         # Add =over only if non-empty to avoid an empty =over/=back section,
17107         # which is considered bad form.
17108         unshift @bad_re_properties, "\n=over 4\n";
17109         push @bad_re_properties, "\n=back\n";
17110     }
17111
17112     # Similarly, generate a list of files that we don't use, grouped by the
17113     # reasons why (Don't output if the reason is empty).  First, create a hash
17114     # whose keys are the reasons, and whose values are anonymous arrays of all
17115     # the files that share that reason.
17116     my %grouped_by_reason;
17117     foreach my $file (keys %skipped_files) {
17118         next unless $skipped_files{$file};
17119         push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
17120     }
17121
17122     # Then, sort each group.
17123     foreach my $group (keys %grouped_by_reason) {
17124         @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
17125                                         @{$grouped_by_reason{$group}} ;
17126     }
17127
17128     # Finally, create the output text.  For each reason (sorted by the
17129     # alphabetically first file that has that reason)...
17130     my @unused_files;
17131     foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
17132                                cmp lc $grouped_by_reason{$b}->[0]
17133                               }
17134                          keys %grouped_by_reason)
17135     {
17136         # Add all the files that have that reason to the output.  Start
17137         # with an empty line.
17138         push @unused_files, "\n\n";
17139         push @unused_files, map { "\n=item F<$_> \n" }
17140                             @{$grouped_by_reason{$reason}};
17141         # And add the reason under the list of files
17142         push @unused_files, "\n$reason\n";
17143     }
17144
17145     # Similarly, create the output text for the UCD section of the pod
17146     my @ucd_pod;
17147     foreach my $key (keys %ucd_pod) {
17148         next unless $ucd_pod{$key}->{'output_this'};
17149         push @ucd_pod, format_pod_line($indent_info_column,
17150                                        $ucd_pod{$key}->{'name'},
17151                                        $ucd_pod{$key}->{'info'},
17152                                        $ucd_pod{$key}->{'status'},
17153                                       );
17154     }
17155
17156     # Sort alphabetically, and fold for output
17157     @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
17158     my $ucd_pod = simple_fold(\@ucd_pod,
17159                            ' ',
17160                            $indent_info_column,
17161                            $automatic_pod_indent);
17162     $ucd_pod =  format_pod_line($indent_info_column, 'NAME', '  INFO')
17163                 . "\n"
17164                 . $ucd_pod;
17165     my $space_hex = sprintf("%02x", ord " ");
17166     local $" = "";
17167
17168     # Everything is ready to assemble.
17169     my @OUT = << "END";
17170 =begin comment
17171
17172 $HEADER
17173
17174 To change this file, edit $0 instead.
17175
17176 =end comment
17177
17178 =head1 NAME
17179
17180 $pod_file - Index of Unicode Version $unicode_version character properties in Perl
17181
17182 =head1 DESCRIPTION
17183
17184 This document provides information about the portion of the Unicode database
17185 that deals with character properties, that is the portion that is defined on
17186 single code points.  (L</Other information in the Unicode data base>
17187 below briefly mentions other data that Unicode provides.)
17188
17189 Perl can provide access to all non-provisional Unicode character properties,
17190 though not all are enabled by default.  The omitted ones are the Unihan
17191 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
17192 deprecated or Unicode-internal properties.  (An installation may choose to
17193 recompile Perl's tables to change this.  See L<Unicode character
17194 properties that are NOT accepted by Perl>.)
17195
17196 For most purposes, access to Unicode properties from the Perl core is through
17197 regular expression matches, as described in the next section.
17198 For some special purposes, and to access the properties that are not suitable
17199 for regular expression matching, all the Unicode character properties that
17200 Perl handles are accessible via the standard L<Unicode::UCD> module, as
17201 described in the section L</Properties accessible through Unicode::UCD>.
17202
17203 Perl also provides some additional extensions and short-cut synonyms
17204 for Unicode properties.
17205
17206 This document merely lists all available properties and does not attempt to
17207 explain what each property really means.  There is a brief description of each
17208 Perl extension; see L<perlunicode/Other Properties> for more information on
17209 these.  There is some detail about Blocks, Scripts, General_Category,
17210 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
17211 official Unicode properties, refer to the Unicode standard.  A good starting
17212 place is L<$unicode_reference_url>.
17213
17214 Note that you can define your own properties; see
17215 L<perlunicode/"User-Defined Character Properties">.
17216
17217 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
17218
17219 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17220 most of the Unicode character properties.  The table below shows all these
17221 constructs, both single and compound forms.
17222
17223 B<Compound forms> consist of two components, separated by an equals sign or a
17224 colon.  The first component is the property name, and the second component is
17225 the particular value of the property to match against, for example,
17226 C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17227 to match characters whose Script_Extensions property value is Greek.
17228 (C<Script_Extensions> is an improved version of the C<Script> property.)
17229
17230 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17231 their equivalent compound forms.  The table shows these equivalences.  (In our
17232 example, C<\\p{Greek}> is a just a shortcut for
17233 C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
17234 forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
17235 These are also listed in the table.
17236
17237 In parsing these constructs, Perl always ignores Upper/lower case differences
17238 everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
17239 C<\\p{greek}>.  But note that changing the case of the C<"p"> or C<"P"> before
17240 the left brace completely changes the meaning of the construct, from "match"
17241 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>).  Casing in this document is
17242 for improved legibility.
17243
17244 Also, white space, hyphens, and underscores are normally ignored
17245 everywhere between the {braces}, and hence can be freely added or removed
17246 even if the C</x> modifier hasn't been specified on the regular expression.
17247 But in the table below $a_bold_stricter at the beginning of an entry
17248 means that tighter (stricter) rules are used for that entry:
17249
17250 =over 4
17251
17252 =over 4
17253
17254 =item Single form (C<\\p{name}>) tighter rules:
17255
17256 White space, hyphens, and underscores ARE significant
17257 except for:
17258
17259 =over 4
17260
17261 =item * white space adjacent to a non-word character
17262
17263 =item * underscores separating digits in numbers
17264
17265 =back
17266
17267 That means, for example, that you can freely add or remove white space
17268 adjacent to (but within) the braces without affecting the meaning.
17269
17270 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17271
17272 The tighter rules given above for the single form apply to everything to the
17273 right of the colon or equals; the looser rules still apply to everything to
17274 the left.
17275
17276 That means, for example, that you can freely add or remove white space
17277 adjacent to (but within) the braces and the colon or equal sign.
17278
17279 =back
17280
17281 =back
17282
17283 Some properties are considered obsolete by Unicode, but still available.
17284 There are several varieties of obsolescence:
17285
17286 =over 4
17287
17288 =over 4
17289
17290 =item Stabilized
17291
17292 A property may be stabilized.  Such a determination does not indicate
17293 that the property should or should not be used; instead it is a declaration
17294 that the property will not be maintained nor extended for newly encoded
17295 characters.  Such properties are marked with $a_bold_stabilized in the
17296 table.
17297
17298 =item Deprecated
17299
17300 A property may be deprecated, perhaps because its original intent
17301 has been replaced by another property, or because its specification was
17302 somehow defective.  This means that its use is strongly
17303 discouraged, so much so that a warning will be issued if used, unless the
17304 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17305 statement.  $A_bold_deprecated flags each such entry in the table, and
17306 the entry there for the longest, most descriptive version of the property will
17307 give the reason it is deprecated, and perhaps advice.  Perl may issue such a
17308 warning, even for properties that aren't officially deprecated by Unicode,
17309 when there used to be characters or code points that were matched by them, but
17310 no longer.  This is to warn you that your program may not work like it did on
17311 earlier Unicode releases.
17312
17313 A deprecated property may be made unavailable in a future Perl version, so it
17314 is best to move away from them.
17315
17316 A deprecated property may also be stabilized, but this fact is not shown.
17317
17318 =item Obsolete
17319
17320 Properties marked with $a_bold_obsolete in the table are considered (plain)
17321 obsolete.  Generally this designation is given to properties that Unicode once
17322 used for internal purposes (but not any longer).
17323
17324 =item Discouraged
17325
17326 This is not actually a Unicode-specified obsolescence, but applies to certain
17327 Perl extensions that are present for backwards compatibility, but are
17328 discouraged from being used.  These are not obsolete, but their meanings are
17329 not stable.  Future Unicode versions could force any of these extensions to be
17330 removed without warning, replaced by another property with the same name that
17331 means something different.  $A_bold_discouraged flags each such entry in the
17332 table.  Use the equivalent shown instead.
17333
17334 @block_warning
17335
17336 =back
17337
17338 =back
17339
17340 The table below has two columns.  The left column contains the C<\\p{}>
17341 constructs to look up, possibly preceded by the flags mentioned above; and
17342 the right column contains information about them, like a description, or
17343 synonyms.  The table shows both the single and compound forms for each
17344 property that has them.  If the left column is a short name for a property,
17345 the right column will give its longer, more descriptive name; and if the left
17346 column is the longest name, the right column will show any equivalent shortest
17347 name, in both single and compound forms if applicable.
17348
17349 If braces are not needed to specify a property (e.g., C<\\pL>), the left
17350 column contains both forms, with and without braces.
17351
17352 The right column will also caution you if a property means something different
17353 than what might normally be expected.
17354
17355 All single forms are Perl extensions; a few compound forms are as well, and
17356 are noted as such.
17357
17358 Numbers in (parentheses) indicate the total number of Unicode code points
17359 matched by the property.  For the entries that give the longest, most
17360 descriptive version of the property, the count is followed by a list of some
17361 of the code points matched by it.  The list includes all the matched
17362 characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17363 a regular expression bracketed character class.  Following that, the next few
17364 higher matching ranges are also given.  To avoid visual ambiguity, the SPACE
17365 character is represented as C<\\x$space_hex>.
17366
17367 For emphasis, those properties that match no code points at all are listed as
17368 well in a separate section following the table.
17369
17370 Most properties match the same code points regardless of whether C<"/i">
17371 case-insensitive matching is specified or not.  But a few properties are
17372 affected.  These are shown with the notation S<C<(/i= I<other_property>)>>
17373 in the second column.  Under case-insensitive matching they match the
17374 same code pode points as the property I<other_property>.
17375
17376 There is no description given for most non-Perl defined properties (See
17377 L<$unicode_reference_url> for that).
17378
17379 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17380 combinations.  For example, entries like:
17381
17382  \\p{Gc: *}                                  \\p{General_Category: *}
17383
17384 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17385 for the latter is also valid for the former.  Similarly,
17386
17387  \\p{Is_*}                                   \\p{*}
17388
17389 means that if and only if, for example, C<\\p{Foo}> exists, then
17390 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17391 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17392 C<\\p{IsFoo=Bar}>.  "*" here is restricted to something not beginning with an
17393 underscore.
17394
17395 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17396 And 'No', 'F', and 'False' are all synonyms for 'N'.  The table shows 'Y*' and
17397 'N*' to indicate this, and doesn't have separate entries for the other
17398 possibilities.  Note that not all properties which have values 'Yes' and 'No'
17399 are binary, and they have all their values spelled out without using this wild
17400 card, and a C<NOT> clause in their description that highlights their not being
17401 binary.  These also require the compound form to match them, whereas true
17402 binary properties have both single and compound forms available.
17403
17404 Note that all non-essential underscores are removed in the display of the
17405 short names below.
17406
17407 B<Legend summary:>
17408
17409 =over 4
17410
17411 =item Z<>B<*> is a wild-card
17412
17413 =item B<(\\d+)> in the info column gives the number of Unicode code points matched
17414 by this property.
17415
17416 =item B<$DEPRECATED> means this is deprecated.
17417
17418 =item B<$OBSOLETE> means this is obsolete.
17419
17420 =item B<$STABILIZED> means this is stabilized.
17421
17422 =item B<$STRICTER> means tighter (stricter) name matching applies.
17423
17424 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17425 stable.
17426
17427 =back
17428
17429 $formatted_properties
17430
17431 $zero_matches
17432
17433 =head1 Properties accessible through Unicode::UCD
17434
17435 The value of any Unicode (not including Perl extensions) character
17436 property mentioned above for any single code point is available through
17437 L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
17438 values of all the Unicode properties for a given code point.
17439
17440 Besides these, all the Unicode character properties mentioned above
17441 (except for those marked as for internal use by Perl) are also
17442 accessible by L<Unicode::UCD/prop_invlist()>.
17443
17444 Due to their nature, not all Unicode character properties are suitable for
17445 regular expression matches, nor C<prop_invlist()>.  The remaining
17446 non-provisional, non-internal ones are accessible via
17447 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17448 hasn't included; see L<below for which those are|/Unicode character properties
17449 that are NOT accepted by Perl>).
17450
17451 For compatibility with other parts of Perl, all the single forms given in the
17452 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17453 are recognized.  BUT, there are some ambiguities between some Perl extensions
17454 and the Unicode properties, all of which are silently resolved in favor of the
17455 official Unicode property.  To avoid surprises, you should only use
17456 C<prop_invmap()> for forms listed in the table below, which omits the
17457 non-recommended ones.  The affected forms are the Perl single form equivalents
17458 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17459 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17460 whose short name is C<sc>.  The table indicates the current ambiguities in the
17461 INFO column, beginning with the word C<"NOT">.
17462
17463 The standard Unicode properties listed below are documented in
17464 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17465 L<Unicode::UCD/prop_invmap()>.  The other Perl extensions are in
17466 L<perlunicode/Other Properties>;
17467
17468 The first column in the table is a name for the property; the second column is
17469 an alternative name, if any, plus possibly some annotations.  The alternative
17470 name is the property's full name, unless that would simply repeat the first
17471 column, in which case the second column indicates the property's short name
17472 (if different).  The annotations are given only in the entry for the full
17473 name.  The annotations for binary properties include a list of the first few
17474 ranges that the property matches.  To avoid any ambiguity, the SPACE character
17475 is represented as C<\\x$space_hex>.
17476
17477 If a property is obsolete, etc, the entry will be flagged with the same
17478 characters used in the table in the L<section above|/Properties accessible
17479 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17480
17481 $ucd_pod
17482
17483 =head1 Properties accessible through other means
17484
17485 Certain properties are accessible also via core function calls.  These are:
17486
17487  Lowercase_Mapping          lc() and lcfirst()
17488  Titlecase_Mapping          ucfirst()
17489  Uppercase_Mapping          uc()
17490
17491 Also, Case_Folding is accessible through the C</i> modifier in regular
17492 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17493 operator.
17494
17495 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
17496 interpolation in double-quoted strings and regular expressions; and functions
17497 C<charnames::viacode()>, C<charnames::vianame()>, and
17498 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
17499 specified.
17500
17501 Finally, most properties related to decomposition are accessible via
17502 L<Unicode::Normalize>.
17503
17504 =head1 Unicode character properties that are NOT accepted by Perl
17505
17506 Perl will generate an error for a few character properties in Unicode when
17507 used in a regular expression.  The non-Unihan ones are listed below, with the
17508 reasons they are not accepted, perhaps with work-arounds.  The short names for
17509 the properties are listed enclosed in (parentheses).
17510 As described after the list, an installation can change the defaults and choose
17511 to accept any of these.  The list is machine generated based on the
17512 choices made for the installation that generated this document.
17513
17514 @bad_re_properties
17515
17516 An installation can choose to allow any of these to be matched by downloading
17517 the Unicode database from L<http://www.unicode.org/Public/> to
17518 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17519 controlling lists contained in the program
17520 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17521 (C<\%Config> is available from the Config module).
17522
17523 Also, perl can be recompiled to operate on an earlier version of the Unicode
17524 standard.  Further information is at
17525 C<\$Config{privlib}>/F<unicore/README.perl>.
17526
17527 =head1 Other information in the Unicode data base
17528
17529 The Unicode data base is delivered in two different formats.  The XML version
17530 is valid for more modern Unicode releases.  The other version is a collection
17531 of files.  The two are intended to give equivalent information.  Perl uses the
17532 older form; this allows you to recompile Perl to use early Unicode releases.
17533
17534 The only non-character property that Perl currently supports is Named
17535 Sequences, in which a sequence of code points
17536 is given a name and generally treated as a single entity.  (Perl supports
17537 these via the C<\\N{...}> double-quotish construct,
17538 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17539
17540 Below is a list of the files in the Unicode data base that Perl doesn't
17541 currently use, along with very brief descriptions of their purposes.
17542 Some of the names of the files have been shortened from those that Unicode
17543 uses, in order to allow them to be distinguishable from similarly named files
17544 on file systems for which only the first 8 characters of a name are
17545 significant.
17546
17547 =over 4
17548
17549 @unused_files
17550
17551 =back
17552
17553 =head1 SEE ALSO
17554
17555 L<$unicode_reference_url>
17556
17557 L<perlrecharclass>
17558
17559 L<perlunicode>
17560
17561 END
17562
17563     # And write it.  The 0 means no utf8.
17564     main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17565     return;
17566 }
17567
17568 sub make_Heavy () {
17569     # Create and write Heavy.pl, which passes info about the tables to
17570     # utf8_heavy.pl
17571
17572     # Stringify structures for output
17573     my $loose_property_name_of
17574                            = simple_dumper(\%loose_property_name_of, ' ' x 4);
17575     chomp $loose_property_name_of;
17576
17577     my $strict_property_name_of
17578                            = simple_dumper(\%strict_property_name_of, ' ' x 4);
17579     chomp $strict_property_name_of;
17580
17581     my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17582     chomp $stricter_to_file_of;
17583
17584     my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17585     chomp $inline_definitions;
17586
17587     my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17588     chomp $loose_to_file_of;
17589
17590     my $nv_floating_to_rational
17591                            = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17592     chomp $nv_floating_to_rational;
17593
17594     my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
17595     chomp $why_deprecated;
17596
17597     # We set the key to the file when we associated files with tables, but we
17598     # couldn't do the same for the value then, as we might not have the file
17599     # for the alternate table figured out at that time.
17600     foreach my $cased (keys %caseless_equivalent_to) {
17601         my @path = $caseless_equivalent_to{$cased}->file_path;
17602         my $path;
17603         if ($path[0] eq "#") {  # Pseudo-directory '#'
17604             $path = join '/', @path;
17605         }
17606         else {  # Gets rid of lib/
17607             $path = join '/', @path[1, -1];
17608         }
17609         $caseless_equivalent_to{$cased} = $path;
17610     }
17611     my $caseless_equivalent_to
17612                            = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17613     chomp $caseless_equivalent_to;
17614
17615     my $loose_property_to_file_of
17616                         = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17617     chomp $loose_property_to_file_of;
17618
17619     my $strict_property_to_file_of
17620                         = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17621     chomp $strict_property_to_file_of;
17622
17623     my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17624     chomp $file_to_swash_name;
17625
17626     my @heavy = <<END;
17627 $HEADER
17628 $INTERNAL_ONLY_HEADER
17629
17630 # This file is for the use of utf8_heavy.pl and Unicode::UCD
17631
17632 # Maps Unicode (not Perl single-form extensions) property names in loose
17633 # standard form to their corresponding standard names
17634 \%utf8::loose_property_name_of = (
17635 $loose_property_name_of
17636 );
17637
17638 # Same, but strict names
17639 \%utf8::strict_property_name_of = (
17640 $strict_property_name_of
17641 );
17642
17643 # Gives the definitions (in the form of inversion lists) for those properties
17644 # whose definitions aren't kept in files
17645 \@utf8::inline_definitions = (
17646 $inline_definitions
17647 );
17648
17649 # Maps property, table to file for those using stricter matching.  For paths
17650 # whose directory is '#', the file is in the form of a numeric index into
17651 # \@inline_definitions
17652 \%utf8::stricter_to_file_of = (
17653 $stricter_to_file_of
17654 );
17655
17656 # Maps property, table to file for those using loose matching.  For paths
17657 # whose directory is '#', the file is in the form of a numeric index into
17658 # \@inline_definitions
17659 \%utf8::loose_to_file_of = (
17660 $loose_to_file_of
17661 );
17662
17663 # Maps floating point to fractional form
17664 \%utf8::nv_floating_to_rational = (
17665 $nv_floating_to_rational
17666 );
17667
17668 # If a %e floating point number doesn't have this number of digits in it after
17669 # the decimal point to get this close to a fraction, it isn't considered to be
17670 # that fraction even if all the digits it does have match.
17671 \$utf8::e_precision = $E_FLOAT_PRECISION;
17672
17673 # Deprecated tables to generate a warning for.  The key is the file containing
17674 # the table, so as to avoid duplication, as many property names can map to the
17675 # file, but we only need one entry for all of them.
17676 \%utf8::why_deprecated = (
17677 $why_deprecated
17678 );
17679
17680 # A few properties have different behavior under /i matching.  This maps
17681 # those to substitute files to use under /i.
17682 \%utf8::caseless_equivalent = (
17683 $caseless_equivalent_to
17684 );
17685
17686 # Property names to mapping files
17687 \%utf8::loose_property_to_file_of = (
17688 $loose_property_to_file_of
17689 );
17690
17691 # Property names to mapping files
17692 \%utf8::strict_property_to_file_of = (
17693 $strict_property_to_file_of
17694 );
17695
17696 # Files to the swash names within them.
17697 \%utf8::file_to_swash_name = (
17698 $file_to_swash_name
17699 );
17700
17701 1;
17702 END
17703
17704     main::write("Heavy.pl", 0, \@heavy);  # The 0 means no utf8.
17705     return;
17706 }
17707
17708 sub make_Name_pm () {
17709     # Create and write Name.pm, which contains subroutines and data to use in
17710     # conjunction with Name.pl
17711
17712     # Maybe there's nothing to do.
17713     return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17714
17715     my @name = <<END;
17716 $HEADER
17717 $INTERNAL_ONLY_HEADER
17718 END
17719
17720     # Convert these structures to output format.
17721     my $code_points_ending_in_code_point =
17722         main::simple_dumper(\@code_points_ending_in_code_point,
17723                             ' ' x 8);
17724     my $names = main::simple_dumper(\%names_ending_in_code_point,
17725                                     ' ' x 8);
17726     my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17727                                     ' ' x 8);
17728
17729     # Do the same with the Hangul names,
17730     my $jamo;
17731     my $jamo_l;
17732     my $jamo_v;
17733     my $jamo_t;
17734     my $jamo_re;
17735     if ($has_hangul_syllables) {
17736
17737         # Construct a regular expression of all the possible
17738         # combinations of the Hangul syllables.
17739         my @L_re;   # Leading consonants
17740         for my $i ($LBase .. $LBase + $LCount - 1) {
17741             push @L_re, $Jamo{$i}
17742         }
17743         my @V_re;   # Middle vowels
17744         for my $i ($VBase .. $VBase + $VCount - 1) {
17745             push @V_re, $Jamo{$i}
17746         }
17747         my @T_re;   # Trailing consonants
17748         for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17749             push @T_re, $Jamo{$i}
17750         }
17751
17752         # The whole re is made up of the L V T combination.
17753         $jamo_re = '('
17754                     . join ('|', sort @L_re)
17755                     . ')('
17756                     . join ('|', sort @V_re)
17757                     . ')('
17758                     . join ('|', sort @T_re)
17759                     . ')?';
17760
17761         # These hashes needed by the algorithm were generated
17762         # during reading of the Jamo.txt file
17763         $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17764         $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17765         $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17766         $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17767     }
17768
17769     push @name, <<END;
17770
17771 package charnames;
17772
17773 # This module contains machine-generated tables and code for the
17774 # algorithmically-determinable Unicode character names.  The following
17775 # routines can be used to translate between name and code point and vice versa
17776
17777 { # Closure
17778
17779     # Matches legal code point.  4-6 hex numbers, If there are 6, the first
17780     # two must be 10; if there are 5, the first must not be a 0.  Written this
17781     # way to decrease backtracking.  The first regex allows the code point to
17782     # be at the end of a word, but to work properly, the word shouldn't end
17783     # with a valid hex character.  The second one won't match a code point at
17784     # the end of a word, and doesn't have the run-on issue
17785     my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17786     my \$code_point_re = qr/$code_point_re/;
17787
17788     # In the following hash, the keys are the bases of names which include
17789     # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
17790     # of each key is another hash which is used to get the low and high ends
17791     # for each range of code points that apply to the name.
17792     my %names_ending_in_code_point = (
17793 $names
17794     );
17795
17796     # The following hash is a copy of the previous one, except is for loose
17797     # matching, so each name has blanks and dashes squeezed out
17798     my %loose_names_ending_in_code_point = (
17799 $loose_names
17800     );
17801
17802     # And the following array gives the inverse mapping from code points to
17803     # names.  Lowest code points are first
17804     my \@code_points_ending_in_code_point = (
17805 $code_points_ending_in_code_point
17806     );
17807 END
17808     # Earlier releases didn't have Jamos.  No sense outputting
17809     # them unless will be used.
17810     if ($has_hangul_syllables) {
17811         push @name, <<END;
17812
17813     # Convert from code point to Jamo short name for use in composing Hangul
17814     # syllable names
17815     my %Jamo = (
17816 $jamo
17817     );
17818
17819     # Leading consonant (can be null)
17820     my %Jamo_L = (
17821 $jamo_l
17822     );
17823
17824     # Vowel
17825     my %Jamo_V = (
17826 $jamo_v
17827     );
17828
17829     # Optional trailing consonant
17830     my %Jamo_T = (
17831 $jamo_t
17832     );
17833
17834     # Computed re that splits up a Hangul name into LVT or LV syllables
17835     my \$syllable_re = qr/$jamo_re/;
17836
17837     my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17838     my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17839
17840     # These constants names and values were taken from the Unicode standard,
17841     # version 5.1, section 3.12.  They are used in conjunction with Hangul
17842     # syllables
17843     my \$SBase = $SBase_string;
17844     my \$LBase = $LBase_string;
17845     my \$VBase = $VBase_string;
17846     my \$TBase = $TBase_string;
17847     my \$SCount = $SCount;
17848     my \$LCount = $LCount;
17849     my \$VCount = $VCount;
17850     my \$TCount = $TCount;
17851     my \$NCount = \$VCount * \$TCount;
17852 END
17853     } # End of has Jamos
17854
17855     push @name, << 'END';
17856
17857     sub name_to_code_point_special {
17858         my ($name, $loose) = @_;
17859
17860         # Returns undef if not one of the specially handled names; otherwise
17861         # returns the code point equivalent to the input name
17862         # $loose is non-zero if to use loose matching, 'name' in that case
17863         # must be input as upper case with all blanks and dashes squeezed out.
17864 END
17865     if ($has_hangul_syllables) {
17866         push @name, << 'END';
17867
17868         if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17869             || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17870         {
17871             return if $name !~ qr/^$syllable_re$/;
17872             my $L = $Jamo_L{$1};
17873             my $V = $Jamo_V{$2};
17874             my $T = (defined $3) ? $Jamo_T{$3} : 0;
17875             return ($L * $VCount + $V) * $TCount + $T + $SBase;
17876         }
17877 END
17878     }
17879     push @name, << 'END';
17880
17881         # Name must end in 'code_point' for this to handle.
17882         return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17883                    || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17884
17885         my $base = $1;
17886         my $code_point = CORE::hex $2;
17887         my $names_ref;
17888
17889         if ($loose) {
17890             $names_ref = \%loose_names_ending_in_code_point;
17891         }
17892         else {
17893             return if $base !~ s/-$//;
17894             $names_ref = \%names_ending_in_code_point;
17895         }
17896
17897         # Name must be one of the ones which has the code point in it.
17898         return if ! $names_ref->{$base};
17899
17900         # Look through the list of ranges that apply to this name to see if
17901         # the code point is in one of them.
17902         for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17903             return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17904             next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17905
17906             # Here, the code point is in the range.
17907             return $code_point;
17908         }
17909
17910         # Here, looked like the name had a code point number in it, but
17911         # did not match one of the valid ones.
17912         return;
17913     }
17914
17915     sub code_point_to_name_special {
17916         my $code_point = shift;
17917
17918         # Returns the name of a code point if algorithmically determinable;
17919         # undef if not
17920 END
17921     if ($has_hangul_syllables) {
17922         push @name, << 'END';
17923
17924         # If in the Hangul range, calculate the name based on Unicode's
17925         # algorithm
17926         if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17927             use integer;
17928             my $SIndex = $code_point - $SBase;
17929             my $L = $LBase + $SIndex / $NCount;
17930             my $V = $VBase + ($SIndex % $NCount) / $TCount;
17931             my $T = $TBase + $SIndex % $TCount;
17932             $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17933             $name .= $Jamo{$T} if $T != $TBase;
17934             return $name;
17935         }
17936 END
17937     }
17938     push @name, << 'END';
17939
17940         # Look through list of these code points for one in range.
17941         foreach my $hash (@code_points_ending_in_code_point) {
17942             return if $code_point < $hash->{'low'};
17943             if ($code_point <= $hash->{'high'}) {
17944                 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17945             }
17946         }
17947         return;            # None found
17948     }
17949 } # End closure
17950
17951 1;
17952 END
17953
17954     main::write("Name.pm", 0, \@name);  # The 0 means no utf8.
17955     return;
17956 }
17957
17958 sub make_UCD () {
17959     # Create and write UCD.pl, which passes info about the tables to
17960     # Unicode::UCD
17961
17962     # Create a mapping from each alias of Perl single-form extensions to all
17963     # its equivalent aliases, for quick look-up.
17964     my %perlprop_to_aliases;
17965     foreach my $table ($perl->tables) {
17966
17967         # First create the list of the aliases of each extension
17968         my @aliases_list;    # List of legal aliases for this extension
17969
17970         my $table_name = $table->name;
17971         my $standard_table_name = standardize($table_name);
17972         my $table_full_name = $table->full_name;
17973         my $standard_table_full_name = standardize($table_full_name);
17974
17975         # Make sure that the list has both the short and full names
17976         push @aliases_list, $table_name, $table_full_name;
17977
17978         my $found_ucd = 0;  # ? Did we actually get an alias that should be
17979                             # output for this table
17980
17981         # Go through all the aliases (including the two just added), and add
17982         # any new unique ones to the list
17983         foreach my $alias ($table->aliases) {
17984
17985             # Skip non-legal names
17986             next unless $alias->ok_as_filename;
17987             next unless $alias->ucd;
17988
17989             $found_ucd = 1;     # have at least one legal name
17990
17991             my $name = $alias->name;
17992             my $standard = standardize($name);
17993
17994             # Don't repeat a name that is equivalent to one already on the
17995             # list
17996             next if $standard eq $standard_table_name;
17997             next if $standard eq $standard_table_full_name;
17998
17999             push @aliases_list, $name;
18000         }
18001
18002         # If there were no legal names, don't output anything.
18003         next unless $found_ucd;
18004
18005         # To conserve memory in the program reading these in, omit full names
18006         # that are identical to the short name, when those are the only two
18007         # aliases for the property.
18008         if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
18009             pop @aliases_list;
18010         }
18011
18012         # Here, @aliases_list is the list of all the aliases that this
18013         # extension legally has.  Now can create a map to it from each legal
18014         # standardized alias
18015         foreach my $alias ($table->aliases) {
18016             next unless $alias->ucd;
18017             next unless $alias->ok_as_filename;
18018             push @{$perlprop_to_aliases{standardize($alias->name)}},
18019                  @aliases_list;
18020         }
18021     }
18022
18023     # Make a list of all combinations of properties/values that are suppressed.
18024     my @suppressed;
18025     if (! $debug_skip) {    # This tends to fail in this debug mode
18026         foreach my $property_name (keys %why_suppressed) {
18027
18028             # Just the value
18029             my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
18030
18031             # The hash may contain properties not in this release of Unicode
18032             next unless defined (my $property = property_ref($property_name));
18033
18034             # Find all combinations
18035             foreach my $prop_alias ($property->aliases) {
18036                 my $prop_alias_name = standardize($prop_alias->name);
18037
18038                 # If no =value, there's just one combination possible for this
18039                 if (! $value_name) {
18040
18041                     # The property may be suppressed, but there may be a proxy
18042                     # for it, so it shouldn't be listed as suppressed
18043                     next if $prop_alias->ucd;
18044                     push @suppressed, $prop_alias_name;
18045                 }
18046                 else {  # Otherwise
18047                     foreach my $value_alias
18048                                     ($property->table($value_name)->aliases)
18049                     {
18050                         next if $value_alias->ucd;
18051
18052                         push @suppressed, "$prop_alias_name="
18053                                         .  standardize($value_alias->name);
18054                     }
18055                 }
18056             }
18057         }
18058     }
18059     @suppressed = sort @suppressed; # So doesn't change between runs of this
18060                                     # program
18061
18062     # Convert the structure below (designed for Name.pm) to a form that UCD
18063     # wants, so it doesn't have to modify it at all; i.e. so that it includes
18064     # an element for the Hangul syllables in the appropriate place, and
18065     # otherwise changes the name to include the "-<code point>" suffix.
18066     my @algorithm_names;
18067     my $done_hangul = $v_version lt v2.0.0;  # Hanguls as we know them came
18068                                              # along in this version
18069     # Copy it linearly.
18070     for my $i (0 .. @code_points_ending_in_code_point - 1) {
18071
18072         # Insert the hanguls in the correct place.
18073         if (! $done_hangul
18074             && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
18075         {
18076             $done_hangul = 1;
18077             push @algorithm_names, { low => $SBase,
18078                                      high => $SBase + $SCount - 1,
18079                                      name => '<hangul syllable>',
18080                                     };
18081         }
18082
18083         # Copy the current entry, modified.
18084         push @algorithm_names, {
18085             low => $code_points_ending_in_code_point[$i]->{'low'},
18086             high => $code_points_ending_in_code_point[$i]->{'high'},
18087             name =>
18088                "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
18089         };
18090     }
18091
18092     # Serialize these structures for output.
18093     my $loose_to_standard_value
18094                           = simple_dumper(\%loose_to_standard_value, ' ' x 4);
18095     chomp $loose_to_standard_value;
18096
18097     my $string_property_loose_to_name
18098                     = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
18099     chomp $string_property_loose_to_name;
18100
18101     my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
18102     chomp $perlprop_to_aliases;
18103
18104     my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
18105     chomp $prop_aliases;
18106
18107     my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
18108     chomp $prop_value_aliases;
18109
18110     my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
18111     chomp $suppressed;
18112
18113     my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
18114     chomp $algorithm_names;
18115
18116     my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
18117     chomp $ambiguous_names;
18118
18119     my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
18120     chomp $combination_property;
18121
18122     my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
18123     chomp $loose_defaults;
18124
18125     my @ucd = <<END;
18126 $HEADER
18127 $INTERNAL_ONLY_HEADER
18128
18129 # This file is for the use of Unicode::UCD
18130
18131 # Highest legal Unicode code point
18132 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
18133
18134 # Hangul syllables
18135 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
18136 \$Unicode::UCD::HANGUL_COUNT = $SCount;
18137
18138 # Keys are all the possible "prop=value" combinations, in loose form; values
18139 # are the standard loose name for the 'value' part of the key
18140 \%Unicode::UCD::loose_to_standard_value = (
18141 $loose_to_standard_value
18142 );
18143
18144 # String property loose names to standard loose name
18145 \%Unicode::UCD::string_property_loose_to_name = (
18146 $string_property_loose_to_name
18147 );
18148
18149 # Keys are Perl extensions in loose form; values are each one's list of
18150 # aliases
18151 \%Unicode::UCD::loose_perlprop_to_name = (
18152 $perlprop_to_aliases
18153 );
18154
18155 # Keys are standard property name; values are each one's aliases
18156 \%Unicode::UCD::prop_aliases = (
18157 $prop_aliases
18158 );
18159
18160 # Keys of top level are standard property name; values are keys to another
18161 # hash,  Each one is one of the property's values, in standard form.  The
18162 # values are that prop-val's aliases.  If only one specified, the short and
18163 # long alias are identical.
18164 \%Unicode::UCD::prop_value_aliases = (
18165 $prop_value_aliases
18166 );
18167
18168 # Ordered (by code point ordinal) list of the ranges of code points whose
18169 # names are algorithmically determined.  Each range entry is an anonymous hash
18170 # of the start and end points and a template for the names within it.
18171 \@Unicode::UCD::algorithmic_named_code_points = (
18172 $algorithm_names
18173 );
18174
18175 # The properties that as-is have two meanings, and which must be disambiguated
18176 \%Unicode::UCD::ambiguous_names = (
18177 $ambiguous_names
18178 );
18179
18180 # Keys are the prop-val combinations which are the default values for the
18181 # given property, expressed in standard loose form
18182 \%Unicode::UCD::loose_defaults = (
18183 $loose_defaults
18184 );
18185
18186 # The properties that are combinations, in that they have both a map table and
18187 # a match table.  This is actually for UCD.t, so it knows how to test for
18188 # these.
18189 \%Unicode::UCD::combination_property = (
18190 $combination_property
18191 );
18192
18193 # All combinations of names that are suppressed.
18194 # This is actually for UCD.t, so it knows which properties shouldn't have
18195 # entries.  If it got any bigger, would probably want to put it in its own
18196 # file to use memory only when it was needed, in testing.
18197 \@Unicode::UCD::suppressed_properties = (
18198 $suppressed
18199 );
18200
18201 1;
18202 END
18203
18204     main::write("UCD.pl", 0, \@ucd);  # The 0 means no utf8.
18205     return;
18206 }
18207
18208 sub write_all_tables() {
18209     # Write out all the tables generated by this program to files, as well as
18210     # the supporting data structures, pod file, and .t file.
18211
18212     my @writables;              # List of tables that actually get written
18213     my %match_tables_to_write;  # Used to collapse identical match tables
18214                                 # into one file.  Each key is a hash function
18215                                 # result to partition tables into buckets.
18216                                 # Each value is an array of the tables that
18217                                 # fit in the bucket.
18218
18219     # For each property ...
18220     # (sort so that if there is an immutable file name, it has precedence, so
18221     # some other property can't come in and take over its file name.  (We
18222     # don't care if both defined, as they had better be different anyway.)
18223     # The property named 'Perl' needs to be first (it doesn't have any
18224     # immutable file name) because empty properties are defined in terms of
18225     # its table named 'All' under the -annotate option.)   We also sort by
18226     # the property's name.  This is just for repeatability of the outputs
18227     # between runs of this program, but does not affect correctness.
18228     PROPERTY:
18229     foreach my $property ($perl,
18230                           sort { return -1 if defined $a->file;
18231                                  return 1 if defined $b->file;
18232                                  return $a->name cmp $b->name;
18233                                 } grep { $_ != $perl } property_ref('*'))
18234     {
18235         my $type = $property->type;
18236
18237         # And for each table for that property, starting with the mapping
18238         # table for it ...
18239         TABLE:
18240         foreach my $table($property,
18241
18242                         # and all the match tables for it (if any), sorted so
18243                         # the ones with the shortest associated file name come
18244                         # first.  The length sorting prevents problems of a
18245                         # longer file taking a name that might have to be used
18246                         # by a shorter one.  The alphabetic sorting prevents
18247                         # differences between releases
18248                         sort {  my $ext_a = $a->external_name;
18249                                 return 1 if ! defined $ext_a;
18250                                 my $ext_b = $b->external_name;
18251                                 return -1 if ! defined $ext_b;
18252
18253                                 # But return the non-complement table before
18254                                 # the complement one, as the latter is defined
18255                                 # in terms of the former, and needs to have
18256                                 # the information for the former available.
18257                                 return 1 if $a->complement != 0;
18258                                 return -1 if $b->complement != 0;
18259
18260                                 # Similarly, return a subservient table after
18261                                 # a leader
18262                                 return 1 if $a->leader != $a;
18263                                 return -1 if $b->leader != $b;
18264
18265                                 my $cmp = length $ext_a <=> length $ext_b;
18266
18267                                 # Return result if lengths not equal
18268                                 return $cmp if $cmp;
18269
18270                                 # Alphabetic if lengths equal
18271                                 return $ext_a cmp $ext_b
18272                         } $property->tables
18273                     )
18274         {
18275
18276             # Here we have a table associated with a property.  It could be
18277             # the map table (done first for each property), or one of the
18278             # other tables.  Determine which type.
18279             my $is_property = $table->isa('Property');
18280
18281             my $name = $table->name;
18282             my $complete_name = $table->complete_name;
18283
18284             # See if should suppress the table if is empty, but warn if it
18285             # contains something.
18286             my $suppress_if_empty_warn_if_not
18287                     = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18288
18289             # Calculate if this table should have any code points associated
18290             # with it or not.
18291             my $expected_empty =
18292
18293                 # $perl should be empty
18294                 ($is_property && ($table == $perl))
18295
18296                 # Match tables in properties we skipped populating should be
18297                 # empty
18298                 || (! $is_property && ! $property->to_create_match_tables)
18299
18300                 # Tables and properties that are expected to have no code
18301                 # points should be empty
18302                 || $suppress_if_empty_warn_if_not
18303             ;
18304
18305             # Set a boolean if this table is the complement of an empty binary
18306             # table
18307             my $is_complement_of_empty_binary =
18308                 $type == $BINARY &&
18309                 (($table == $property->table('Y')
18310                     && $property->table('N')->is_empty)
18311                 || ($table == $property->table('N')
18312                     && $property->table('Y')->is_empty));
18313
18314             if ($table->is_empty) {
18315
18316                 if ($suppress_if_empty_warn_if_not) {
18317                     $table->set_fate($SUPPRESSED,
18318                                      $suppress_if_empty_warn_if_not);
18319                 }
18320
18321                 # Suppress (by skipping them) expected empty tables.
18322                 next TABLE if $expected_empty;
18323
18324                 # And setup to later output a warning for those that aren't
18325                 # known to be allowed to be empty.  Don't do the warning if
18326                 # this table is a child of another one to avoid duplicating
18327                 # the warning that should come from the parent one.
18328                 if (($table == $property || $table->parent == $table)
18329                     && $table->fate != $SUPPRESSED
18330                     && $table->fate != $MAP_PROXIED
18331                     && ! grep { $complete_name =~ /^$_$/ }
18332                                                     @tables_that_may_be_empty)
18333                 {
18334                     push @unhandled_properties, "$table";
18335                 }
18336
18337                 # The old way of expressing an empty match list was to
18338                 # complement the list that matches everything.  The new way is
18339                 # to create an empty inversion list, but this doesn't work for
18340                 # annotating, so use the old way then.
18341                 $table->set_complement($All) if $annotate
18342                                                 && $table != $property;
18343             }
18344             elsif ($expected_empty) {
18345                 my $because = "";
18346                 if ($suppress_if_empty_warn_if_not) {
18347                     $because = " because $suppress_if_empty_warn_if_not";
18348                 }
18349
18350                 Carp::my_carp("Not expecting property $table$because.  Generating file for it anyway.");
18351             }
18352
18353             # Some tables should match everything
18354             my $expected_full =
18355                 ($table->fate == $SUPPRESSED)
18356                 ? 0
18357                 : ($is_property)
18358                   ? # All these types of map tables will be full because
18359                     # they will have been populated with defaults
18360                     ($type == $ENUM)
18361
18362                   : # A match table should match everything if its method
18363                     # shows it should
18364                     ($table->matches_all
18365
18366                     # The complement of an empty binary table will match
18367                     # everything
18368                     || $is_complement_of_empty_binary
18369                     )
18370             ;
18371
18372             my $count = $table->count;
18373             if ($expected_full) {
18374                 if ($count != $MAX_WORKING_CODEPOINTS) {
18375                     Carp::my_carp("$table matches only "
18376                     . clarify_number($count)
18377                     . " Unicode code points but should match "
18378                     . clarify_number($MAX_WORKING_CODEPOINTS)
18379                     . " (off by "
18380                     .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18381                     . ").  Proceeding anyway.");
18382                 }
18383
18384                 # Here is expected to be full.  If it is because it is the
18385                 # complement of an (empty) binary table that is to be
18386                 # suppressed, then suppress this one as well.
18387                 if ($is_complement_of_empty_binary) {
18388                     my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18389                     my $opposing = $property->table($opposing_name);
18390                     my $opposing_status = $opposing->status;
18391                     if ($opposing_status) {
18392                         $table->set_status($opposing_status,
18393                                            $opposing->status_info);
18394                     }
18395                 }
18396             }
18397             elsif ($count == $MAX_UNICODE_CODEPOINTS
18398                    && $name ne "Any"
18399                    && ($table == $property || $table->leader == $table)
18400                    && $table->property->status ne $NORMAL)
18401             {
18402                     Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
18403             }
18404
18405             if ($table->fate >= $SUPPRESSED) {
18406                 if (! $is_property) {
18407                     my @children = $table->children;
18408                     foreach my $child (@children) {
18409                         if ($child->fate < $SUPPRESSED) {
18410                             Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18411                         }
18412                     }
18413                 }
18414                 next TABLE;
18415
18416             }
18417
18418             if (! $is_property) {
18419
18420                 make_ucd_table_pod_entries($table) if $table->property == $perl;
18421
18422                 # Several things need to be done just once for each related
18423                 # group of match tables.  Do them on the parent.
18424                 if ($table->parent == $table) {
18425
18426                     # Add an entry in the pod file for the table; it also does
18427                     # the children.
18428                     make_re_pod_entries($table) if defined $pod_directory;
18429
18430                     # See if the table matches identical code points with
18431                     # something that has already been processed and is ready
18432                     # for output.  In that case, no need to have two files
18433                     # with the same code points in them.  We use the table's
18434                     # hash() method to store these in buckets, so that it is
18435                     # quite likely that if two tables are in the same bucket
18436                     # they will be identical, so don't have to compare tables
18437                     # frequently.  The tables have to have the same status to
18438                     # share a file, so add this to the bucket hash.  (The
18439                     # reason for this latter is that Heavy.pl associates a
18440                     # status with a file.) We don't check tables that are
18441                     # inverses of others, as it would lead to some coding
18442                     # complications, and checking all the regular ones should
18443                     # find everything.
18444                     if ($table->complement == 0) {
18445                         my $hash = $table->hash . ';' . $table->status;
18446
18447                         # Look at each table that is in the same bucket as
18448                         # this one would be.
18449                         foreach my $comparison
18450                                             (@{$match_tables_to_write{$hash}})
18451                         {
18452                             # If the table doesn't point back to this one, we
18453                             # see if it matches identically
18454                             if (   $comparison->leader != $table
18455                                 && $table->matches_identically_to($comparison))
18456                             {
18457                                 $table->set_equivalent_to($comparison,
18458                                                                 Related => 0);
18459                                 next TABLE;
18460                             }
18461                         }
18462
18463                         # Here, not equivalent, add this table to the bucket.
18464                         push @{$match_tables_to_write{$hash}}, $table;
18465                     }
18466                 }
18467             }
18468             else {
18469
18470                 # Here is the property itself.
18471                 # Don't write out or make references to the $perl property
18472                 next if $table == $perl;
18473
18474                 make_ucd_table_pod_entries($table);
18475
18476                 # There is a mapping stored of the various synonyms to the
18477                 # standardized name of the property for utf8_heavy.pl.
18478                 # Also, the pod file contains entries of the form:
18479                 # \p{alias: *}         \p{full: *}
18480                 # rather than show every possible combination of things.
18481
18482                 my @property_aliases = $property->aliases;
18483
18484                 my $full_property_name = $property->full_name;
18485                 my $property_name = $property->name;
18486                 my $standard_property_name = standardize($property_name);
18487                 my $standard_property_full_name
18488                                         = standardize($full_property_name);
18489
18490                 # We also create for Unicode::UCD a list of aliases for
18491                 # the property.  The list starts with the property name;
18492                 # then its full name.  Legacy properties are not listed in
18493                 # Unicode::UCD.
18494                 my @property_list;
18495                 my @standard_list;
18496                 if ( $property->fate <= $MAP_PROXIED) {
18497                     @property_list = ($property_name, $full_property_name);
18498                     @standard_list = ($standard_property_name,
18499                                         $standard_property_full_name);
18500                 }
18501
18502                 # For each synonym ...
18503                 for my $i (0 .. @property_aliases - 1)  {
18504                     my $alias = $property_aliases[$i];
18505                     my $alias_name = $alias->name;
18506                     my $alias_standard = standardize($alias_name);
18507
18508
18509                     # Add other aliases to the list of property aliases
18510                     if ($property->fate <= $MAP_PROXIED
18511                         && ! grep { $alias_standard eq $_ } @standard_list)
18512                     {
18513                         push @property_list, $alias_name;
18514                         push @standard_list, $alias_standard;
18515                     }
18516
18517                     # For utf8_heavy, set the mapping of the alias to the
18518                     # property
18519                     if ($type == $STRING) {
18520                         if ($property->fate <= $MAP_PROXIED) {
18521                             $string_property_loose_to_name{$alias_standard}
18522                                             = $standard_property_name;
18523                         }
18524                     }
18525                     else {
18526                         my $hash_ref = ($alias_standard =~ /^_/)
18527                                        ? \%strict_property_name_of
18528                                        : \%loose_property_name_of;
18529                         if (exists $hash_ref->{$alias_standard}) {
18530                             Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}.  Old name is retained");
18531                         }
18532                         else {
18533                             $hash_ref->{$alias_standard}
18534                                                 = $standard_property_name;
18535                         }
18536
18537                         # Now for the re pod entry for this alias.  Skip if not
18538                         # outputting a pod; skip the first one, which is the
18539                         # full name so won't have an entry like: '\p{full: *}
18540                         # \p{full: *}', and skip if don't want an entry for
18541                         # this one.
18542                         next if $i == 0
18543                                 || ! defined $pod_directory
18544                                 || ! $alias->make_re_pod_entry;
18545
18546                         my $rhs = "\\p{$full_property_name: *}";
18547                         if ($property != $perl && $table->perl_extension) {
18548                             $rhs .= ' (Perl extension)';
18549                         }
18550                         push @match_properties,
18551                             format_pod_line($indent_info_column,
18552                                         '\p{' . $alias->name . ': *}',
18553                                         $rhs,
18554                                         $alias->status);
18555                     }
18556                 }
18557
18558                 # The list of all possible names is attached to each alias, so
18559                 # lookup is easy
18560                 if (@property_list) {
18561                     push @{$prop_aliases{$standard_list[0]}}, @property_list;
18562                 }
18563
18564                 if ($property->fate <= $MAP_PROXIED) {
18565
18566                     # Similarly, we create for Unicode::UCD a list of
18567                     # property-value aliases.
18568
18569                     # Look at each table in the property...
18570                     foreach my $table ($property->tables) {
18571                         my @values_list;
18572                         my $table_full_name = $table->full_name;
18573                         my $standard_table_full_name
18574                                               = standardize($table_full_name);
18575                         my $table_name = $table->name;
18576                         my $standard_table_name = standardize($table_name);
18577
18578                         # The list starts with the table name and its full
18579                         # name.
18580                         push @values_list, $table_name, $table_full_name;
18581
18582                         # We add to the table each unique alias that isn't
18583                         # discouraged from use.
18584                         foreach my $alias ($table->aliases) {
18585                             next if $alias->status
18586                                  && $alias->status eq $DISCOURAGED;
18587                             my $name = $alias->name;
18588                             my $standard = standardize($name);
18589                             next if $standard eq $standard_table_name;
18590                             next if $standard eq $standard_table_full_name;
18591                             push @values_list, $name;
18592                         }
18593
18594                         # Here @values_list is a list of all the aliases for
18595                         # the table.  That is, all the property-values given
18596                         # by this table.  By agreement with Unicode::UCD,
18597                         # if the name and full name are identical, and there
18598                         # are no other names, drop the duplicate entry to save
18599                         # memory.
18600                         if (@values_list == 2
18601                             && $values_list[0] eq $values_list[1])
18602                         {
18603                             pop @values_list
18604                         }
18605
18606                         # To save memory, unlike the similar list for property
18607                         # aliases above, only the standard forms have the list.
18608                         # This forces an extra step of converting from input
18609                         # name to standard name, but the savings are
18610                         # considerable.  (There is only marginal savings if we
18611                         # did this with the property aliases.)
18612                         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18613                     }
18614                 }
18615
18616                 # Don't write out a mapping file if not desired.
18617                 next if ! $property->to_output_map;
18618             }
18619
18620             # Here, we know we want to write out the table, but don't do it
18621             # yet because there may be other tables that come along and will
18622             # want to share the file, and the file's comments will change to
18623             # mention them.  So save for later.
18624             push @writables, $table;
18625
18626         } # End of looping through the property and all its tables.
18627     } # End of looping through all properties.
18628
18629     # Now have all the tables that will have files written for them.  Do it.
18630     foreach my $table (@writables) {
18631         my @directory;
18632         my $filename;
18633         my $property = $table->property;
18634         my $is_property = ($table == $property);
18635
18636         # For very short tables, instead of writing them out to actual files,
18637         # we in-line their inversion list definitions into Heavy.pl.  The
18638         # definition replaces the file name, and the special pseudo-directory
18639         # '#' is used to signal this.  This significantly cuts down the number
18640         # of files written at little extra cost to the hashes in Heavy.pl.
18641         # And it means, no run-time files to read to get the definitions.
18642         if (! $is_property
18643             && ! $annotate  # For annotation, we want to explicitly show
18644                             # everything, so keep in files
18645             && $table->ranges <= 3)
18646         {
18647             my @ranges = $table->ranges;
18648             my $count = @ranges;
18649             if ($count == 0) {  # 0th index reserved for 0-length lists
18650                 $filename = 0;
18651             }
18652             elsif ($table->leader != $table) {
18653
18654                 # Here, is a table that is equivalent to another; code
18655                 # in register_file_for_name() causes its leader's definition
18656                 # to be used
18657
18658                 next;
18659             }
18660             else {  # No equivalent table so far.
18661
18662                 # Build up its definition range-by-range.
18663                 my $definition = "";
18664                 while (defined (my $range = shift @ranges)) {
18665                     my $end = $range->end;
18666                     if ($end < $MAX_WORKING_CODEPOINT) {
18667                         $count++;
18668                         $end = "\n" . ($end + 1);
18669                     }
18670                     else {  # Extends to infinity, hence no 'end'
18671                         $end = "";
18672                     }
18673                     $definition .= "\n" . $range->start . $end;
18674                 }
18675                 $definition = "V$count" . $definition;
18676                 $filename = @inline_definitions;
18677                 push @inline_definitions, $definition;
18678             }
18679             @directory = "#";
18680             register_file_for_name($table, \@directory, $filename);
18681             next;
18682         }
18683
18684         if (! $is_property) {
18685             # Match tables for the property go in lib/$subdirectory, which is
18686             # the property's name.  Don't use the standard file name for this,
18687             # as may get an unfamiliar alias
18688             @directory = ($matches_directory, $property->external_name);
18689         }
18690         else {
18691
18692             @directory = $table->directory;
18693             $filename = $table->file;
18694         }
18695
18696         # Use specified filename if available, or default to property's
18697         # shortest name.  We need an 8.3 safe filename (which means "an 8
18698         # safe" filename, since after the dot is only 'pl', which is < 3)
18699         # The 2nd parameter is if the filename shouldn't be changed, and
18700         # it shouldn't iff there is a hard-coded name for this table.
18701         $filename = construct_filename(
18702                                 $filename || $table->external_name,
18703                                 ! $filename,    # mutable if no filename
18704                                 \@directory);
18705
18706         register_file_for_name($table, \@directory, $filename);
18707
18708         # Only need to write one file when shared by more than one
18709         # property
18710         next if ! $is_property
18711                 && ($table->leader != $table || $table->complement != 0);
18712
18713         # Construct a nice comment to add to the file
18714         $table->set_final_comment;
18715
18716         $table->write;
18717     }
18718
18719
18720     # Write out the pod file
18721     make_pod;
18722
18723     # And Heavy.pl, Name.pm, UCD.pl
18724     make_Heavy;
18725     make_Name_pm;
18726     make_UCD;
18727
18728     make_property_test_script() if $make_test_script;
18729     make_normalization_test_script() if $make_norm_test_script;
18730     return;
18731 }
18732
18733 my @white_space_separators = ( # This used only for making the test script.
18734                             "",
18735                             ' ',
18736                             "\t",
18737                             '   '
18738                         );
18739
18740 sub generate_separator($) {
18741     # This used only for making the test script.  It generates the colon or
18742     # equal separator between the property and property value, with random
18743     # white space surrounding the separator
18744
18745     my $lhs = shift;
18746
18747     return "" if $lhs eq "";  # No separator if there's only one (the r) side
18748
18749     # Choose space before and after randomly
18750     my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18751     my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18752
18753     # And return the whole complex, half the time using a colon, half the
18754     # equals
18755     return $spaces_before
18756             . (rand() < 0.5) ? '=' : ':'
18757             . $spaces_after;
18758 }
18759
18760 sub generate_tests($$$$$) {
18761     # This used only for making the test script.  It generates test cases that
18762     # are expected to compile successfully in perl.  Note that the LHS and
18763     # RHS are assumed to already be as randomized as the caller wants.
18764
18765     my $lhs = shift;           # The property: what's to the left of the colon
18766                                #  or equals separator
18767     my $rhs = shift;           # The property value; what's to the right
18768     my $valid_code = shift;    # A code point that's known to be in the
18769                                # table given by LHS=RHS; undef if table is
18770                                # empty
18771     my $invalid_code = shift;  # A code point known to not be in the table;
18772                                # undef if the table is all code points
18773     my $warning = shift;
18774
18775     # Get the colon or equal
18776     my $separator = generate_separator($lhs);
18777
18778     # The whole 'property=value'
18779     my $name = "$lhs$separator$rhs";
18780
18781     my @output;
18782     # Create a complete set of tests, with complements.
18783     if (defined $valid_code) {
18784         push @output, <<"EOC"
18785 Expect(1, $valid_code, '\\p{$name}', $warning);
18786 Expect(0, $valid_code, '\\p{^$name}', $warning);
18787 Expect(0, $valid_code, '\\P{$name}', $warning);
18788 Expect(1, $valid_code, '\\P{^$name}', $warning);
18789 EOC
18790     }
18791     if (defined $invalid_code) {
18792         push @output, <<"EOC"
18793 Expect(0, $invalid_code, '\\p{$name}', $warning);
18794 Expect(1, $invalid_code, '\\p{^$name}', $warning);
18795 Expect(1, $invalid_code, '\\P{$name}', $warning);
18796 Expect(0, $invalid_code, '\\P{^$name}', $warning);
18797 EOC
18798     }
18799     return @output;
18800 }
18801
18802 sub generate_error($$$) {
18803     # This used only for making the test script.  It generates test cases that
18804     # are expected to not only not match, but to be syntax or similar errors
18805
18806     my $lhs = shift;                # The property: what's to the left of the
18807                                     # colon or equals separator
18808     my $rhs = shift;                # The property value; what's to the right
18809     my $already_in_error = shift;   # Boolean; if true it's known that the
18810                                 # unmodified LHS and RHS will cause an error.
18811                                 # This routine should not force another one
18812     # Get the colon or equal
18813     my $separator = generate_separator($lhs);
18814
18815     # Since this is an error only, don't bother to randomly decide whether to
18816     # put the error on the left or right side; and assume that the RHS is
18817     # loosely matched, again for convenience rather than rigor.
18818     $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18819
18820     my $property = $lhs . $separator . $rhs;
18821
18822     return <<"EOC";
18823 Error('\\p{$property}');
18824 Error('\\P{$property}');
18825 EOC
18826 }
18827
18828 # These are used only for making the test script
18829 # XXX Maybe should also have a bad strict seps, which includes underscore.
18830
18831 my @good_loose_seps = (
18832             " ",
18833             "-",
18834             "\t",
18835             "",
18836             "_",
18837            );
18838 my @bad_loose_seps = (
18839            "/a/",
18840            ':=',
18841           );
18842
18843 sub randomize_stricter_name {
18844     # This used only for making the test script.  Take the input name and
18845     # return a randomized, but valid version of it under the stricter matching
18846     # rules.
18847
18848     my $name = shift;
18849     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18850
18851     # If the name looks like a number (integer, floating, or rational), do
18852     # some extra work
18853     if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18854         my $sign = $1;
18855         my $number = $2;
18856         my $separator = $3;
18857
18858         # If there isn't a sign, part of the time add a plus
18859         # Note: Not testing having any denominator having a minus sign
18860         if (! $sign) {
18861             $sign = '+' if rand() <= .3;
18862         }
18863
18864         # And add 0 or more leading zeros.
18865         $name = $sign . ('0' x int rand(10)) . $number;
18866
18867         if (defined $separator) {
18868             my $extra_zeros = '0' x int rand(10);
18869
18870             if ($separator eq '.') {
18871
18872                 # Similarly, add 0 or more trailing zeros after a decimal
18873                 # point
18874                 $name .= $extra_zeros;
18875             }
18876             else {
18877
18878                 # Or, leading zeros before the denominator
18879                 $name =~ s,/,/$extra_zeros,;
18880             }
18881         }
18882     }
18883
18884     # For legibility of the test, only change the case of whole sections at a
18885     # time.  To do this, first split into sections.  The split returns the
18886     # delimiters
18887     my @sections;
18888     for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18889         trace $section if main::DEBUG && $to_trace;
18890
18891         if (length $section > 1 && $section !~ /\D/) {
18892
18893             # If the section is a sequence of digits, about half the time
18894             # randomly add underscores between some of them.
18895             if (rand() > .5) {
18896
18897                 # Figure out how many underscores to add.  max is 1 less than
18898                 # the number of digits.  (But add 1 at the end to make sure
18899                 # result isn't 0, and compensate earlier by subtracting 2
18900                 # instead of 1)
18901                 my $num_underscores = int rand(length($section) - 2) + 1;
18902
18903                 # And add them evenly throughout, for convenience, not rigor
18904                 use integer;
18905                 my $spacing = (length($section) - 1)/ $num_underscores;
18906                 my $temp = $section;
18907                 $section = "";
18908                 for my $i (1 .. $num_underscores) {
18909                     $section .= substr($temp, 0, $spacing, "") . '_';
18910                 }
18911                 $section .= $temp;
18912             }
18913             push @sections, $section;
18914         }
18915         else {
18916
18917             # Here not a sequence of digits.  Change the case of the section
18918             # randomly
18919             my $switch = int rand(4);
18920             if ($switch == 0) {
18921                 push @sections, uc $section;
18922             }
18923             elsif ($switch == 1) {
18924                 push @sections, lc $section;
18925             }
18926             elsif ($switch == 2) {
18927                 push @sections, ucfirst $section;
18928             }
18929             else {
18930                 push @sections, $section;
18931             }
18932         }
18933     }
18934     trace "returning", join "", @sections if main::DEBUG && $to_trace;
18935     return join "", @sections;
18936 }
18937
18938 sub randomize_loose_name($;$) {
18939     # This used only for making the test script
18940
18941     my $name = shift;
18942     my $want_error = shift;  # if true, make an error
18943     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18944
18945     $name = randomize_stricter_name($name);
18946
18947     my @parts;
18948     push @parts, $good_loose_seps[rand(@good_loose_seps)];
18949
18950     # Preserve trailing ones for the sake of not stripping the underscore from
18951     # 'L_'
18952     for my $part (split /[-\s_]+ (?= . )/, $name) {
18953         if (@parts) {
18954             if ($want_error and rand() < 0.3) {
18955                 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18956                 $want_error = 0;
18957             }
18958             else {
18959                 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18960             }
18961         }
18962         push @parts, $part;
18963     }
18964     my $new = join("", @parts);
18965     trace "$name => $new" if main::DEBUG && $to_trace;
18966
18967     if ($want_error) {
18968         if (rand() >= 0.5) {
18969             $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18970         }
18971         else {
18972             $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18973         }
18974     }
18975     return $new;
18976 }
18977
18978 # Used to make sure don't generate duplicate test cases.
18979 my %test_generated;
18980
18981 sub make_property_test_script() {
18982     # This used only for making the test script
18983     # this written directly -- it's huge.
18984
18985     print "Making test script\n" if $verbosity >= $PROGRESS;
18986
18987     # This uses randomness to test different possibilities without testing all
18988     # possibilities.  To ensure repeatability, set the seed to 0.  But if
18989     # tests are added, it will perturb all later ones in the .t file
18990     srand 0;
18991
18992     $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18993
18994     # Create a list of what the %f representation is for each rational number.
18995     # This will be used below.
18996     my @valid_base_floats = '0.0';
18997     foreach my $e_representation (keys %nv_floating_to_rational) {
18998         push @valid_base_floats,
18999                             eval $nv_floating_to_rational{$e_representation};
19000     }
19001
19002     # It doesn't matter whether the elements of this array contain single lines
19003     # or multiple lines. main::write doesn't count the lines.
19004     my @output;
19005
19006     push @output, <<'EOF_CODE';
19007 Error('\p{Script=InGreek}');    # Bug #69018
19008 Test_GCB("1100 $nobreak 1161");  # Bug #70940
19009 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
19010 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
19011 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
19012
19013 # Make sure this gets tested; it was not part of the official test suite at
19014 # the time this was added.  Note that this is as it would appear in the
19015 # official suite, and gets modified to check for the perl tailoring by
19016 # Test_WB()
19017 Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
19018 Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
19019 EOF_CODE
19020
19021     # Sort these so get results in same order on different runs of this
19022     # program
19023     foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
19024                                     or
19025                                  lc $a->name cmp lc $b->name
19026                                } property_ref('*'))
19027     {
19028         # Non-binary properties should not match \p{};  Test all for that.
19029         if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
19030             my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
19031                                                             $property->aliases;
19032             foreach my $property_alias ($property->aliases) {
19033                 my $name = standardize($property_alias->name);
19034
19035                 # But some names are ambiguous, meaning a binary property with
19036                 # the same name when used in \p{}, and a different
19037                 # (non-binary) property in other contexts.
19038                 next if grep { $name eq $_ } keys %ambiguous_names;
19039
19040                 push @output, <<"EOF_CODE";
19041 Error('\\p{$name}');
19042 Error('\\P{$name}');
19043 EOF_CODE
19044             }
19045         }
19046         foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
19047                                     or
19048                                   lc $a->name cmp lc $b->name
19049                                 } $property->tables)
19050         {
19051
19052             # Find code points that match, and don't match this table.
19053             my $valid = $table->get_valid_code_point;
19054             my $invalid = $table->get_invalid_code_point;
19055             my $warning = ($table->status eq $DEPRECATED)
19056                             ? "'deprecated'"
19057                             : '""';
19058
19059             # Test each possible combination of the property's aliases with
19060             # the table's.  If this gets to be too many, could do what is done
19061             # in the set_final_comment() for Tables
19062             my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
19063             next unless @table_aliases;
19064             my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
19065             next unless @property_aliases;
19066
19067             # Every property can be optionally be prefixed by 'Is_', so test
19068             # that those work, by creating such a new alias for each
19069             # pre-existing one.
19070             push @property_aliases, map { Alias->new("Is_" . $_->name,
19071                                                     $_->loose_match,
19072                                                     $_->make_re_pod_entry,
19073                                                     $_->ok_as_filename,
19074                                                     $_->status,
19075                                                     $_->ucd,
19076                                                     )
19077                                          } @property_aliases;
19078             my $max = max(scalar @table_aliases, scalar @property_aliases);
19079             for my $j (0 .. $max - 1) {
19080
19081                 # The current alias for property is the next one on the list,
19082                 # or if beyond the end, start over.  Similarly for table
19083                 my $property_name
19084                             = $property_aliases[$j % @property_aliases]->name;
19085
19086                 $property_name = "" if $table->property == $perl;
19087                 my $table_alias = $table_aliases[$j % @table_aliases];
19088                 my $table_name = $table_alias->name;
19089                 my $loose_match = $table_alias->loose_match;
19090
19091                 # If the table doesn't have a file, any test for it is
19092                 # already guaranteed to be in error
19093                 my $already_error = ! $table->file_path;
19094
19095                 # Generate error cases for this alias.
19096                 push @output, generate_error($property_name,
19097                                              $table_name,
19098                                              $already_error);
19099
19100                 # If the table is guaranteed to always generate an error,
19101                 # quit now without generating success cases.
19102                 next if $already_error;
19103
19104                 # Now for the success cases.
19105                 my $random;
19106                 if ($loose_match) {
19107
19108                     # For loose matching, create an extra test case for the
19109                     # standard name.
19110                     my $standard = standardize($table_name);
19111
19112                     # $test_name should be a unique combination for each test
19113                     # case; used just to avoid duplicate tests
19114                     my $test_name = "$property_name=$standard";
19115
19116                     # Don't output duplicate test cases.
19117                     if (! exists $test_generated{$test_name}) {
19118                         $test_generated{$test_name} = 1;
19119                         push @output, generate_tests($property_name,
19120                                                      $standard,
19121                                                      $valid,
19122                                                      $invalid,
19123                                                      $warning,
19124                                                  );
19125                     }
19126                     $random = randomize_loose_name($table_name)
19127                 }
19128                 else { # Stricter match
19129                     $random = randomize_stricter_name($table_name);
19130                 }
19131
19132                 # Now for the main test case for this alias.
19133                 my $test_name = "$property_name=$random";
19134                 if (! exists $test_generated{$test_name}) {
19135                     $test_generated{$test_name} = 1;
19136                     push @output, generate_tests($property_name,
19137                                                  $random,
19138                                                  $valid,
19139                                                  $invalid,
19140                                                  $warning,
19141                                              );
19142
19143                     if ($property->name eq 'nv') {
19144                         if ($table_name !~ qr{/}) {
19145                             push @output, generate_tests($property_name,
19146                                                 sprintf("%.15e", $table_name),
19147                                                 $valid,
19148                                                 $invalid,
19149                                                 $warning,
19150                                             );
19151                     }
19152                     else {
19153                     # If the name is a rational number, add tests for a
19154                     # non-reduced form, and for a floating point equivalent.
19155
19156                         # 60 is a number divisible by a bunch of things
19157                         my ($numerator, $denominator) = $table_name
19158                                                         =~ m! (.+) / (.+) !x;
19159                         $numerator *= 60;
19160                         $denominator *= 60;
19161                         push @output, generate_tests($property_name,
19162                                                     "$numerator/$denominator",
19163                                                     $valid,
19164                                                     $invalid,
19165                                                     $warning,
19166                                     );
19167
19168                         # Calculate the float, and the %e representation
19169                         my $float = eval $table_name;
19170                         my $e_representation = sprintf("%.*e",
19171                                                 $E_FLOAT_PRECISION, $float);
19172                         # Parse that
19173                         my ($non_zeros, $zeros, $exponent_sign, $exponent)
19174                            = $e_representation
19175                                =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
19176                         my $min_e_precision;
19177                         my $min_f_precision;
19178
19179                         if ($exponent_sign eq '+' && $exponent != 0) {
19180                             Carp::my_carp_bug("Not yet equipped to handle"
19181                                             . " positive exponents");
19182                             return;
19183                         }
19184                         else {
19185                             # We're trying to find the minimum precision that
19186                             # is needed to indicate this particular rational
19187                             # for the given $E_FLOAT_PRECISION.  For %e, any
19188                             # trailing zeros, like 1.500e-02 aren't needed, so
19189                             # the correct value is how many non-trailing zeros
19190                             # there are after the decimal point.
19191                             $min_e_precision = length $non_zeros;
19192
19193                             # For %f, like .01500, we want at least
19194                             # $E_FLOAT_PRECISION digits, but any trailing
19195                             # zeros aren't needed, so we can subtract the
19196                             # length of those.  But we also need to include
19197                             # the zeros after the decimal point, but before
19198                             # the first significant digit.
19199                             $min_f_precision = $E_FLOAT_PRECISION
19200                                              + $exponent
19201                                              - length $zeros;
19202                         }
19203
19204                         # Make tests for each possible precision from 1 to
19205                         # just past the worst case.
19206                         my $upper_limit = ($min_e_precision > $min_f_precision)
19207                                            ? $min_e_precision
19208                                            : $min_f_precision;
19209
19210                         for my $i (1 .. $upper_limit + 1) {
19211                             for my $format ("e", "f") {
19212                                 my $this_table
19213                                           = sprintf("%.*$format", $i, $float);
19214
19215                                 # If we don't have enough precision digits,
19216                                 # make a fail test; otherwise a pass test.
19217                                 my $pass = ($format eq "e")
19218                                             ? $i >= $min_e_precision
19219                                             : $i >= $min_f_precision;
19220                                 if ($pass) {
19221                                     push @output, generate_tests($property_name,
19222                                                                 $this_table,
19223                                                                 $valid,
19224                                                                 $invalid,
19225                                                                 $warning,
19226                                                 );
19227                                 }
19228                                 elsif (   $format eq "e"
19229
19230                                           # Here we would fail, but in the %f
19231                                           # case, the representation at this
19232                                           # precision could actually be a
19233                                           # valid one for some other rational
19234                                        || ! grep { $_ eq $this_table }
19235                                                             @valid_base_floats)
19236                                 {
19237                                     push @output,
19238                                         generate_error($property_name,
19239                                                        $this_table,
19240                                                        1   # 1 => already an
19241                                                            # error
19242                                                 );
19243                                 }
19244                             }
19245                         }
19246                     }
19247                     }
19248                 }
19249             }
19250             $table->DESTROY();
19251         }
19252         $property->DESTROY();
19253     }
19254
19255     # Make any test of the boundary (break) properties TODO if the code
19256     # doesn't match the version being compiled
19257     my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19258                              ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19259                              : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19260
19261     @output= map {
19262         map s/^/    /mgr,
19263         map "$_;\n",
19264         split /;\n/, $_
19265     } @output;
19266
19267     # Cause there to be 'if' statements to only execute a portion of this
19268     # long-running test each time, so that we can have a bunch of .t's running
19269     # in parallel
19270     my $chunks = 10     # Number of test files
19271                - 1      # For GCB & SB
19272                - 1      # For WB
19273                - 4;     # LB split into this many files
19274     my @output_chunked;
19275     my $chunk_count=0;
19276     my $chunk_size= int(@output / $chunks) + 1;
19277     while (@output) {
19278         $chunk_count++;
19279         my @chunk= splice @output, 0, $chunk_size;
19280         push @output_chunked,
19281             "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19282                 @chunk,
19283             "}\n";
19284     }
19285
19286     $chunk_count++;
19287     push @output_chunked,
19288         "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19289             (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
19290             (map {"    Test_SB('$_');\n"} @SB_tests),
19291         "}\n";
19292
19293
19294     $chunk_size= int(@LB_tests / 4) + 1;
19295     @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
19296     while (@LB_tests) {
19297         $chunk_count++;
19298         my @chunk= splice @LB_tests, 0, $chunk_size;
19299         push @output_chunked,
19300             "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19301                 @chunk,
19302             "}\n";
19303     }
19304
19305     $chunk_count++;
19306     push @output_chunked,
19307         "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19308             (map {"    Test_WB('$_');\n"} @WB_tests),
19309         "}\n";
19310
19311     &write($t_path,
19312            0,           # Not utf8;
19313            [$HEADER,
19314             $TODO_FAILING_BREAKS,
19315             <DATA>,
19316             @output_chunked,
19317             "Finished();\n",
19318            ]);
19319
19320     return;
19321 }
19322
19323 sub make_normalization_test_script() {
19324     print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19325
19326     my $n_path = 'TestNorm.pl';
19327
19328     unshift @normalization_tests, <<'END';
19329 use utf8;
19330 use Test::More;
19331
19332 sub ord_string {    # Convert packed ords to printable string
19333     use charnames ();
19334     return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19335                                                 unpack "U*", shift) .  "'";
19336     #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
19337 }
19338
19339 sub Test_N {
19340     my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19341     my $display_source = ord_string($source);
19342     my $display_nfc = ord_string($nfc);
19343     my $display_nfd = ord_string($nfd);
19344     my $display_nfkc = ord_string($nfkc);
19345     my $display_nfkd = ord_string($nfkd);
19346
19347     use Unicode::Normalize;
19348     #    NFC
19349     #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
19350     #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
19351     #
19352     #    NFD
19353     #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
19354     #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
19355     #
19356     #    NFKC
19357     #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19358     #      toNFKC(nfkc) == toNFKC(nfkd)
19359     #
19360     #    NFKD
19361     #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19362     #      toNFKD(nfkc) == toNFKD(nfkd)
19363
19364     is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19365     is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19366     is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19367     is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19368     is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19369
19370     is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19371     is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19372     is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19373     is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19374     is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19375
19376     is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19377     is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19378     is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19379     is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19380     is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19381
19382     is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19383     is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19384     is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19385     is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19386     is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19387 }
19388 END
19389
19390     &write($n_path,
19391            1,           # Is utf8;
19392            [
19393             @normalization_tests,
19394             'done_testing();'
19395             ]);
19396     return;
19397 }
19398
19399 # Skip reasons, so will be exact same text and hence the files with each
19400 # reason will get grouped together in perluniprops.
19401 my $Documentation = "Documentation";
19402 my $Indic_Skip
19403             = "Provisional; for the analysis and processing of Indic scripts";
19404 my $Validation = "Validation Tests";
19405 my $Validation_Documentation = "Documentation of validation Tests";
19406
19407 # This is a list of the input files and how to handle them.  The files are
19408 # processed in their order in this list.  Some reordering is possible if
19409 # desired, but the PropertyAliases and PropValueAliases files should be first,
19410 # and the extracted before the others (as data in an extracted file can be
19411 # over-ridden by the non-extracted.  Some other files depend on data derived
19412 # from an earlier file, like UnicodeData requires data from Jamo, and the case
19413 # changing and folding requires data from Unicode.  Mostly, it is safest to
19414 # order by first version releases in (except the Jamo).
19415 #
19416 # The version strings allow the program to know whether to expect a file or
19417 # not, but if a file exists in the directory, it will be processed, even if it
19418 # is in a version earlier than expected, so you can copy files from a later
19419 # release into an earlier release's directory.
19420 my @input_file_objects = (
19421     Input_file->new('PropertyAliases.txt', v3.2,
19422                     Handler => \&process_PropertyAliases,
19423                     Early => [ \&substitute_PropertyAliases ],
19424                     Required_Even_in_Debug_Skip => 1,
19425                    ),
19426     Input_file->new(undef, v0,  # No file associated with this
19427                     Progress_Message => 'Finishing property setup',
19428                     Handler => \&finish_property_setup,
19429                    ),
19430     Input_file->new('PropValueAliases.txt', v3.2,
19431                      Handler => \&process_PropValueAliases,
19432                      Early => [ \&substitute_PropValueAliases ],
19433                      Has_Missings_Defaults => $NOT_IGNORED,
19434                      Required_Even_in_Debug_Skip => 1,
19435                     ),
19436     Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19437                     Property => 'General_Category',
19438                    ),
19439     Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19440                     Property => 'Canonical_Combining_Class',
19441                     Has_Missings_Defaults => $NOT_IGNORED,
19442                    ),
19443     Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19444                     Property => 'Numeric_Type',
19445                     Has_Missings_Defaults => $NOT_IGNORED,
19446                    ),
19447     Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19448                     Property => 'East_Asian_Width',
19449                     Has_Missings_Defaults => $NOT_IGNORED,
19450                    ),
19451     Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19452                     Property => 'Line_Break',
19453                     Has_Missings_Defaults => $NOT_IGNORED,
19454                    ),
19455     Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19456                     Property => 'Bidi_Class',
19457                     Has_Missings_Defaults => $NOT_IGNORED,
19458                    ),
19459     Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19460                     Property => 'Decomposition_Type',
19461                     Has_Missings_Defaults => $NOT_IGNORED,
19462                    ),
19463     Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19464     Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19465                     Property => 'Numeric_Value',
19466                     Each_Line_Handler => \&filter_numeric_value_line,
19467                     Has_Missings_Defaults => $NOT_IGNORED,
19468                    ),
19469     Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19470                     Property => 'Joining_Group',
19471                     Has_Missings_Defaults => $NOT_IGNORED,
19472                    ),
19473
19474     Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19475                     Property => 'Joining_Type',
19476                     Has_Missings_Defaults => $NOT_IGNORED,
19477                    ),
19478     Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19479                     Skip => 'This file adds no new information not already'
19480                           . ' present in other files',
19481                     # And it's unnecessary programmer work to handle this new
19482                     # format.  Previous Derived files actually had bug fixes
19483                     # in them that were useful, but that should not be the
19484                     # case here.
19485                    ),
19486     Input_file->new('Jamo.txt', v2.0.0,
19487                     Property => 'Jamo_Short_Name',
19488                     Each_Line_Handler => \&filter_jamo_line,
19489                    ),
19490     Input_file->new('UnicodeData.txt', v1.1.5,
19491                     Pre_Handler => \&setup_UnicodeData,
19492
19493                     # We clean up this file for some early versions.
19494                     Each_Line_Handler => [ (($v_version lt v2.0.0 )
19495                                             ? \&filter_v1_ucd
19496                                             : ($v_version eq v2.1.5)
19497                                                 ? \&filter_v2_1_5_ucd
19498
19499                                                 # And for 5.14 Perls with 6.0,
19500                                                 # have to also make changes
19501                                                 : ($v_version ge v6.0.0
19502                                                    && $^V lt v5.17.0)
19503                                                     ? \&filter_v6_ucd
19504                                                     : undef),
19505
19506                                             # Early versions did not have the
19507                                             # proper Unicode_1 names for the
19508                                             # controls
19509                                             (($v_version lt v3.0.0)
19510                                             ? \&filter_early_U1_names
19511                                             : undef),
19512
19513                                             # Early versions did not correctly
19514                                             # use the later method for giving
19515                                             # decimal digit values
19516                                             (($v_version le v3.2.0)
19517                                             ? \&filter_bad_Nd_ucd
19518                                             : undef),
19519
19520                                             # And the main filter
19521                                             \&filter_UnicodeData_line,
19522                                          ],
19523                     EOF_Handler => \&EOF_UnicodeData,
19524                    ),
19525     Input_file->new('CJKXREF.TXT', v1.1.5,
19526                     Withdrawn => v2.0.0,
19527                     Skip => 'Gives the mapping of CJK code points '
19528                           . 'between Unicode and various other standards',
19529                    ),
19530     Input_file->new('ArabicShaping.txt', v2.0.0,
19531                     Each_Line_Handler =>
19532                         ($v_version lt 4.1.0)
19533                                     ? \&filter_old_style_arabic_shaping
19534                                     : undef,
19535                     # The first field after the range is a "schematic name"
19536                     # not used by Perl
19537                     Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19538                     Has_Missings_Defaults => $NOT_IGNORED,
19539                    ),
19540     Input_file->new('Blocks.txt', v2.0.0,
19541                     Property => 'Block',
19542                     Has_Missings_Defaults => $NOT_IGNORED,
19543                     Each_Line_Handler => \&filter_blocks_lines
19544                    ),
19545     Input_file->new('Index.txt', v2.0.0,
19546                     Skip => 'Alphabetical index of Unicode characters',
19547                    ),
19548     Input_file->new('NamesList.txt', v2.0.0,
19549                     Skip => 'Annotated list of characters',
19550                    ),
19551     Input_file->new('PropList.txt', v2.0.0,
19552                     Each_Line_Handler => (($v_version lt v3.1.0)
19553                                             ? \&filter_old_style_proplist
19554                                             : undef),
19555                    ),
19556     Input_file->new('Props.txt', v2.0.0,
19557                     Withdrawn => v3.0.0,
19558                     Skip => 'A subset of F<PropList.txt> (which is used instead)',
19559                    ),
19560     Input_file->new('ReadMe.txt', v2.0.0,
19561                     Skip => $Documentation,
19562                    ),
19563     Input_file->new('Unihan.txt', v2.0.0,
19564                     Withdrawn => v5.2.0,
19565                     Construction_Time_Handler => \&construct_unihan,
19566                     Pre_Handler => \&setup_unihan,
19567                     Optional => [ "",
19568                                   'Unicode_Radical_Stroke'
19569                                 ],
19570                     Each_Line_Handler => \&filter_unihan_line,
19571                    ),
19572     Input_file->new('SpecialCasing.txt', v2.1.8,
19573                     Each_Line_Handler => ($v_version eq 2.1.8)
19574                                          ? \&filter_2_1_8_special_casing_line
19575                                          : \&filter_special_casing_line,
19576                     Pre_Handler => \&setup_special_casing,
19577                     Has_Missings_Defaults => $IGNORED,
19578                    ),
19579     Input_file->new(
19580                     'LineBreak.txt', v3.0.0,
19581                     Has_Missings_Defaults => $NOT_IGNORED,
19582                     Property => 'Line_Break',
19583                     # Early versions had problematic syntax
19584                     Each_Line_Handler => ($v_version ge v3.1.0)
19585                                           ? undef
19586                                           : ($v_version lt v3.0.0)
19587                                             ? \&filter_substitute_lb
19588                                             : \&filter_early_ea_lb,
19589                     # Must use long names for property values see comments at
19590                     # sub filter_substitute_lb
19591                     Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19592                                'Alphabetic', # default to this because XX ->
19593                                              # AL
19594
19595                                # Don't use _Perl_LB as a synonym for
19596                                # Line_Break in later perls, as it is tailored
19597                                # and isn't the same as Line_Break
19598                                'ONLY_EARLY' ],
19599                    ),
19600     Input_file->new('EastAsianWidth.txt', v3.0.0,
19601                     Property => 'East_Asian_Width',
19602                     Has_Missings_Defaults => $NOT_IGNORED,
19603                     # Early versions had problematic syntax
19604                     Each_Line_Handler => (($v_version lt v3.1.0)
19605                                         ? \&filter_early_ea_lb
19606                                         : undef),
19607                    ),
19608     Input_file->new('CompositionExclusions.txt', v3.0.0,
19609                     Property => 'Composition_Exclusion',
19610                    ),
19611     Input_file->new('UnicodeData.html', v3.0.0,
19612                     Withdrawn => v4.0.1,
19613                     Skip => $Documentation,
19614                    ),
19615     Input_file->new('BidiMirroring.txt', v3.0.1,
19616                     Property => 'Bidi_Mirroring_Glyph',
19617                     Has_Missings_Defaults => ($v_version lt v6.2.0)
19618                                               ? $NO_DEFAULTS
19619                                               # Is <none> which doesn't mean
19620                                               # anything to us, we will use the
19621                                               # null string
19622                                               : $IGNORED,
19623                    ),
19624     Input_file->new('NamesList.html', v3.0.0,
19625                     Skip => 'Describes the format and contents of '
19626                           . 'F<NamesList.txt>',
19627                    ),
19628     Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19629                     Withdrawn => v5.1,
19630                     Skip => $Documentation,
19631                    ),
19632     Input_file->new('CaseFolding.txt', v3.0.1,
19633                     Pre_Handler => \&setup_case_folding,
19634                     Each_Line_Handler =>
19635                         [ ($v_version lt v3.1.0)
19636                                  ? \&filter_old_style_case_folding
19637                                  : undef,
19638                            \&filter_case_folding_line
19639                         ],
19640                     Has_Missings_Defaults => $IGNORED,
19641                    ),
19642     Input_file->new("NormTest.txt", v3.0.1,
19643                      Handler => \&process_NormalizationsTest,
19644                      Skip => ($make_norm_test_script) ? 0 : $Validation,
19645                    ),
19646     Input_file->new('DCoreProperties.txt', v3.1.0,
19647                     # 5.2 changed this file
19648                     Has_Missings_Defaults => (($v_version ge v5.2.0)
19649                                             ? $NOT_IGNORED
19650                                             : $NO_DEFAULTS),
19651                    ),
19652     Input_file->new('DProperties.html', v3.1.0,
19653                     Withdrawn => v3.2.0,
19654                     Skip => $Documentation,
19655                    ),
19656     Input_file->new('PropList.html', v3.1.0,
19657                     Withdrawn => v5.1,
19658                     Skip => $Documentation,
19659                    ),
19660     Input_file->new('Scripts.txt', v3.1.0,
19661                     Property => 'Script',
19662                     Each_Line_Handler => (($v_version le v4.0.0)
19663                                           ? \&filter_all_caps_script_names
19664                                           : undef),
19665                     Has_Missings_Defaults => $NOT_IGNORED,
19666                    ),
19667     Input_file->new('DNormalizationProps.txt', v3.1.0,
19668                     Has_Missings_Defaults => $NOT_IGNORED,
19669                     Each_Line_Handler => (($v_version lt v4.0.1)
19670                                       ? \&filter_old_style_normalization_lines
19671                                       : undef),
19672                    ),
19673     Input_file->new('DerivedProperties.html', v3.1.1,
19674                     Withdrawn => v5.1,
19675                     Skip => $Documentation,
19676                    ),
19677     Input_file->new('DAge.txt', v3.2.0,
19678                     Has_Missings_Defaults => $NOT_IGNORED,
19679                     Property => 'Age'
19680                    ),
19681     Input_file->new('HangulSyllableType.txt', v4.0,
19682                     Has_Missings_Defaults => $NOT_IGNORED,
19683                     Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19684                     Property => 'Hangul_Syllable_Type'
19685                    ),
19686     Input_file->new('NormalizationCorrections.txt', v3.2.0,
19687                      # This documents the cumulative fixes to erroneous
19688                      # normalizations in earlier Unicode versions.  Its main
19689                      # purpose is so that someone running on an earlier
19690                      # version can use this file to override what got
19691                      # published in that earlier release.  It would be easy
19692                      # for mktables to handle this file.  But all the
19693                      # corrections in it should already be in the other files
19694                      # for the release it is.  To get it to actually mean
19695                      # something useful, someone would have to be using an
19696                      # earlier Unicode release, and copy it into the directory
19697                      # for that release and recompile.  So far there has been
19698                      # no demand to do that, so this hasn't been implemented.
19699                     Skip => 'Documentation of corrections already '
19700                           . 'incorporated into the Unicode data base',
19701                    ),
19702     Input_file->new('StandardizedVariants.html', v3.2.0,
19703                     Skip => 'Obsoleted as of Unicode 9.0, but previously '
19704                           . 'provided a visual display of the standard '
19705                           . 'variant sequences derived from '
19706                           . 'F<StandardizedVariants.txt>.',
19707                         # I don't know why the html came earlier than the
19708                         # .txt, but both are skipped anyway, so it doesn't
19709                         # matter.
19710                    ),
19711     Input_file->new('StandardizedVariants.txt', v4.0.0,
19712                     Skip => 'Certain glyph variations for character display '
19713                           . 'are standardized.  This lists the non-Unihan '
19714                           . 'ones; the Unihan ones are also not used by '
19715                           . 'Perl, and are in a separate Unicode data base '
19716                           . 'L<http://www.unicode.org/ivd>',
19717                    ),
19718     Input_file->new('UCD.html', v4.0.0,
19719                     Withdrawn => v5.2,
19720                     Skip => $Documentation,
19721                    ),
19722     Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19723                     Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
19724
19725                                # Don't use _Perl_WB as a synonym for
19726                                # Word_Break in later perls, as it is tailored
19727                                # and isn't the same as Word_Break
19728                                'ONLY_EARLY' ],
19729                     Property => 'Word_Break',
19730                     Has_Missings_Defaults => $NOT_IGNORED,
19731                    ),
19732     Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19733                     Early => [ \&generate_GCB, '_Perl_GCB' ],
19734                     Property => 'Grapheme_Cluster_Break',
19735                     Has_Missings_Defaults => $NOT_IGNORED,
19736                    ),
19737     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19738                     Handler => \&process_GCB_test,
19739                     retain_trailing_comments => 1,
19740                    ),
19741     Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19742                     Skip => $Validation_Documentation,
19743                    ),
19744     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19745                     Handler => \&process_SB_test,
19746                     retain_trailing_comments => 1,
19747                    ),
19748     Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19749                     Skip => $Validation_Documentation,
19750                    ),
19751     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19752                     Handler => \&process_WB_test,
19753                     retain_trailing_comments => 1,
19754                    ),
19755     Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19756                     Skip => $Validation_Documentation,
19757                    ),
19758     Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19759                     Property => 'Sentence_Break',
19760                     Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19761                     Has_Missings_Defaults => $NOT_IGNORED,
19762                    ),
19763     Input_file->new('NamedSequences.txt', v4.1.0,
19764                     Handler => \&process_NamedSequences
19765                    ),
19766     Input_file->new('Unihan.html', v4.1.0,
19767                     Withdrawn => v5.2,
19768                     Skip => $Documentation,
19769                    ),
19770     Input_file->new('NameAliases.txt', v5.0,
19771                     Property => 'Name_Alias',
19772                     Each_Line_Handler => ($v_version le v6.0.0)
19773                                    ? \&filter_early_version_name_alias_line
19774                                    : \&filter_later_version_name_alias_line,
19775                    ),
19776         # NameAliases.txt came along in v5.0.  The above constructor handles
19777         # this.  But until 6.1, it was lacking some information needed by core
19778         # perl.  The constructor below handles that.  It is either a kludge or
19779         # clever, depending on your point of view.  The 'Withdrawn' parameter
19780         # indicates not to use it at all starting in 6.1 (so the above
19781         # constructor applies), and the 'v6.1' parameter indicates to use the
19782         # Early parameter before 6.1.  Therefore 'Early" is always used,
19783         # yielding the internal-only property '_Perl_Name_Alias', which it
19784         # gets from a NameAliases.txt from 6.1 or later stored in
19785         # N_Asubst.txt.  In combination with the above constructor,
19786         # 'Name_Alias' is publicly accessible starting with v5.0, and the
19787         # better 6.1 version is accessible to perl core in all releases.
19788     Input_file->new("NameAliases.txt", v6.1,
19789                     Withdrawn => v6.1,
19790                     Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19791                     Property => 'Name_Alias',
19792                     EOF_Handler => \&fixup_early_perl_name_alias,
19793                     Each_Line_Handler =>
19794                                        \&filter_later_version_name_alias_line,
19795                    ),
19796     Input_file->new('NamedSqProv.txt', v5.0.0,
19797                     Skip => 'Named sequences proposed for inclusion in a '
19798                           . 'later version of the Unicode Standard; if you '
19799                           . 'need them now, you can append this file to '
19800                           . 'F<NamedSequences.txt> and recompile perl',
19801                    ),
19802     Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19803                     Handler => \&process_LB_test,
19804                     retain_trailing_comments => 1,
19805                    ),
19806     Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19807                     Skip => $Validation_Documentation,
19808                    ),
19809     Input_file->new("BidiTest.txt", v5.2.0,
19810                     Skip => $Validation,
19811                    ),
19812     Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19813                     Optional => "",
19814                     Each_Line_Handler => \&filter_unihan_line,
19815                    ),
19816     Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19817                     Optional => "",
19818                     Each_Line_Handler => \&filter_unihan_line,
19819                    ),
19820     Input_file->new('UnihanIRGSources.txt', v5.2.0,
19821                     Optional => [ "",
19822                                   'kCompatibilityVariant',
19823                                   'kIICore',
19824                                   'kIRG_GSource',
19825                                   'kIRG_HSource',
19826                                   'kIRG_JSource',
19827                                   'kIRG_KPSource',
19828                                   'kIRG_MSource',
19829                                   'kIRG_KSource',
19830                                   'kIRG_TSource',
19831                                   'kIRG_USource',
19832                                   'kIRG_VSource',
19833                                ],
19834                     Pre_Handler => \&setup_unihan,
19835                     Each_Line_Handler => \&filter_unihan_line,
19836                    ),
19837     Input_file->new('UnihanNumericValues.txt', v5.2.0,
19838                     Optional => [ "",
19839                                   'kAccountingNumeric',
19840                                   'kOtherNumeric',
19841                                   'kPrimaryNumeric',
19842                                 ],
19843                     Each_Line_Handler => \&filter_unihan_line,
19844                    ),
19845     Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19846                     Optional => "",
19847                     Each_Line_Handler => \&filter_unihan_line,
19848                    ),
19849     Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19850                     Optional => [ "",
19851                                   'Unicode_Radical_Stroke'
19852                                 ],
19853                     Each_Line_Handler => \&filter_unihan_line,
19854                    ),
19855     Input_file->new('UnihanReadings.txt', v5.2.0,
19856                     Optional => "",
19857                     Each_Line_Handler => \&filter_unihan_line,
19858                    ),
19859     Input_file->new('UnihanVariants.txt', v5.2.0,
19860                     Optional => "",
19861                     Each_Line_Handler => \&filter_unihan_line,
19862                    ),
19863     Input_file->new('CJKRadicals.txt', v5.2.0,
19864                     Skip => 'Maps the kRSUnicode property values to '
19865                           . 'corresponding code points',
19866                    ),
19867     Input_file->new('EmojiSources.txt', v6.0.0,
19868                     Skip => 'Maps certain Unicode code points to their '
19869                           . 'legacy Japanese cell-phone values',
19870                    ),
19871     Input_file->new('ScriptExtensions.txt', v6.0.0,
19872                     Property => 'Script_Extensions',
19873                     Early => [ sub {} ], # Doesn't do anything but ensures
19874                                          # that this isn't skipped for early
19875                                          # versions
19876                     Pre_Handler => \&setup_script_extensions,
19877                     Each_Line_Handler => \&filter_script_extensions_line,
19878                     Has_Missings_Defaults => (($v_version le v6.0.0)
19879                                             ? $NO_DEFAULTS
19880                                             : $IGNORED),
19881                    ),
19882     # These two Indic files are actually not usable as-is until 6.1.0,
19883     # because their property values are missing from PropValueAliases.txt
19884     # until that release, so that further work would have to be done to get
19885     # them to work properly, which isn't worth it because of them being
19886     # provisional.
19887     Input_file->new('IndicMatraCategory.txt', v6.0.0,
19888                     Withdrawn => v8.0.0,
19889                     Property => 'Indic_Matra_Category',
19890                     Has_Missings_Defaults => $NOT_IGNORED,
19891                     Skip => $Indic_Skip,
19892                    ),
19893     Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19894                     Property => 'Indic_Syllabic_Category',
19895                     Has_Missings_Defaults => $NOT_IGNORED,
19896                     Skip => (($v_version lt v8.0.0)
19897                               ? $Indic_Skip
19898                               : 0),
19899                    ),
19900     Input_file->new('USourceData.txt', v6.2.0,
19901                     Skip => 'Documentation of status and cross reference of '
19902                           . 'proposals for encoding by Unicode of Unihan '
19903                           . 'characters',
19904                    ),
19905     Input_file->new('USourceGlyphs.pdf', v6.2.0,
19906                     Skip => 'Pictures of the characters in F<USourceData.txt>',
19907                    ),
19908     Input_file->new('BidiBrackets.txt', v6.3.0,
19909                     Properties => [ 'Bidi_Paired_Bracket',
19910                                     'Bidi_Paired_Bracket_Type'
19911                                   ],
19912                     Has_Missings_Defaults => $NO_DEFAULTS,
19913                    ),
19914     Input_file->new("BidiCharacterTest.txt", v6.3.0,
19915                     Skip => $Validation,
19916                    ),
19917     Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19918                     Property => 'Indic_Positional_Category',
19919                     Has_Missings_Defaults => $NOT_IGNORED,
19920                    ),
19921     Input_file->new('TangutSources.txt', v9.0.0,
19922                     Skip => 'Specifies source mappings for Tangut ideographs'
19923                           . ' and components. This data file also includes'
19924                           . ' informative radical-stroke values that are used'
19925                           . ' internally by Unicode',
19926                    ),
19927     Input_file->new('VerticalOrientation.txt', v10.0.0,
19928                     Property => 'Vertical_Orientation',
19929                     Has_Missings_Defaults => $NOT_IGNORED,
19930                    ),
19931     Input_file->new('NushuSources.txt', v10.0.0,
19932                     Skip => 'Specifies source material for Nushu characters',
19933                    ),
19934 );
19935
19936 # End of all the preliminaries.
19937 # Do it...
19938
19939 if (@missing_early_files) {
19940     print simple_fold(join_lines(<<END
19941
19942 The compilation cannot be completed because one or more required input files,
19943 listed below, are missing.  This is because you are compiling Unicode version
19944 $unicode_version, which predates the existence of these file(s).  To fully
19945 function, perl needs the data that these files would have contained if they
19946 had been in this release.  To work around this, create copies of later
19947 versions of the missing files in the directory containing '$0'.  (Perl will
19948 make the necessary adjustments to the data to compensate for it not being the
19949 same version as is being compiled.)  The files are available from unicode.org,
19950 via either ftp or http.  If using http, they will be under
19951 www.unicode.org/versions/.  Below are listed the source file name of each
19952 missing file, the Unicode version to copy it from, and the name to store it
19953 as.  (Note that the listed source file name may not be exactly the one that
19954 Unicode calls it.  If you don't find it, you can look it up in 'README.perl'
19955 to get the correct name.)
19956 END
19957     ));
19958     print simple_fold(join_lines("\n$_")) for @missing_early_files;
19959     exit 2;
19960 }
19961
19962 if ($compare_versions) {
19963     Carp::my_carp(<<END
19964 Warning.  \$compare_versions is set.  Output is not suitable for production
19965 END
19966     );
19967 }
19968
19969 # Put into %potential_files a list of all the files in the directory structure
19970 # that could be inputs to this program
19971 File::Find::find({
19972     wanted=>sub {
19973         return unless / \. ( txt | htm l? ) $ /xi;  # Some platforms change the
19974                                                     # name's case
19975         my $full = lc(File::Spec->rel2abs($_));
19976         $potential_files{$full} = 1;
19977         return;
19978     }
19979 }, File::Spec->curdir());
19980
19981 my @mktables_list_output_files;
19982 my $old_start_time = 0;
19983 my $old_options = "";
19984
19985 if (! -e $file_list) {
19986     print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19987     $write_unchanged_files = 1;
19988 } elsif ($write_unchanged_files) {
19989     print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19990 }
19991 else {
19992     print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19993     my $file_handle;
19994     if (! open $file_handle, "<", $file_list) {
19995         Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19996         $glob_list = 1;
19997     }
19998     else {
19999         my @input;
20000
20001         # Read and parse mktables.lst, placing the results from the first part
20002         # into @input, and the second part into @mktables_list_output_files
20003         for my $list ( \@input, \@mktables_list_output_files ) {
20004             while (<$file_handle>) {
20005                 s/^ \s+ | \s+ $//xg;
20006                 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
20007                     $old_start_time = $1;
20008                     next;
20009                 }
20010                 if (/^ \s* \# \s* From\ options\ (.+) /x) {
20011                     $old_options = $1;
20012                     next;
20013                 }
20014                 next if /^ \s* (?: \# .* )? $/x;
20015                 last if /^ =+ $/x;
20016                 my ( $file ) = split /\t/;
20017                 push @$list, $file;
20018             }
20019             @$list = uniques(@$list);
20020             next;
20021         }
20022
20023         # Look through all the input files
20024         foreach my $input (@input) {
20025             next if $input eq 'version'; # Already have checked this.
20026
20027             # Ignore if doesn't exist.  The checking about whether we care or
20028             # not is done via the Input_file object.
20029             next if ! file_exists($input);
20030
20031             # The paths are stored with relative names, and with '/' as the
20032             # delimiter; convert to absolute on this machine
20033             my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
20034             $potential_files{lc $full} = 1;
20035         }
20036     }
20037
20038     close $file_handle;
20039 }
20040
20041 if ($glob_list) {
20042
20043     # Here wants to process all .txt files in the directory structure.
20044     # Convert them to full path names.  They are stored in the platform's
20045     # relative style
20046     my @known_files;
20047     foreach my $object (@input_file_objects) {
20048         my $file = $object->file;
20049         next unless defined $file;
20050         push @known_files, File::Spec->rel2abs($file);
20051     }
20052
20053     my @unknown_input_files;
20054     foreach my $file (keys %potential_files) {  # The keys are stored in lc
20055         next if grep { $file eq lc($_) } @known_files;
20056
20057         # Here, the file is unknown to us.  Get relative path name
20058         $file = File::Spec->abs2rel($file);
20059         push @unknown_input_files, $file;
20060
20061         # What will happen is we create a data structure for it, and add it to
20062         # the list of input files to process.  First get the subdirectories
20063         # into an array
20064         my (undef, $directories, undef) = File::Spec->splitpath($file);
20065         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20066         my @directories = File::Spec->splitdir($directories);
20067
20068         # If the file isn't extracted (meaning none of the directories is the
20069         # extracted one), just add it to the end of the list of inputs.
20070         if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
20071             push @input_file_objects, Input_file->new($file, v0);
20072         }
20073         else {
20074
20075             # Here, the file is extracted.  It needs to go ahead of most other
20076             # processing.  Search for the first input file that isn't a
20077             # special required property (that is, find one whose first_release
20078             # is non-0), and isn't extracted.  Also, the Age property file is
20079             # processed before the extracted ones, just in case
20080             # $compare_versions is set.
20081             for (my $i = 0; $i < @input_file_objects; $i++) {
20082                 if ($input_file_objects[$i]->first_released ne v0
20083                     && lc($input_file_objects[$i]->file) ne 'dage.txt'
20084                     && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
20085                 {
20086                     splice @input_file_objects, $i, 0,
20087                                                 Input_file->new($file, v0);
20088                     last;
20089                 }
20090             }
20091
20092         }
20093     }
20094     if (@unknown_input_files) {
20095         print STDERR simple_fold(join_lines(<<END
20096
20097 The following files are unknown as to how to handle.  Assuming they are
20098 typical property files.  You'll know by later error messages if it worked or
20099 not:
20100 END
20101         ) . " " . join(", ", @unknown_input_files) . "\n\n");
20102     }
20103 } # End of looking through directory structure for more .txt files.
20104
20105 # Create the list of input files from the objects we have defined, plus
20106 # version
20107 my @input_files = qw(version Makefile);
20108 foreach my $object (@input_file_objects) {
20109     my $file = $object->file;
20110     next if ! defined $file;    # Not all objects have files
20111     next if defined $object->skip;;
20112     push @input_files,  $file;
20113 }
20114
20115 if ( $verbosity >= $VERBOSE ) {
20116     print "Expecting ".scalar( @input_files )." input files. ",
20117          "Checking ".scalar( @mktables_list_output_files )." output files.\n";
20118 }
20119
20120 # We set $most_recent to be the most recently changed input file, including
20121 # this program itself (done much earlier in this file)
20122 foreach my $in (@input_files) {
20123     next unless -e $in;        # Keep going even if missing a file
20124     my $mod_time = (stat $in)[9];
20125     $most_recent = $mod_time if $mod_time > $most_recent;
20126
20127     # See that the input files have distinct names, to warn someone if they
20128     # are adding a new one
20129     if ($make_list) {
20130         my ($volume, $directories, $file ) = File::Spec->splitpath($in);
20131         $directories =~ s;/$;;;     # Can have extraneous trailing '/'
20132         my @directories = File::Spec->splitdir($directories);
20133         construct_filename($file, 'mutable', \@directories);
20134     }
20135 }
20136
20137 # We use 'Makefile' just to see if it has changed since the last time we
20138 # rebuilt.  Now discard it.
20139 @input_files = grep { $_ ne 'Makefile' } @input_files;
20140
20141 my $rebuild = $write_unchanged_files    # Rebuild: if unconditional rebuild
20142               || ! scalar @mktables_list_output_files  # or if no outputs known
20143               || $old_start_time < $most_recent        # or out-of-date
20144               || $old_options ne $command_line_arguments; # or with different
20145                                                           # options
20146
20147 # Now we check to see if any output files are older than youngest, if
20148 # they are, we need to continue on, otherwise we can presumably bail.
20149 if (! $rebuild) {
20150     foreach my $out (@mktables_list_output_files) {
20151         if ( ! file_exists($out)) {
20152             print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20153             $rebuild = 1;
20154             last;
20155          }
20156         #local $to_trace = 1 if main::DEBUG;
20157         trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20158         if ( (stat $out)[9] <= $most_recent ) {
20159             #trace "$out:  most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20160             print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20161             $rebuild = 1;
20162             last;
20163         }
20164     }
20165 }
20166 if (! $rebuild) {
20167     print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
20168     exit(0);
20169 }
20170 print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20171
20172 # Ready to do the major processing.  First create the perl pseudo-property.
20173 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20174
20175 # Process each input file
20176 foreach my $file (@input_file_objects) {
20177     $file->run;
20178 }
20179
20180 # Finish the table generation.
20181
20182 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20183 finish_Unicode();
20184
20185 # For the very specialized case of comparing two Unicode versions...
20186 if (DEBUG && $compare_versions) {
20187     handle_compare_versions();
20188 }
20189
20190 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20191 compile_perl();
20192
20193 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20194 add_perl_synonyms();
20195
20196 print "Writing tables\n" if $verbosity >= $PROGRESS;
20197 write_all_tables();
20198
20199 # Write mktables.lst
20200 if ( $file_list and $make_list ) {
20201
20202     print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20203     foreach my $file (@input_files, @files_actually_output) {
20204         my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20205         my @directories = grep length, File::Spec->splitdir($directories);
20206         $file = join '/', @directories, $basefile;
20207     }
20208
20209     my $ofh;
20210     if (! open $ofh,">",$file_list) {
20211         Carp::my_carp("Can't write to '$file_list'.  Skipping: $!");
20212         return
20213     }
20214     else {
20215         my $localtime = localtime $start_time;
20216         print $ofh <<"END";
20217 #
20218 # $file_list -- File list for $0.
20219 #
20220 #   Autogenerated starting on $start_time ($localtime)
20221 #   From options $command_line_arguments
20222 #
20223 # - First section is input files
20224 #   ($0 itself is not listed but is automatically considered an input)
20225 # - Section separator is /^=+\$/
20226 # - Second section is a list of output files.
20227 # - Lines matching /^\\s*#/ are treated as comments
20228 #   which along with blank lines are ignored.
20229 #
20230
20231 # Input files:
20232
20233 END
20234         print $ofh "$_\n" for sort(@input_files);
20235         print $ofh "\n=================================\n# Output files:\n\n";
20236         print $ofh "$_\n" for sort @files_actually_output;
20237         print $ofh "\n# ",scalar(@input_files)," input files\n",
20238                 "# ",scalar(@files_actually_output)+1," output files\n\n",
20239                 "# End list\n";
20240         close $ofh
20241             or Carp::my_carp("Failed to close $ofh: $!");
20242
20243         print "Filelist has ",scalar(@input_files)," input files and ",
20244             scalar(@files_actually_output)+1," output files\n"
20245             if $verbosity >= $VERBOSE;
20246     }
20247 }
20248
20249 # Output these warnings unless -q explicitly specified.
20250 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20251     if (@unhandled_properties) {
20252         print "\nProperties and tables that unexpectedly have no code points\n";
20253         foreach my $property (sort @unhandled_properties) {
20254             print $property, "\n";
20255         }
20256     }
20257
20258     if (%potential_files) {
20259         print "\nInput files that are not considered:\n";
20260         foreach my $file (sort keys %potential_files) {
20261             print File::Spec->abs2rel($file), "\n";
20262         }
20263     }
20264     print "\nAll done\n" if $verbosity >= $VERBOSE;
20265 }
20266
20267 if ($version_of_mk_invlist_bounds lt $v_version) {
20268     Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20269                 . " to be checked and possibly updated to Unicode"
20270                 . " $string_version");
20271 }
20272
20273 exit(0);
20274
20275 # TRAILING CODE IS USED BY make_property_test_script()
20276 __DATA__
20277
20278 use strict;
20279 use warnings;
20280
20281 # Test qr/\X/ and the \p{} regular expression constructs.  This file is
20282 # constructed by mktables from the tables it generates, so if mktables is
20283 # buggy, this won't necessarily catch those bugs.  Tests are generated for all
20284 # feasible properties; a few aren't currently feasible; see
20285 # is_code_point_usable() in mktables for details.
20286
20287 # Standard test packages are not used because this manipulates SIG_WARN.  It
20288 # exits 0 if every non-skipped test succeeded; -1 if any failed.
20289
20290 my $Tests = 0;
20291 my $Fails = 0;
20292
20293 # loc_tools.pl requires this function to be defined
20294 sub ok($@) {
20295     my ($pass, @msg) = @_;
20296     print "not " unless $pass;
20297     print "ok ";
20298     print ++$Tests;
20299     print " - ", join "", @msg if @msg;
20300     print "\n";
20301 }
20302
20303 sub Expect($$$$) {
20304     my $expected = shift;
20305     my $ord = shift;
20306     my $regex  = shift;
20307     my $warning_type = shift;   # Type of warning message, like 'deprecated'
20308                                 # or empty if none
20309     my $line   = (caller)[2];
20310
20311     # Convert the code point to hex form
20312     my $string = sprintf "\"\\x{%04X}\"", $ord;
20313
20314     my @tests = "";
20315
20316     # The first time through, use all warnings.  If the input should generate
20317     # a warning, add another time through with them turned off
20318     push @tests, "no warnings '$warning_type';" if $warning_type;
20319
20320     foreach my $no_warnings (@tests) {
20321
20322         # Store any warning messages instead of outputting them
20323         local $SIG{__WARN__} = $SIG{__WARN__};
20324         my $warning_message;
20325         $SIG{__WARN__} = sub { $warning_message = $_[0] };
20326
20327         $Tests++;
20328
20329         # A string eval is needed because of the 'no warnings'.
20330         # Assumes no parentheses in the regular expression
20331         my $result = eval "$no_warnings
20332                             my \$RegObj = qr($regex);
20333                             $string =~ \$RegObj ? 1 : 0";
20334         if (not defined $result) {
20335             print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20336             $Fails++;
20337         }
20338         elsif ($result ^ $expected) {
20339             print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20340             $Fails++;
20341         }
20342         elsif ($warning_message) {
20343             if (! $warning_type || ($warning_type && $no_warnings)) {
20344                 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20345                 $Fails++;
20346             }
20347             else {
20348                 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20349             }
20350         }
20351         elsif ($warning_type && ! $no_warnings) {
20352             print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20353             $Fails++;
20354         }
20355         else {
20356             print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20357         }
20358     }
20359     return;
20360 }
20361
20362 sub Error($) {
20363     my $regex  = shift;
20364     $Tests++;
20365     if (eval { 'x' =~ qr/$regex/; 1 }) {
20366         $Fails++;
20367         my $line = (caller)[2];
20368         print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20369     }
20370     else {
20371         my $line = (caller)[2];
20372         print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20373     }
20374     return;
20375 }
20376
20377 # Break test files (e.g. GCBTest.txt) character that break allowed here
20378 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20379 utf8::upgrade($breakable_utf8);
20380
20381 # Break test files (e.g. GCBTest.txt) character that indicates can't break
20382 # here
20383 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20384 utf8::upgrade($nobreak_utf8);
20385
20386 my $are_ctype_locales_available;
20387 my $utf8_locale;
20388 chdir 't' if -d 't';
20389 eval { require "./loc_tools.pl" };
20390 if (defined &locales_enabled) {
20391     $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20392     if ($are_ctype_locales_available) {
20393         $utf8_locale = &find_utf8_ctype_locale;
20394     }
20395 }
20396
20397 # Eval'd so can run on versions earlier than the property is available in
20398 my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20399 if (! defined $WB_Extend_or_Format_re) {
20400     $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20401 }
20402
20403 sub _test_break($$) {
20404     # Test various break property matches.  The 2nd parameter gives the
20405     # property name.  The input is a line from auxiliary/*Test.txt for the
20406     # given property.  Each such line is a sequence of Unicode (not native)
20407     # code points given by their hex numbers, separated by the two characters
20408     # defined just before this subroutine that indicate that either there can
20409     # or cannot be a break between the adjacent code points.  All these are
20410     # tested.
20411     #
20412     # For the gcb property extra tests are made.  if there isn't a break, that
20413     # means the sequence forms an extended grapheme cluster, which means that
20414     # \X should match the whole thing.  If there is a break, \X should stop
20415     # there.  This is all converted by this routine into a match: $string =~
20416     # /(\X)/, Each \X should match the next cluster; and that is what is
20417     # checked.
20418
20419     my $template = shift;
20420     my $break_type = shift;
20421
20422     my $line   = (caller 1)[2];   # Line number
20423     my $comment = "";
20424
20425     if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20426         $template = $1;
20427         $comment = $2;
20428
20429         # Replace leading spaces with a single one.
20430         $comment =~ s/ ^ \s* / # /x;
20431     }
20432
20433     # The line contains characters above the ASCII range, but in Latin1.  It
20434     # may or may not be in utf8, and if it is, it may or may not know it.  So,
20435     # convert these characters to 8 bits.  If knows is in utf8, simply
20436     # downgrade.
20437     if (utf8::is_utf8($template)) {
20438         utf8::downgrade($template);
20439     } else {
20440
20441         # Otherwise, if it is in utf8, but doesn't know it, the next lines
20442         # convert the two problematic characters to their 8-bit equivalents.
20443         # If it isn't in utf8, they don't harm anything.
20444         use bytes;
20445         $template =~ s/$nobreak_utf8/$nobreak/g;
20446         $template =~ s/$breakable_utf8/$breakable/g;
20447     }
20448
20449     # Perl customizes wb.  So change the official tests accordingly
20450     if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20451
20452         # Split into elements that alternate between code point and
20453         # break/no-break
20454         my @line = split / +/, $template;
20455
20456         # Look at each code point and its following one
20457         for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
20458
20459             # The customization only involves changing some breaks to
20460             # non-breaks.
20461             next if $line[$i+1] =~ /$nobreak/;
20462
20463             my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20464             my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20465
20466             # And it only affects adjacent space characters.
20467             next if $lhs !~ /\s/u;
20468
20469             # But, we want to make sure to test spaces followed by a Extend
20470             # or Format.
20471             next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20472
20473             # To test the customization, add some white-space before this to
20474             # create a span.  The $lhs white space may or may not be bound to
20475             # that span, and also with the $rhs.  If the $rhs is a binding
20476             # character, the $lhs is bound to it and not to the span, unless
20477             # $lhs is vertical space.  In all other cases, the $lhs is bound
20478             # to the span.  If the $rhs is white space, it is bound to the
20479             # $lhs
20480             my $bound;
20481             my $span;
20482             if ($rhs =~ /$WB_Extend_or_Format_re/) {
20483                 if ($lhs =~ /\v/) {
20484                     $bound = $breakable;
20485                     $span = $nobreak;
20486                 }
20487                 else {
20488                     $bound = $nobreak;
20489                     $span = $breakable;
20490                 }
20491             }
20492             else {
20493                 $span = $nobreak;
20494                 $bound = $nobreak;
20495             }
20496
20497             splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20498             $i += 4;
20499             $line[$i+1] = $bound;
20500         }
20501         $template = join " ", @line;
20502     }
20503
20504     # The input is just the break/no-break symbols and sequences of Unicode
20505     # code points as hex digits separated by spaces for legibility. e.g.:
20506     # ÷ 0020 × 0308 ÷ 0020 ÷
20507     # Convert to native \x format
20508     $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20509     $template =~ s/ \s* //gx;   # Probably the line above removed all spaces;
20510                                 # but be sure
20511
20512     # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20513     # appropriate
20514     my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20515     $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20516
20517     my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20518     my $string = eval "\"$display_string\"";
20519
20520     # The remaining massaging of the input is for the \X tests.  Get rid of
20521     # the leading and trailing breakables
20522     $template =~ s/^ \s* $breakable \s* //x;
20523     $template =~ s/ \s* $breakable \s* $ //x;
20524
20525     # Delete no-breaks
20526     $template =~ s/ \s* $nobreak \s* //xg;
20527
20528     # Split the input into segments that are breakable between them.
20529     my @should_display = split /\s*$breakable\s*/, $template;
20530     my @should_match = map { eval "\"$_\"" } @should_display;
20531
20532     # If a string can be represented in both non-ut8 and utf8, test both cases
20533     my $display_upgrade = "";
20534     UPGRADE:
20535     for my $to_upgrade (0 .. 1) {
20536
20537         if ($to_upgrade) {
20538
20539             # If already in utf8, would just be a repeat
20540             next UPGRADE if utf8::is_utf8($string);
20541
20542             utf8::upgrade($string);
20543             $display_upgrade = " (utf8-upgraded)";
20544         }
20545
20546         my @modifiers = qw(a aa d u i);
20547         if ($are_ctype_locales_available) {
20548             push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20549
20550             # The /l modifier has C after it to indicate the locale to try
20551             push @modifiers, "lC";
20552         }
20553
20554         # Test for each of the regex modifiers.
20555         for my $modifier (@modifiers) {
20556             my $display_locale = "";
20557
20558             # For /l, set the locale to what it says to.
20559             if ($modifier =~ / ^ l (.*) /x) {
20560                 my $locale = $1;
20561                 $display_locale = "(locale = $locale)";
20562                 POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
20563                 $modifier = 'l';
20564             }
20565
20566             no warnings qw(locale regexp surrogate);
20567             my $pattern = "(?$modifier:$break_pattern)";
20568
20569             # Actually do the test
20570             my $matched_text;
20571             my $matched = $string =~ qr/$pattern/;
20572             if ($matched) {
20573                 $matched_text = "matched";
20574             }
20575             else {
20576                 $matched_text = "failed to match";
20577                 print "not ";
20578
20579                 if (TODO_FAILING_BREAKS) {
20580                     $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20581                     $comment =~ s/#/# TODO/;
20582                 }
20583             }
20584             print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20585
20586             # Only print the comment on the first use of this line
20587             $comment = "";
20588
20589             # Repeat with the first \B{} in the pattern.  This makes sure the
20590             # code in regexec.c:find_byclass() for \B gets executed
20591             if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20592                 my $B_pattern = "$1$2";
20593                 $matched = $string =~ qr/$B_pattern/;
20594                 print "not " unless $matched;
20595                 $matched_text = ($matched) ? "matched" : "failed to match";
20596                 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20597                 print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20598                 print "\n";
20599             }
20600         }
20601
20602         next if $break_type ne 'gcb';
20603
20604         # Finally, do the \X match.
20605         my @matches = $string =~ /(\X)/g;
20606
20607         # Look through each matched cluster to verify that it matches what we
20608         # expect.
20609         my $min = (@matches < @should_match) ? @matches : @should_match;
20610         for my $i (0 .. $min - 1) {
20611             $Tests++;
20612             if ($matches[$i] eq $should_match[$i]) {
20613                 print "ok $Tests - ";
20614                 if ($i == 0) {
20615                     print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20616                 } else {
20617                     print "And \\X #", $i + 1,
20618                 }
20619                 print " correctly matched $should_display[$i]; line $line\n";
20620             } else {
20621                 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20622                                                     split "", $matches[$i]);
20623                 print "not ok $Tests -";
20624                 print " # TODO" if TODO_FAILING_BREAKS;
20625                 print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20626                     $i + 1,
20627                     " should have matched $should_display[$i]",
20628                     " but instead matched $matches[$i]",
20629                     ".  Abandoning rest of line $line\n";
20630                 next UPGRADE;
20631             }
20632         }
20633
20634         # And the number of matches should equal the number of expected matches.
20635         $Tests++;
20636         if (@matches == @should_match) {
20637             print "ok $Tests - Nothing was left over; line $line\n";
20638         } else {
20639             print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20640             print " # TODO" if TODO_FAILING_BREAKS;
20641             print "\n";
20642         }
20643     }
20644
20645     return;
20646 }
20647
20648 sub Test_GCB($) {
20649     _test_break(shift, 'gcb');
20650 }
20651
20652 sub Test_LB($) {
20653     _test_break(shift, 'lb');
20654 }
20655
20656 sub Test_SB($) {
20657     _test_break(shift, 'sb');
20658 }
20659
20660 sub Test_WB($) {
20661     _test_break(shift, 'wb');
20662 }
20663
20664 sub Finished() {
20665     print "1..$Tests\n";
20666     exit($Fails ? -1 : 0);
20667 }
20668